From e974ebe0235a7d19a250b70aaf84dcafa32d9671 Mon Sep 17 00:00:00 2001 From: la2738 Date: Sun, 8 Dec 2019 02:43:49 -0500 Subject: [PATCH] :) --- .gitignore | 4 +++ Assignment7.Rmd | 65 +++++++++++++++++++++++++++++++++++++--------- assignment7.Rproj | 13 ++++++++++ tree1.ps | Bin 0 -> 5833 bytes 4 files changed, 70 insertions(+), 12 deletions(-) create mode 100644 .gitignore create mode 100644 assignment7.Rproj create mode 100644 tree1.ps diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/Assignment7.Rmd b/Assignment7.Rmd index 105cbdf..3c68a55 100644 --- a/Assignment7.Rmd +++ b/Assignment7.Rmd @@ -1,7 +1,7 @@ --- title: "Assignment 7 - Answers" -author: "Charles Lang" -date: "11/30/2016" +author: "Ling Ai" +date: "12/2/2019" output: html_document --- @@ -11,60 +11,85 @@ In the following assignment you will be looking at data from an one level of an #Upload data ```{r} - +D1 <-read.csv("online.data.csv") ``` #Visualization ```{r} +library(ggplot2) +library(dplyr) +library(tidyr) #Start by creating histograms of the distributions for all variables (#HINT: look up "facet" in the ggplot documentation) +D1$level.up <- ifelse(D1$level.up == "yes",1,0) +D2 <- gather(D1, "variable", "score", 2:7) + +ggplot(D2, aes(score)) + facet_wrap(~variable, scales = "free") + geom_histogram() #Then visualize the relationships between variables +pairs(D1) #Try to capture an intution about the data and the relationships ``` #Classification tree ```{r} +library(rpart) #Create a classification tree that predicts whether a student "levels up" in the online course using three variables of your choice (As we did last time, set all controls to their minimums) +c.tree1 <- rpart(level.up ~ post.test.score + post.test.score + messages + forum.posts + av.assignment.score, method="class", data= D1) #Plot and generate a CP table for your tree +printcp(c.tree1) +post(c.tree1, file = "tree1.ps", title = "level up") #Generate a probability value that represents the probability that a student levels up based your classification tree -D1$pred <- predict(rp, type = "prob")[,2]#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classififcation is based on. +D1$pred <- predict(c.tree1, type = "prob")[,2]#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classififcation is based on. ``` ## Part II #Now you can generate the ROC curve for your model. You will need to install the package ROCR to do this. ```{r} +#install.packages("ROCR") library(ROCR) #Plot the curve pred.detail <- prediction(D1$pred, D1$level.up) -plot(performance(pred.detail, "tpr", "fpr")) +plot(performance(pred.detail, "tpr", "fpr")) #"tpr" true positive rate, "fpr" false positive rate abline(0, 1, lty = 2) #Calculate the Area Under the Curve -unlist(slot(performance(Pred2,"auc"), "y.values"))#Unlist liberates the AUC value from the "performance" object created by ROCR +unlist(slot(performance(pred.detail,"auc"), "y.values")) + +#Unlist liberates the AUC value from the "performance" object created by ROCR +#the area under the curve is 1 #Now repeat this process, but using the variables you did not use for the previous model and compare the plots & results of your two models. Which one do you think was the better model? Why? + +pred.detail1 <- prediction(D1$post.test.score, D1$level.up) +plot(performance(pred.detail1, "tpr", "fpr")) +abline(0, 1, lty = 2) +unlist(slot(performance(pred.detail1,"auc"), "y.values")) + +# The first model is better because it's AUC value is 1 and the second model has 0.919925. ``` ## Part III #Thresholds ```{r} #Look at the ROC plot for your first model. Based on this plot choose a probability threshold that balances capturing the most correct predictions against false positives. Then generate a new variable in your data set that classifies each student according to your chosen threshold. -threshold.pred1 <- +D1$threshold.pred1 <- ifelse(D1$pred >= 0.5, 1, 0) #Now generate three diagnostics: -D1$accuracy.model1 <- +D1$accuracy.model1 <- mean(ifelse(D1$level.up == D1$threshold.pred1, 1, 0)) +D1$accuracy.model1 <- as.integer(D1$accuracy.model1) +accuracy1 <- sum(D1$accuracy.model1) / length(D1$accuracy.model1) -D1$precision.model1 <- - -D1$recall.model1 <- +D1$precision.model1 <- ifelse(D1$level.up == 1 & D1$threshold.pred1 == 1, 1, 0) +precision1 <- sum(D1$precision.model1) / sum (D1$threshold.pred1) +D1$recall.model1 <- ifelse(D1$level.up == 1 & D1$threshold.pred1 == 1, 1, 0) +recall1 <- sum(D1$precision.model1) / sum(D1$level.up) #Finally, calculate Kappa for your model according to: - #First generate the table of comparisons table1 <- table(D1$level.up, D1$threshold.pred1) @@ -75,7 +100,23 @@ matrix1 <- as.matrix(table1) kappa(matrix1, exact = TRUE)/kappa(matrix1) #Now choose a different threshold value and repeat these diagnostics. What conclusions can you draw about your two thresholds? +D1$threshold.pred2 <- ifelse(D1$pred >= 0.9, 1, 0) + +D1$accuracy.model2 <- mean(ifelse(D1$level.up == D1$threshold.pred2, 1, 0)) +D1$accuracy.model2 <- as.integer(D1$accuracy.model2) +accuracy2 <- sum(D1$accuracy.model2) / length(D1$accuracy.model2) + +D1$precision.model2 <- ifelse(D1$level.up == 1 & D1$threshold.pred2 == 1, 1, 0) +precision2 <- sum(D1$precision.model2) / sum (D1$threshold.pred2) +D1$recall.model2 <- ifelse(D1$level.up == 1 & D1$threshold.pred2 == 1, 1, 0) +recall2 <- sum(D1$precision.model2) / sum(D1$level.up) + + +table2 <- table(D1$level.up, D1$threshold.pred2) +matrix2 <- as.matrix(table2) +kappa(matrix2, exact = TRUE)/kappa(matrix2) +#For two models the value of kappa are the same. ``` ### To Submit Your Assignment diff --git a/assignment7.Rproj b/assignment7.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/assignment7.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/tree1.ps b/tree1.ps new file mode 100644 index 0000000000000000000000000000000000000000..54ad3cf5ff581be7369b9d9018b195d09e2c565d GIT binary patch literal 5833 zcmd5=+iu)O5`E`a)IJE1v6vnU>^)WaLCq(yA;VKIkE$-{(Vn1 z*}RO#juT)3C$eUWRn=9eZe7&b-`;)L{FK)_y-~t-&dy%c+2K}K{jXZ*I)AUb`p{;& zyP)g3>gg9<-s!%`Qb_*&=+Wk~F7s2*uXbhepMyTdVwFei55M2;>T==BTb&o_1*JX( zNkFkL$))or_<_uCMPKR*dQU&MX>(I#9bFyz=Fo%nywz!6x6J%dU-zHWmVJ1a?itQ` z+UqLk6Rz3PncU#HF@|Ma@DFlN~JcByS}aey>>h^3~R+D6}ptR*-9C(;iFF; zqK9)w5R$UG`JEJPNXZ9Hb|JI>5*PX^}R z=EH(mhP_eWP&j2ET*>SDJ4{61R5@X!H}MQ;vkvG0Yba0$BK)5uRG%FFg_eUtj=6PSy?oQ#^Ziz_d6~p-Q!D!QL*k!G&^&%wA5zQ zLe6|Hu$>V`7Q~iA!pxX^`bgX7uU}rKoz8!{d~ThB?9->TeW9~D*Kgkbx_Ay2LF=CK zLqo1`RpP}|>SGBb#=q{RClXJ2lJes2;p<{JDhMdebCTv8q{$OLBG}MZ)QW3eb|)-b z;#aRpne~4%b1ct35h9FS35PkKoI_ zE&J`O*Dp}lbdUS$f$q`%JOUnQNZ$jtfi^8mXl!3rs5i_Bdi~+*4KCJ7PEYu*BgzhK zi<^_}fI?9_+gI>G4t)k0TtlY#dB*%9oMG{pF&X-Y0_OciV0JZHU>hI(Q1H}Wgty%O z77!PYe<+wd#IRa?XL*iW>~{G4e*pPP7X@E&6C3Nep$l3RFqB;>5^RW$^p%K`L0_>xo--`y51c`t zSv8C!T0W^L7Czog(kF~DbvC$un7$Kf*A@GUUlv4{VP}7ei^21Ap_n7R~fSFiy%bI7;l8j9z@AR>@va8-Ad?<$$i#@D@R-1nhLhlJOoWmNUwFt92i_1^KBSPw1D{VrWJfVX*cGAi0B>Y` z(GNO-kjSi%oXZUeN$}lLkOO58#}S~0GjMvulsz7(0n=KBPoM@pCEzZo!C!*}l!XG2 zpazr%KcX-M0HFcT#s(U^G>>630R=Tg)WjOp5NwkMrZ@@{!7-iW80+eY)#)q4lTSW$ zOirx36A}*a3Z%&i#7v4!b$~FTnWDyJ;d9kax#AmA6Wii_-=YZPs=Sbt-xKTV<%qRuGX)*yKW+Ad; ziX8fq2~&K_gAuUVBB%Xvm#~kgyTnvH;9>x-yJYA&laaYrhb}RFx(q3L0Ta;{zsU&t z!Wf;>Gg6SVnL|HfCfKBc8oOjX{#2I?POScHWz8;&*;Y9R)RsT@{M;onjPQC%%5^#H zw^$-J%7g?Z9wq{d1}c37PyU%?LL+?%O}9X?L2)=ec$6&iAri zb_~Mv&~>-IJG1SEf*lIIywI}@Y!3+-LF8>`t-y0#_T$Smv(GR z3u7Y#n+E>Cb`GuCROH4yhvcX;n;IFT&Jn6_?T%(W^rqF9(6H