diff --git a/8_PREDMACHLEARN/.directory b/8_PREDMACHLEARN/.directory new file mode 100644 index 0000000..f88ed35 --- /dev/null +++ b/8_PREDMACHLEARN/.directory @@ -0,0 +1,4 @@ +[Dolphin] +Timestamp=2017,2,16,14,12,56 +Version=3 +ViewMode=1 diff --git a/8_PREDMACHLEARN/Practical Machine Learning Course Notes.Rmd b/8_PREDMACHLEARN/Practical Machine Learning Course Notes.Rmd index 96fae3d..ace370a 100644 --- a/8_PREDMACHLEARN/Practical Machine Learning Course Notes.Rmd +++ b/8_PREDMACHLEARN/Practical Machine Learning Course Notes.Rmd @@ -13,6 +13,10 @@ header-includes: \usepackage{graphicx} \usepackage{mathtools} --- +```{r setup, include=FALSE, error=TRUE, message=FALSE} +knitr::opts_chunk$set(echo = TRUE, comment = NA, error = TRUE) +``` + $\pagebreak$ @@ -427,7 +431,8 @@ folds$test[[1]] ```{r} # returns the arguments of the default train function -args(train.default) +# args(train.default) <- this dosn't work anymore in caret +args(train) ``` * `train` function has a large set of parameters, below are the default options @@ -774,8 +779,11 @@ testing <- spam[-inTrain,] preProc <- preProcess(log10(training[,-58]+1),method="pca",pcaComp=2) # calculate PCs for training data trainPC <- predict(preProc,log10(training[,-58]+1)) +# add variable `type` to trainPC +type <- training$type +trainPC <- data.frame(trainPC, type) # join variable `type` # run model on outcome and principle components -modelFit <- train(training$type ~ .,method="glm",data=trainPC) +modelFit <- train(type ~ .,method="glm",data=trainPC) # calculate PCs for test data testPC <- predict(preProc,log10(testing[,-58]+1)) # compare results @@ -789,7 +797,7 @@ confusionMatrix(testing$type,predict(modelFit,testPC)) ```{r message = FALSE, warning = FALSE} # construct model -modelFit <- train(training$type ~ .,method="glm",preProcess="pca",data=training) +modelFit <- train(type ~ .,method="glm",preProcess="pca",data=training) # print results of model confusionMatrix(testing$type,predict(modelFit,testing)) ``` @@ 
-1321,6 +1329,7 @@ pred.lda - ***example: `caret` package*** ```{r message = F, warning = F} +# package needed: klaR # using the same data from iris, run naive Bayes on training data nb <- train(Species ~ ., data=training,method="nb") # predict test outcomes using naive Bayes model diff --git a/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html b/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html index 1c9b791..826225d 100644 --- a/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html +++ b/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html @@ -8,38 +8,58 @@ + Practical Machine Learning Course Notes - + - - - - + + + + + + + @@ -63,17 +110,43 @@ color: inherit; background-color: rgba(0, 0, 0, 0.04); } -img { - max-width:100%; - height: auto; +img { + max-width:100%; + height: auto; +} +.tabbed-pane { + padding-top: 12px; +} +button.code-folding-btn:focus { + outline: none; } + + +
+ + + + + + + -
@@ -329,7 +403,7 @@

Prediction Study Design

  • important to not tune model to quiz set specifically
  • -

    +

    Sample Division Guidelines for Prediction Study Design

    -

    \(\pagebreak\)

    +

    \(\pagebreak\)

    @@ -404,9 +478,9 @@

    Types of Errors

    Notable Measurements for Error – Binary Variables

    -

    +

    -

    +

    Notable Measurements for Error – Continuous Variables

    -
  • median absolute deviation = \[median(|Prediction_i - Truth_i|)\]
  • +
  • median absolute deviation = \[median(|Prediction_i - Truth_i|)\]
  • -

    \(\pagebreak\)

    +

    \(\pagebreak\)

    @@ -440,11 +514,11 @@

    Receiver Operating Characteristic Curves

  • are commonly used techniques to measure the quality of a prediction algorithm.
  • predictions for binary classification often are quantitative (i.e. probability, scale of 1 to 10)
  • -

    +

    -

    +

    -

    -

    \(\pagebreak\)

    +

    +

    \(\pagebreak\)

    Cross Validation

    @@ -504,7 +578,7 @@

    Cross Validation

    Random Subsampling

    -

    +

    K-Fold

    -

    +

    Leave One Out

    -

    +

    -

    \(\pagebreak\)

    +

    \(\pagebreak\)

    @@ -550,10 +624,10 @@

    caret Package (\(\rightarrow\) preProcess() -
  • cross validation/data splitting \(\rightarrow\) createDataPartition(), createResample(), createTimeSlices()
  • -
  • train algorithms on training data and apply to test sets \(\rightarrow\) train(), predict()
  • -
  • model comparison (evaluate the accuracy of model on new data) \(\rightarrow\) confusionMatrix()
  • +
  • preprocessing/cleaning data \(\rightarrow\) preProcess()
  • +
  • cross validation/data splitting \(\rightarrow\) createDataPartition(), createResample(), createTimeSlices()
  • +
  • train algorithms on training data and apply to test sets \(\rightarrow\) train(), predict()
  • +
  • model comparison (evaluate the accuracy of model on new data) \(\rightarrow\) confusionMatrix()
  • machine learning algorithms in caret package
  • +
    List of 10
    + $ Fold01: int [1:460] 15 16 18 40 45 62 68 81 82 102 ...
    + $ Fold02: int [1:459] 1 41 55 58 67 75 117 123 151 175 ...
    + $ Fold03: int [1:461] 3 14 66 69 70 80 90 112 115 135 ...
    + $ Fold04: int [1:460] 5 19 25 65 71 83 85 88 91 93 ...
    + $ Fold05: int [1:460] 6 10 17 21 26 56 57 104 107 116 ...
    + $ Fold06: int [1:459] 7 8 13 39 52 54 76 89 99 106 ...
    + $ Fold07: int [1:461] 4 23 27 29 32 33 34 38 49 51 ...
    + $ Fold08: int [1:460] 2 9 30 31 36 37 43 46 47 48 ...
    + $ Fold09: int [1:461] 12 20 24 44 53 59 60 64 84 98 ...
    + $ Fold10: int [1:460] 11 22 28 35 42 61 72 86 92 118 ...
    +
    # return first 10 elements of the first training set
    +folds[[1]][1:10]
    +
     [1]  1  2  3  4  6  7  8  9 10 12
    -
    # create 10 resamples
    +
    # create 10 resamples
     resamples <- createResample(y=spam$type,times=10,list=TRUE)
     # structure of the resamples (note some samples are repeated)
    -str(resamples)
    -
    ## List of 10
    -##  $ Resample01: int [1:4601] 1 4 4 4 7 8 12 13 13 14 ...
    -##  $ Resample02: int [1:4601] 3 3 5 7 10 12 12 13 13 14 ...
    -##  $ Resample03: int [1:4601] 1 2 2 3 4 5 8 10 11 12 ...
    -##  $ Resample04: int [1:4601] 1 3 3 4 7 8 8 9 10 14 ...
    -##  $ Resample05: int [1:4601] 2 4 5 6 7 7 8 8 9 12 ...
    -##  $ Resample06: int [1:4601] 3 6 6 7 8 9 12 13 13 14 ...
    -##  $ Resample07: int [1:4601] 1 2 2 5 5 6 7 8 9 10 ...
    -##  $ Resample08: int [1:4601] 2 2 3 4 4 7 7 8 8 9 ...
    -##  $ Resample09: int [1:4601] 1 4 7 8 8 9 12 13 15 15 ...
    -##  $ Resample10: int [1:4601] 1 3 4 4 7 7 9 9 10 11 ...
    +str(resamples)
    +
    List of 10
    + $ Resample01: int [1:4601] 1 4 4 4 7 8 12 13 13 14 ...
    + $ Resample02: int [1:4601] 3 3 5 7 10 12 12 13 13 14 ...
    + $ Resample03: int [1:4601] 1 2 2 3 4 5 8 10 11 12 ...
    + $ Resample04: int [1:4601] 1 3 3 4 7 8 8 9 10 14 ...
    + $ Resample05: int [1:4601] 2 4 5 6 7 7 8 8 9 12 ...
    + $ Resample06: int [1:4601] 3 6 6 7 8 9 12 13 13 14 ...
    + $ Resample07: int [1:4601] 1 2 2 5 5 6 7 8 9 10 ...
    + $ Resample08: int [1:4601] 2 2 3 4 4 7 7 8 8 9 ...
    + $ Resample09: int [1:4601] 1 4 7 8 8 9 12 13 15 15 ...
    + $ Resample10: int [1:4601] 1 3 4 4 7 7 9 9 10 11 ...
    -
    # create time series data
    +
    # create time series data
     tme <- 1:1000
     # create time slices
     folds <- createTimeSlices(y=tme,initialWindow=20,horizon=10)
     # name of lists
    -names(folds)
    -
    ## [1] "train" "test"
    -
    # first training set
    -folds$train[[1]]
    -
    ##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
    -
    # first test set
    -folds$test[[1]]
    -
    ##  [1] 21 22 23 24 25 26 27 28 29 30
    +names(folds)
    +
    [1] "train" "test" 
    +
    # first training set
    +folds$train[[1]]
    +
     [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
    +
    # first test set
    +folds$test[[1]]
    +
     [1] 21 22 23 24 25 26 27 28 29 30

    Training Options (tutorial)

    -
    # returns the arguments of the default train function
    -args(train.default)
    -
    ## function (x, y, method = "rf", preProcess = NULL, ..., weights = NULL, 
    -##     metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric == 
    -##         "RMSE", FALSE, TRUE), trControl = trainControl(), tuneGrid = NULL, 
    -##     tuneLength = 3) 
    -## NULL
    +
    # returns the arguments of the default train function
    +# args(train.default)   <- this doesn't work anymore in caret
    +args(train)
    +
    function (x, ...) 
    +NULL
    -
    # returns the default arguments for the trainControl object
    -args(trainControl)
    -
    ## function (method = "boot", number = ifelse(grepl("cv", method), 
    -##     10, 25), repeats = ifelse(grepl("cv", method), 1, number), 
    -##     p = 0.75, initialWindow = NULL, horizon = 1, fixedWindow = TRUE, 
    -##     verboseIter = FALSE, returnData = TRUE, returnResamp = "final", 
    -##     savePredictions = FALSE, classProbs = FALSE, summaryFunction = defaultSummary, 
    -##     selectionFunction = "best", preProcOptions = list(thresh = 0.95, 
    -##         ICAcomp = 3, k = 5), index = NULL, indexOut = NULL, timingSamps = 0, 
    -##     predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5, 
    -##         alpha = 0.05, method = "gls", complete = TRUE), trim = FALSE, 
    -##     allowParallel = TRUE) 
    -## NULL
    +
    # returns the default arguments for the trainControl object
    +args(trainControl)
    +
    function (method = "boot", number = ifelse(grepl("cv", method), 
    +    10, 25), repeats = ifelse(grepl("cv", method), 1, number), 
    +    p = 0.75, search = "grid", initialWindow = NULL, horizon = 1, 
    +    fixedWindow = TRUE, skip = 0, verboseIter = FALSE, returnData = TRUE, 
    +    returnResamp = "final", savePredictions = FALSE, classProbs = FALSE, 
    +    summaryFunction = defaultSummary, selectionFunction = "best", 
    +    preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5, 
    +        freqCut = 95/5, uniqueCut = 10, cutoff = 0.9), sampling = NULL, 
    +    index = NULL, indexOut = NULL, indexFinal = NULL, timingSamps = 0, 
    +    predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5, 
    +        alpha = 0.05, method = "gls", complete = TRUE), trim = FALSE, 
    +    allowParallel = TRUE) 
    +NULL

    Preprocessing (tutorial)

    -
    # load spam data
    +
    +
                  mean      std
    +train 6.097035e-18 1.000000
    +test  7.548133e-02 1.633866
    -
    # set up BoxCox transforms
    +
    # set up BoxCox transforms
     preObj <- preProcess(training[,-58],method=c("BoxCox"))
     # perform preprocessing on training data
     trainCapAveS <- predict(preObj,training[,-58])$capitalAve
     # plot histogram and QQ Plot
     # Note: the transformation definitely helped to
     # normalize the data but it does not produce perfect result
    -par(mfrow=c(1,2)); hist(trainCapAveS); qqnorm(trainCapAveS)
    -

    +par(mfrow=c(1,2)); hist(trainCapAveS); qqnorm(trainCapAveS)
    +

    -
    # Make some values NA
    +
    # Make some values NA
     training$capAve <- training$capitalAve
     selectNA <- rbinom(dim(training)[1],size=1,prob=0.05)==1
     training$capAve[selectNA] <- NA
    @@ -977,10 +1051,10 @@ 

    Preprocessing ( capAveTruth <- training$capitalAve capAveTruth <- (capAveTruth-mean(capAveTruth))/sd(capAveTruth) # compute differences between imputed values and true values -quantile(capAve - capAveTruth)

    -
    ##            0%           25%           50%           75%          100% 
    -## -1.656344e+00  2.377772e-05  1.286900e-03  1.881653e-03  3.174413e-01
    -

    \(\pagebreak\)

    +quantile(capAve - capAveTruth)
    +
               0%           25%           50%           75%          100% 
    +-1.656344e+00  2.377772e-05  1.286900e-03  1.880821e-03  3.174413e-01 
    +

    \(\pagebreak\)

    @@ -998,7 +1072,7 @@

    Covariate Creation/Feature Extraction

  • generally, the more knowledge and understanding you have of the system/data, the easier it will be to extract the summarizing features
      -
    • when in doubt, more features is always safer \(\rightarrow\) lose less information and the features can be filtered during model construction
    • +
    • when in doubt, more features is always safer \(\rightarrow\) lose less information and the features can be filtered during model construction
  • this process can be automated (e.g. PCA) but you generally have to be very careful, as one very useful feature in the training data set may not have as much effect on the test data set
  • Note: science is the key here, Google “feature extraction for [data type]” for more guidance @@ -1021,31 +1095,31 @@

    Covariate Creation/Feature Extraction

    Creating Dummy Variables

      -
    • convert factor variables to indicator/dummy variable \(\rightarrow\) qualitative become quantitative
    • +
    • convert factor variables to indicator/dummy variable \(\rightarrow\) qualitative become quantitative
    • dummyVars(outcome~var, data=training) = creates a dummy variable object that can be used through predict function to create dummy variables
      • predict(dummyObj, newdata=training) = creates appropriate columns to represent the factor variable with appropriate 0s and 1s
          -
        • 2 factor variable \(\rightarrow\) two columns which have 0 or 1 depending on the outcome
        • -
        • 3 factor variable \(\rightarrow\) three columns which have 0, 0, and 1 representing the outcome
        • +
        • 2 factor variable \(\rightarrow\) two columns which have 0 or 1 depending on the outcome
        • +
        • 3 factor variable \(\rightarrow\) three columns which have 0, 0, and 1 representing the outcome
        • Note: only one of the columns can have values of 1 for each observation
    -
    # setting up data
    +
    # setting up data
     inTrain <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
     training <- Wage[inTrain,]; testing <- Wage[-inTrain,]
     # create a dummy variable object
     dummies <- dummyVars(wage ~ jobclass,data=training)
     # create the dummy variable columns
    -head(predict(dummies,newdata=training))
    -
    ##        jobclass.1. Industrial jobclass.2. Information
    -## 231655                      1                       0
    -## 86582                       0                       1
    -## 161300                      1                       0
    -## 155159                      0                       1
    -## 11443                       0                       1
    -## 376662                      0                       1
    +head(predict(dummies,newdata=training))
    +
           jobclass.1. Industrial jobclass.2. Information
    +231655                      1                       0
    +86582                       0                       1
    +161300                      1                       0
    +155159                      0                       1
    +11443                       0                       1
    +376662                      0                       1
  • Removing Zero Covariates

    @@ -1061,21 +1135,21 @@

    Removing Zero Covariates

  • Note: when nzv = TRUE, those variables should be thrown out
  • -
    # print nearZeroVar table
    -nearZeroVar(training,saveMetrics=TRUE)
    -
    ##            freqRatio percentUnique zeroVar   nzv
    -## year        1.017647    0.33301618   FALSE FALSE
    -## age         1.231884    2.85442436   FALSE FALSE
    -## sex         0.000000    0.04757374    TRUE  TRUE
    -## maritl      3.329571    0.23786870   FALSE FALSE
    -## race        8.480583    0.19029496   FALSE FALSE
    -## education   1.393750    0.23786870   FALSE FALSE
    -## region      0.000000    0.04757374    TRUE  TRUE
    -## jobclass    1.070936    0.09514748   FALSE FALSE
    -## health      2.526846    0.09514748   FALSE FALSE
    -## health_ins  2.209160    0.09514748   FALSE FALSE
    -## logwage     1.011765   18.83920076   FALSE FALSE
    -## wage        1.011765   18.83920076   FALSE FALSE
    +
    # print nearZeroVar table
    +nearZeroVar(training,saveMetrics=TRUE)
    +
               freqRatio percentUnique zeroVar   nzv
    +year        1.017647    0.33301618   FALSE FALSE
    +age         1.231884    2.85442436   FALSE FALSE
    +sex         0.000000    0.04757374    TRUE  TRUE
    +maritl      3.329571    0.23786870   FALSE FALSE
    +race        8.480583    0.19029496   FALSE FALSE
    +education   1.393750    0.23786870   FALSE FALSE
    +region      0.000000    0.04757374    TRUE  TRUE
    +jobclass    1.070936    0.09514748   FALSE FALSE
    +health      2.526846    0.09514748   FALSE FALSE
    +health_ins  2.209160    0.09514748   FALSE FALSE
    +logwage     1.011765   18.83920076   FALSE FALSE
    +wage        1.011765   18.83920076   FALSE FALSE

    Creating Splines (Polynomial Functions)

    @@ -1086,7 +1160,7 @@

    Creating Splines (Polynomial Functions)

  • gam() function can also be used and it allows for smoothing of multiple variables with different values for each variable
  • Note: the same polynomial operations must be performed on the test sets using the predict function
  • -
    # load splines package
    +
    # load splines package
     library(splines)
     # create polynomial function
     bsBasis <- bs(training$age,df=3)
    @@ -1095,17 +1169,17 @@ 

    Creating Splines (Polynomial Functions)

    # plot all age vs wage data plot(training$age,training$wage,pch=19,cex=0.5) # plot the fitted polynomial function -points(training$age,predict(lm1,newdata=training),col="red",pch=19,cex=0.5)
    -

    -
    # predict on test values
    -head(predict(bsBasis,age=testing$age))
    -
    ##              1          2           3
    -## [1,] 0.0000000 0.00000000 0.000000000
    -## [2,] 0.2368501 0.02537679 0.000906314
    -## [3,] 0.4163380 0.32117502 0.082587862
    -## [4,] 0.4308138 0.29109043 0.065560908
    -## [5,] 0.3625256 0.38669397 0.137491189
    -## [6,] 0.3063341 0.42415495 0.195763821
    +points(training$age,predict(lm1,newdata=training),col="red",pch=19,cex=0.5)
    +

    +
    # predict on test values
    +head(predict(bsBasis,age=testing$age))
    +
                 1          2           3
    +[1,] 0.0000000 0.00000000 0.000000000
    +[2,] 0.2368501 0.02537679 0.000906314
    +[3,] 0.4163380 0.32117502 0.082587862
    +[4,] 0.4308138 0.29109043 0.065560908
    +[5,] 0.3625256 0.38669397 0.137491189
    +[6,] 0.3063341 0.42415495 0.195763821

    Multicore Parallel Processing

    @@ -1122,7 +1196,7 @@

    Multicore Parallel Processing

  • Note: once registered, you should see in your task manager/activity monitor that 4 “R Session” appear when you run your code
  • -

    \(\pagebreak\)

    +

    \(\pagebreak\)

    @@ -1152,7 +1226,7 @@

    prcomp Function

    -
    # load  spam data
    +
    # load  spam data
     data(spam)
     # perform PCA on dataset
     prComp <- prcomp(log10(spam[,-58]+1))
     # print out the eigenvector/rotations first 5 rows and PCs
    -head(prComp$rotation[, 1:5], 5)
    -
    ##                 PC1           PC2         PC3         PC4          PC5
    -## make    0.019370409  0.0427855959 -0.01631961  0.02798232 -0.014903314
    -## address 0.010827343  0.0408943785  0.07074906 -0.01407049  0.037237531
    -## all     0.040923168  0.0825569578 -0.03603222  0.04563653  0.001222215
    -## num3d   0.006486834 -0.0001333549  0.01234374 -0.01005991 -0.001282330
    -## our     0.036963221  0.0941456085 -0.01871090  0.05098463 -0.010582039
    -
    # create new variable that marks spam as 2 and nospam as 1
    +head(prComp$rotation[, 1:5], 5)
    +
                    PC1           PC2         PC3         PC4          PC5
    +make    0.019370409  0.0427855959 -0.01631961  0.02798232 -0.014903314
    +address 0.010827343  0.0408943785  0.07074906 -0.01407049  0.037237531
    +all     0.040923168  0.0825569578 -0.03603222  0.04563653  0.001222215
    +num3d   0.006486834 -0.0001333549  0.01234374 -0.01005991 -0.001282330
    +our     0.036963221  0.0941456085 -0.01871090  0.05098463 -0.010582039
    +
    # create new variable that marks spam as 2 and nospam as 1
     typeColor <- ((spam$type=="spam")*1 + 1)
     # plot the first two principal components
    -plot(prComp$x[,1],prComp$x[,2],col=typeColor,xlab="PC1",ylab="PC2")
    -

    +plot(prComp$x[,1],prComp$x[,2],col=typeColor,xlab="PC1",ylab="PC2")
    +

    caret Package

    @@ -1201,7 +1275,7 @@

    caret Package

  • Note: the same PCA must be performed on the test set
  • -
    # create train and test sets
    +
    # create train and test sets
     inTrain <- createDataPartition(y=spam$type,p=0.75, list=FALSE)
     training <- spam[inTrain,]
     testing <- spam[-inTrain,]
    @@ -1209,117 +1283,120 @@ 

    caret Package

    preProc <- preProcess(log10(training[,-58]+1),method="pca",pcaComp=2) # calculate PCs for training data trainPC <- predict(preProc,log10(training[,-58]+1)) +# add variable `type` to trainPC +type <- training$type +trainPC <- data.frame(trainPC, type) # join variable `type` # run model on outcome and principle components -modelFit <- train(training$type ~ .,method="glm",data=trainPC) +modelFit <- train(type ~ .,method="glm",data=trainPC) # calculate PCs for test data testPC <- predict(preProc,log10(testing[,-58]+1)) # compare results -confusionMatrix(testing$type,predict(modelFit,testPC))
    -
    ## Confusion Matrix and Statistics
    -## 
    -##           Reference
    -## Prediction nonspam spam
    -##    nonspam     656   41
    -##    spam         82  371
    -##                                           
    -##                Accuracy : 0.893           
    -##                  95% CI : (0.8737, 0.9103)
    -##     No Information Rate : 0.6417          
    -##     P-Value [Acc > NIR] : < 2.2e-16       
    -##                                           
    -##                   Kappa : 0.7724          
    -##  Mcnemar's Test P-Value : 0.0003101       
    -##                                           
    -##             Sensitivity : 0.8889          
    -##             Specificity : 0.9005          
    -##          Pos Pred Value : 0.9412          
    -##          Neg Pred Value : 0.8190          
    -##              Prevalence : 0.6417          
    -##          Detection Rate : 0.5704          
    -##    Detection Prevalence : 0.6061          
    -##       Balanced Accuracy : 0.8947          
    -##                                           
    -##        'Positive' Class : nonspam         
    -## 
    +confusionMatrix(testing$type,predict(modelFit,testPC))
    +
    Confusion Matrix and Statistics
    +
    +          Reference
    +Prediction nonspam spam
    +   nonspam     656   41
    +   spam         82  371
    +                                          
    +               Accuracy : 0.893           
    +                 95% CI : (0.8737, 0.9103)
    +    No Information Rate : 0.6417          
    +    P-Value [Acc > NIR] : < 2.2e-16       
    +                                          
    +                  Kappa : 0.7724          
    + Mcnemar's Test P-Value : 0.0003101       
    +                                          
    +            Sensitivity : 0.8889          
    +            Specificity : 0.9005          
    +         Pos Pred Value : 0.9412          
    +         Neg Pred Value : 0.8190          
    +             Prevalence : 0.6417          
    +         Detection Rate : 0.5704          
    +   Detection Prevalence : 0.6061          
    +      Balanced Accuracy : 0.8947          
    +                                          
    +       'Positive' Class : nonspam         
    +                                          
    -
    # construct model
    -modelFit <- train(training$type ~ .,method="glm",preProcess="pca",data=training)
    +
    # construct model
    +modelFit <- train(type ~ .,method="glm",preProcess="pca",data=training)
     # print results of model
    -confusionMatrix(testing$type,predict(modelFit,testing))
    -
    ## Confusion Matrix and Statistics
    -## 
    -##           Reference
    -## Prediction nonspam spam
    -##    nonspam     668   29
    -##    spam         59  394
    -##                                           
    -##                Accuracy : 0.9235          
    -##                  95% CI : (0.9066, 0.9382)
    -##     No Information Rate : 0.6322          
    -##     P-Value [Acc > NIR] : < 2.2e-16       
    -##                                           
    -##                   Kappa : 0.8379          
    -##  Mcnemar's Test P-Value : 0.001992        
    -##                                           
    -##             Sensitivity : 0.9188          
    -##             Specificity : 0.9314          
    -##          Pos Pred Value : 0.9584          
    -##          Neg Pred Value : 0.8698          
    -##              Prevalence : 0.6322          
    -##          Detection Rate : 0.5809          
    -##    Detection Prevalence : 0.6061          
    -##       Balanced Accuracy : 0.9251          
    -##                                           
    -##        'Positive' Class : nonspam         
    -## 
    -

    \(\pagebreak\)

    +confusionMatrix(testing$type,predict(modelFit,testing))
    +
    Confusion Matrix and Statistics
    +
    +          Reference
    +Prediction nonspam spam
    +   nonspam     668   29
    +   spam         59  394
    +                                          
    +               Accuracy : 0.9235          
    +                 95% CI : (0.9066, 0.9382)
    +    No Information Rate : 0.6322          
    +    P-Value [Acc > NIR] : < 2.2e-16       
    +                                          
    +                  Kappa : 0.8379          
    + Mcnemar's Test P-Value : 0.001992        
    +                                          
    +            Sensitivity : 0.9188          
    +            Specificity : 0.9314          
    +         Pos Pred Value : 0.9584          
    +         Neg Pred Value : 0.8698          
    +             Prevalence : 0.6322          
    +         Detection Rate : 0.5809          
    +   Detection Prevalence : 0.6061          
    +      Balanced Accuracy : 0.9251          
    +                                          
    +       'Positive' Class : nonspam         
    +                                          
    +

    \(\pagebreak\)

    Predicting with Regression

    R Commands and Examples

      -
    • lm<-lm(y ~ x, data=train) = runs a linear model of outcome y on predictor x \(\rightarrow\) univariate regression +
    • lm<-lm(y ~ x, data=train) = runs a linear model of outcome y on predictor x \(\rightarrow\) univariate regression
        -
      • summary(lm) = returns summary of the linear regression model, which will include coefficients, standard errors, \(t\) statistics, and p values
      • +
      • summary(lm) = returns summary of the linear regression model, which will include coefficients, standard errors, \(t\) statistics, and p values
      • lm(y ~ x1+x2+x3, data=train) = run linear model of outcome y on predictors x1, x2, and x3
      • lm(y ~ ., data=train) = run linear model of outcome y on all predictors
    • -
    • predict(lm, newdata=df) = use the constructed linear model to predict outcomes (\(\hat Y_i\)) for the new values +
    • predict(lm, newdata=df) = use the constructed linear model to predict outcomes (\(\hat Y_i\)) for the new values
      • newdata data frame must have the same variables (factors must have the same levels) as the training data
      • newdata=test = predict outcomes for the test set based on linear regression model from the training
      • @@ -1327,10 +1404,10 @@

        R Commands and Examples

    • RMSE can be calculated to measure the accuracy of the linear model
        -
      • Note: \(RMSE_{test}\), which estimates the out-of-sample error, is almost always GREATER than \(RMSE_{train}\)
        +
      • Note: \(RMSE_{test}\), which estimates the out-of-sample error, is almost always GREATER than \(RMSE_{train}\)
    -
    # load data
    +
    # load data
     data(faithful)
     # create train and test sets
     inTrain <- createDataPartition(y=faithful$waiting, p=0.5, list=FALSE)
    @@ -1338,31 +1415,31 @@ 

    R Commands and Examples

    # build linear model lm1 <- lm(eruptions ~ waiting,data=trainFaith) # print summary of linear model -summary(lm1)
    -
    ## 
    -## Call:
    -## lm(formula = eruptions ~ waiting, data = trainFaith)
    -## 
    -## Residuals:
    -##      Min       1Q   Median       3Q      Max 
    -## -1.24867 -0.36292  0.00002  0.35768  1.19858 
    -## 
    -## Coefficients:
    -##              Estimate Std. Error t value Pr(>|t|)    
    -## (Intercept) -2.165648   0.227486   -9.52   <2e-16 ***
    -## waiting      0.079396   0.003146   25.24   <2e-16 ***
    -## ---
    -## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    -## 
    -## Residual standard error: 0.5013 on 135 degrees of freedom
    -## Multiple R-squared:  0.8251, Adjusted R-squared:  0.8238 
    -## F-statistic: 636.9 on 1 and 135 DF,  p-value: < 2.2e-16
    -
    # predict eruptions for new waiting time
    +summary(lm1)
    +
    
    +Call:
    +lm(formula = eruptions ~ waiting, data = trainFaith)
    +
    +Residuals:
    +     Min       1Q   Median       3Q      Max 
    +-1.30246 -0.40746  0.03955  0.40465  1.19221 
    +
    +Coefficients:
    +             Estimate Std. Error t value Pr(>|t|)    
    +(Intercept) -1.858966   0.237636  -7.823 1.33e-12 ***
    +waiting      0.075444   0.003287  22.952  < 2e-16 ***
    +---
    +Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    +
    +Residual standard error: 0.5272 on 135 degrees of freedom
    +Multiple R-squared:  0.796, Adjusted R-squared:  0.7945 
    +F-statistic: 526.8 on 1 and 135 DF,  p-value: < 2.2e-16
    +
    # predict eruptions for new waiting time
     newdata <- data.frame(waiting=80)
    -predict(lm1,newdata)
    -
    ##     1 
    -## 4.186
    -
    # create 1 x 2 panel plot
    +predict(lm1,newdata)
    +
           1 
    +4.176566 
    +
    # create 1 x 2 panel plot
     par(mfrow=c(1,2))
     # plot train data with the regression line
     plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",
    @@ -1371,30 +1448,30 @@ 

    R Commands and Examples

    # plot test data with the regression line plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue",xlab="Waiting", ylab="Duration", main = "Test") -lines(testFaith$waiting,predict(lm1,newdata=testFaith),lwd=3)
    -

    -
    # Calculate RMSE on training and test sets
    +lines(testFaith$waiting,predict(lm1,newdata=testFaith),lwd=3)
    +

    +
    # Calculate RMSE on training and test sets
     c(trainRMSE = sqrt(sum((lm1$fitted-trainFaith$eruptions)^2)),
    -    testRMSE = sqrt(sum((predict(lm1,newdata=testFaith)-testFaith$eruptions)^2)))
    -
    ## trainRMSE  testRMSE 
    -##  5.824859  5.788547
    + testRMSE = sqrt(sum((predict(lm1,newdata=testFaith)-testFaith$eruptions)^2)))
    +
    trainRMSE  testRMSE 
    + 6.125867  5.388723 
    -
    # calculate prediction interval
    +
    # calculate prediction interval
     pred1 <- predict(lm1,newdata=testFaith,interval="prediction")
     # plot data points (eruptions, waiting)
     plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue")
     # plot fit line and prediction interval
    -matlines(testFaith$waiting,pred1,type="l",,col=c(1,2,2),lty = c(1,1,1), lwd=3)
    -

    +matlines(testFaith$waiting,pred1,type="l",,col=c(1,2,2),lty = c(1,1,1), lwd=3)
    +

    -
    # create train and test sets
    +
    # create train and test sets
     inTrain <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
     training <- Wage[inTrain,]; testing <- Wage[-inTrain,]
     # fit linear model for age jobclass and education
    @@ -1419,14 +1496,14 @@ 

    R Commands and Examples

    # set up 2 x 2 panel plot par(mfrow = c(2, 2)) # construct diagnostic plots for model -plot(finMod,pch=19,cex=0.5,col="#00000010")
    -

    +plot(finMod,pch=19,cex=0.5,col="#00000010")
    +

    -
    # plot fitted values by residuals 
    -qplot(finMod$fitted, finMod$residuals, color=race, data=training)
    -

    +
    # plot fitted values by residuals 
    +qplot(finMod$fitted, finMod$residuals, color=race, data=training)
    +

    -
    # plot residual by index
    -plot(finMod$residuals,pch=19,cex=0.5)
    -

    +
    # plot residual by index
    +plot(finMod$residuals,pch=19,cex=0.5)
    +

    -

    \(\pagebreak\)

    +

    \(\pagebreak\)

    Prediction with Trees

    Measures of Impurity (Reference)

    -

    \[\hat{p}_{mk} = \frac{\sum_{i}^m \mathbb{1}(y_i = k)}{N_m}\]

    +

    \[\hat{p}_{mk} = \frac{\sum_{i}^m \mathbb{1}(y_i = k)}{N_m}\]

    -
    # set margin and seed
    +
    # set margin and seed
     par(mar=c(1,1,1,1), mfrow = c(1, 2)); set.seed(1234);
     # simulate data
     x = rep(1:4,each=4); y = rep(1:4,4)
     # plot first scenario
     plot(x,y,xaxt="n",yaxt="n",cex=3,col=c(rep("blue",15),rep("red",1)),pch=19)
     # plot second scenario
    -plot(x,y,xaxt="n",yaxt="n",cex=3,col=c(rep("blue",8),rep("red",8)),pch=19)
    -

    +plot(x,y,xaxt="n",yaxt="n",cex=3,col=c(rep("blue",8),rep("red",8)),pch=19)
    +

    @@ -1540,12 +1617,12 @@

    Constructing Trees with caret Package

  • [rattle package] fancyRpartPlot(tree$finalModel) = produces more readable, better formatted classification tree diagrams
  • each split will have the condition/node in bold and the splits/leafs on the left and right sides following the “yes” or “no” indicators
  • -
    # load iris data set
    +
    # load iris data set
     data(iris)
     # create test/train data sets
     inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
    @@ -1554,31 +1631,31 @@ 

    Constructing Trees with caret Package

    # fit classification tree as a model modFit <- train(Species ~ .,method="rpart",data=training) # print the classification tree -print(modFit$finalModel)
    -
    ## n= 105 
    -## 
    -## node), split, n, loss, yval, (yprob)
    -##       * denotes terminal node
    -## 
    -## 1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)  
    -##   2) Petal.Length< 2.45 35  0 setosa (1.00000000 0.00000000 0.00000000) *
    -##   3) Petal.Length>=2.45 70 35 versicolor (0.00000000 0.50000000 0.50000000)  
    -##     6) Petal.Width< 1.65 34  1 versicolor (0.00000000 0.97058824 0.02941176) *
    -##     7) Petal.Width>=1.65 36  2 virginica (0.00000000 0.05555556 0.94444444) *
    -
    # plot the classification tree
    -rattle::fancyRpartPlot(modFit$finalModel)
    -

    -
    # predict on test values
    -predict(modFit,newdata=testing)
    -
    ##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
    -##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
    -## [13] setosa     setosa     setosa     versicolor versicolor versicolor
    -## [19] versicolor versicolor versicolor versicolor versicolor versicolor
    -## [25] versicolor versicolor versicolor versicolor versicolor versicolor
    -## [31] virginica  virginica  virginica  virginica  virginica  virginica 
    -## [37] versicolor virginica  virginica  versicolor versicolor virginica 
    -## [43] virginica  virginica  virginica 
    -## Levels: setosa versicolor virginica
    +print(modFit$finalModel)
    +
    n= 105 
    +
    +node), split, n, loss, yval, (yprob)
    +      * denotes terminal node
    +
    +1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)  
    +  2) Petal.Length< 2.45 35  0 setosa (1.00000000 0.00000000 0.00000000) *
    +  3) Petal.Length>=2.45 70 35 versicolor (0.00000000 0.50000000 0.50000000)  
    +    6) Petal.Width< 1.65 34  1 versicolor (0.00000000 0.97058824 0.02941176) *
    +    7) Petal.Width>=1.65 36  2 virginica (0.00000000 0.05555556 0.94444444) *
    +
    # plot the classification tree
    +rattle::fancyRpartPlot(modFit$finalModel)
    +

    +
    # predict on test values
    +predict(modFit,newdata=testing)
    +
     [1] setosa     setosa     setosa     setosa     setosa     setosa    
    + [7] setosa     setosa     setosa     setosa     setosa     setosa    
    +[13] setosa     setosa     setosa     versicolor versicolor versicolor
    +[19] versicolor versicolor versicolor versicolor versicolor versicolor
    +[25] versicolor versicolor versicolor versicolor versicolor versicolor
    +[31] virginica  virginica  virginica  virginica  virginica  virginica 
    +[37] versicolor virginica  virginica  versicolor versicolor virginica 
    +[43] virginica  virginica  virginica 
    +Levels: setosa versicolor virginica
    @@ -1600,7 +1677,7 @@

    Bagging

    -
    # load data
    +
    # load data
     library(ElemStatLearn); data(ozone,package="ElemStatLearn")
     # reorder rows based on ozone variable
     ozone <- ozone[order(ozone$ozone),]
    @@ -1622,8 +1699,8 @@ 

    Bagging

    # plot each prediction model for(i in 1:10){lines(1:155,ll[i,],col="grey",lwd=2)} # plot the average in red -lines(1:155,apply(ll,2,mean),col="red",lwd=2)
    -

    +lines(1:155,apply(ll,2,mean),col="red",lwd=2)
    +

    Bagging Algorithms

  • example
  • -
    # load relevant package and data
    +
    # load relevant package and data
     library(party); data(ozone,package="ElemStatLearn")
     # reorder rows based on ozone variable
     ozone <- ozone[order(ozone$ozone),]
    @@ -1667,9 +1744,9 @@ 

    Bagging Algorithms

    # plot the first fit points(ozone$ozone,predict(treebag$fits[[1]]$fit,predictors),pch=19,col="red") # plot the aggregated predictions -points(ozone$ozone,predict(treebag,predictors),pch=19,col="blue")
    -

    -

    \(\pagebreak\)

    +points(ozone$ozone,predict(treebag,predictors),pch=19,col="blue")
    +

    +

    \(\pagebreak\)

    @@ -1680,7 +1757,7 @@

    Random Forest

  • one of the most used/accurate algorithms along with boosting
  • -

    +

  • getTree(rf$finalModel, k=2) = return specific tree from random forest model
  • @@ -1725,7 +1802,7 @@

    R Commands and Examples

  • example
  • -
    # load data
    +
    # load data
     data(iris)
     # create train/test data sets
     inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
    @@ -1734,38 +1811,38 @@ 

    R Commands and Examples

    # apply random forest modFit <- train(Species~ .,data=training,method="rf",prox=TRUE) # return the second tree (first 6 rows) -head(getTree(modFit$finalModel,k=2))
    -
    ##   left daughter right daughter split var split point status prediction
    -## 1             2              3         4        0.70      1          0
    -## 2             0              0         0        0.00     -1          1
    -## 3             4              5         4        1.75      1          0
    -## 4             6              7         3        5.30      1          0
    -## 5             0              0         0        0.00     -1          3
    -## 6             8              9         3        4.95      1          0
    -
    # compute cluster centers
    +head(getTree(modFit$finalModel,k=2))
    +
      left daughter right daughter split var split point status prediction
    +1             2              3         3        2.60      1          0
    +2             0              0         0        0.00     -1          1
    +3             4              5         4        1.65      1          0
    +4             6              7         3        5.25      1          0
    +5             8              9         3        4.85      1          0
    +6             0              0         0        0.00     -1          2
    +
    # compute cluster centers
     irisP <- classCenter(training[,c(3,4)], training$Species, modFit$finalModel$prox)
     # convert irisP to data frame and add Species column
     irisP <- as.data.frame(irisP); irisP$Species <- rownames(irisP)
     # plot data points
     p <- qplot(Petal.Width, Petal.Length, col=Species,data=training)
     # add the cluster centers
    -p + geom_point(aes(x=Petal.Width,y=Petal.Length,col=Species),size=5,shape=4,data=irisP)
    -

    -
    # predict outcome for test data set using the random forest model
    +p + geom_point(aes(x=Petal.Width,y=Petal.Length,col=Species),size=5,shape=4,data=irisP)
    +

    +
    # predict outcome for test data set using the random forest model
     pred <- predict(modFit,testing)
     # logic value for whether or not the rf algorithm predicted correctly
     testing$predRight <- pred==testing$Species
     # tabulate results
    -table(pred,testing$Species)
    -
    ##             
    -## pred         setosa versicolor virginica
    -##   setosa         15          0         0
    -##   versicolor      0         15         2
    -##   virginica       0          0        13
    -
    # plot data points with the incorrect classification highlighted
    -qplot(Petal.Width,Petal.Length,colour=predRight,data=testing,main="newdata Predictions")
    -

    -

    \(\pagebreak\)

    +table(pred,testing$Species)
    +
                
    +pred         setosa versicolor virginica
    +  setosa         15          0         0
    +  versicolor      0         14         1
    +  virginica       0          1        14
    +
    # plot data points with the incorrect classification highlighted
    +qplot(Petal.Width,Petal.Length,colour=predRight,data=testing,main="newdata Predictions")
    +

    +

    \(\pagebreak\)

    @@ -1774,28 +1851,28 @@

    Boosting

  • boosting = one of the most widely used and accurate prediction models, along with random forest
  • boosting can be done with any set of classifiers, and a well-known approach is gradient boosting
  • a more detailed tutorial can be found here

  • -
  • process: take a group of weak predictors \(\rightarrow\) weight them and add them up \(\rightarrow\) result in a stronger predictor +
  • process: take a group of weak predictors \(\rightarrow\) weight them and add them up \(\rightarrow\) result in a stronger predictor
  • example

  • -

    +

    -

    +

    @@ -1816,7 +1893,7 @@

    R Commands and Examples

  • predict function can be used to apply the model to test data, similar to the rest of the algorithms in caret package

  • example

  • -
    # load data
    +
    # load data
     data(Wage)
     # remove log wage variable (we are trying to predict wage)
     Wage <- subset(Wage,select=-c(logwage))
    @@ -1826,37 +1903,35 @@ 

    R Commands and Examples

    # run the gbm model modFit <- train(wage ~ ., method="gbm",data=training,verbose=FALSE) # print model summary -print(modFit)
    -
    ## Stochastic Gradient Boosting 
    -## 
    -## 2102 samples
    -##   10 predictor
    -## 
    -## No pre-processing
    -## Resampling: Bootstrapped (25 reps) 
    -## 
    -## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ... 
    -## 
    -## Resampling results across tuning parameters:
    -## 
    -##   interaction.depth  n.trees  RMSE      Rsquared   RMSE SD   Rsquared SD
    -##   1                   50      35.64972  0.3317422  1.495165  0.02312183 
    -##   1                  100      34.95593  0.3429594  1.503223  0.02319651 
    -##   1                  150      34.84473  0.3451634  1.496016  0.02324176 
    -##   2                   50      34.91119  0.3462101  1.490004  0.02425481 
    -##   2                  100      34.74433  0.3487227  1.480423  0.02278780 
    -##   2                  150      34.74823  0.3487136  1.472941  0.02314265 
    -##   3                   50      34.83480  0.3467828  1.493846  0.02292988 
    -##   3                  100      34.85342  0.3449018  1.482881  0.02373242 
    -##   3                  150      34.99413  0.3401694  1.544378  0.02498133 
    -## 
    -## Tuning parameter 'shrinkage' was held constant at a value of 0.1
    -## 
    -## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
    -## RMSE was used to select the optimal model using  the smallest value.
    -## The final values used for the model were n.trees = 100,
    -##  interaction.depth = 2, shrinkage = 0.1 and n.minobsinnode = 10.
    -

    \(\pagebreak\)

    +print(modFit)
    +
    Stochastic Gradient Boosting 
    +
    +2102 samples
    +  10 predictors
    +
    +No pre-processing
    +Resampling: Bootstrapped (25 reps) 
    +Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ... 
    +Resampling results across tuning parameters:
    +
    +  interaction.depth  n.trees  RMSE      Rsquared 
    +  1                   50      35.24716  0.3061446
    +  1                  100      34.74137  0.3165154
    +  1                  150      34.70108  0.3179425
    +  2                   50      34.67703  0.3199690
    +  2                  100      34.58430  0.3226554
    +  2                  150      34.67104  0.3202411
    +  3                   50      34.59842  0.3219575
    +  3                  100      34.73009  0.3183701
    +  3                  150      34.97108  0.3107605
    +
    +Tuning parameter 'shrinkage' was held constant at a value of 0.1
    +
    +Tuning parameter 'n.minobsinnode' was held constant at a value of 10
    +RMSE was used to select the optimal model using  the smallest value.
    +The final values used for the model were n.trees = 100,
    + interaction.depth = 2, shrinkage = 0.1 and n.minobsinnode = 10. 
    +

    \(\pagebreak\)

    @@ -1869,34 +1944,34 @@

    Model Based Prediction

  • can be reasonably accurate on real problems
  • this approach does make additional assumptions about the data, which can lead to model failure/reduced accuracy if they are too far off
  • -
  • goal = build parameter-based model (based on probabilities) for conditional distribution \(P(Y = k~|~X = x)\), or the probability of the outcome \(Y\) is equal to a particular value \(k\) given a specific set of predictor variables \(x\) +
  • goal = build parameter-based model (based on probabilities) for conditional distribution \(P(Y = k~|~X = x)\), i.e. the probability that the outcome \(Y\) is equal to a particular value \(k\) given a specific set of predictor variables \(x\)
  • typical approach/process
      -
    1. start with the quantity \(P(Y = k~|~X = x)\)
    2. -
    3. apply Bayes’ Theorem such that \[ P(Y = k ~|~ X=x) = \frac{P(X=x~|~Y=k)P(Y=k)}{\sum_{\ell=1}^K P(X=x ~|~Y = \ell) P(Y=\ell)}\] where the denominator is simply the sum of probabilities for the predictor variables are the set specified in \(x\) for all outcomes of \(Y\)
    4. -
    5. assume the term \(P(X=x~|~Y=k)\) in the numerator follows a parameter-based probability distribution, or \(f_k(x)\) +
    6. start with the quantity \(P(Y = k~|~X = x)\)
    7. +
    8. apply Bayes’ Theorem such that \[ P(Y = k ~|~ X=x) = \frac{P(X=x~|~Y=k)P(Y=k)}{\sum_{\ell=1}^K P(X=x ~|~Y = \ell) P(Y=\ell)}\] where the denominator is simply the total probability of observing the predictor values specified in \(x\), summed over all possible outcomes of \(Y\)
    9. +
    10. assume the term \(P(X=x~|~Y=k)\) in the numerator follows a parameter-based probability distribution, or \(f_k(x)\)
        -
      • common choice = Gaussian distribution \[f_k(x) = \frac{1}{\sigma_k \sqrt{2 \pi}}e^{-\frac{(x-\mu_k)^2}{2\sigma_k^2}}\]
      • +
      • common choice = Gaussian distribution \[f_k(x) = \frac{1}{\sigma_k \sqrt{2 \pi}}e^{-\frac{(x-\mu_k)^2}{2\sigma_k^2}}\]
    11. -
    12. assume the probability for the outcome \(Y\) to take on value of \(k\), or \(P(Y=k)\), is determined from the data to be some known quantity \(\pi_k\) +
    13. assume the probability for the outcome \(Y\) to take on value of \(k\), or \(P(Y=k)\), is determined from the data to be some known quantity \(\pi_k\)
    14. -
    15. so the quantity \(P(Y = k~|~X = x)\) can be rewritten as \[P(Y = k ~|~ X=x) = \frac{f_k(x) \pi_k}{\sum_{\ell = 1}^K f_{\ell}(x) \pi_{\ell}}\]
    16. -
    17. estimate the parameters (\(\mu_k\), \(\sigma_k^2\)) for the function \(f_k(x)\) from the data
    18. -
    19. calculate \(P(Y = k~|~X = x)\) using the parameters
    20. -
    21. the outcome \(Y\) is where the value of \(P(Y = k ~|~ X = x)\) is the highest
    22. +
    23. so the quantity \(P(Y = k~|~X = x)\) can be rewritten as \[P(Y = k ~|~ X=x) = \frac{f_k(x) \pi_k}{\sum_{\ell = 1}^K f_{\ell}(x) \pi_{\ell}}\]
    24. +
    25. estimate the parameters (\(\mu_k\), \(\sigma_k^2\)) for the function \(f_k(x)\) from the data
    26. +
    27. calculate \(P(Y = k~|~X = x)\) using the parameters
    28. +
    29. the outcome \(Y\) is where the value of \(P(Y = k ~|~ X = x)\) is the highest
  • prediction models that leverage this approach
  • Naive Bayes

    +
     [1] setosa     setosa     setosa     setosa     setosa     setosa    
    + [7] setosa     setosa     setosa     setosa     setosa     setosa    
    +[13] setosa     setosa     setosa     versicolor versicolor versicolor
    +[19] versicolor versicolor versicolor versicolor versicolor versicolor
    +[25] versicolor versicolor versicolor versicolor versicolor versicolor
    +[31] virginica  virginica  virginica  virginica  virginica  virginica 
    +[37] versicolor virginica  virginica  virginica  virginica  virginica 
    +[43] virginica  virginica  virginica 
    +Levels: setosa versicolor virginica

    Compare Results for LDA and Naive Bayes

    @@ -2022,22 +2099,22 @@

    Compare Results for LDA and Naive Bayes

  • linear discriminant analysis and naive Bayes generally produce similar results for small data sets
  • for our example data from the iris data set, we can compare the prediction results from the two models
  • -
    # tabulate the prediction results from LDA and naive Bayes
    -table(pred.lda,pred.nb)
    -
    ##             pred.nb
    -## pred.lda     setosa versicolor virginica
    -##   setosa         15          0         0
    -##   versicolor      0         15         0
    -##   virginica       0          0        15
    -
    # create logical variable that returns TRUE for when predictions from the two models match
    +
    # tabulate the prediction results from LDA and naive Bayes
    +table(pred.lda,pred.nb)
    +
                pred.nb
    +pred.lda     setosa versicolor virginica
    +  setosa         15          0         0
    +  versicolor      0         15         0
    +  virginica       0          1        14
    +
    # create logical variable that returns TRUE for when predictions from the two models match
     equalPredictions <- (pred.lda==pred.nb)
     # plot the comparison
    -qplot(Petal.Width,Sepal.Width,colour=equalPredictions,data=testing)
    -

    +qplot(Petal.Width,Sepal.Width,colour=equalPredictions,data=testing)
    +

    -

    \(\pagebreak\)

    +

    \(\pagebreak\)

    @@ -2049,7 +2126,7 @@

    Model Selection

  • the error for the prediction model on test set decreases first and then increases as number of predictors used approaches the total number of predictors available
  • -

    +

    -
    # load data and set seed
    +
    # load data and set seed
     data(prostate); set.seed(1)
     # define outcome y and predictors x
     covnames <- names(prostate[-(9:10)])
    @@ -2133,8 +2210,8 @@ 

    Example: Training vs Test Error for Combination of Predictors

    # plot line through the minimum test RSS data points in red lines((1:p), minrss, col="red", lwd=1.7) # add legend -legend("topright", c("Train", "Test"), col=c("blue", "red"), pch=1)
    -

    +legend("topright", c("Train", "Test"), col=c("blue", "red"), pch=1)
    +

    @@ -2161,16 +2238,16 @@

    Split Samples

    Decompose Expected Prediction Error

    @@ -2186,9 +2263,9 @@

    Hard Thresholding

  • hard thresholding can help estimate the coefficients/model by taking subsets of predictors and building models
  • process
  • problem
  • -
    # load prostate data
    +
    # load prostate data
     data(prostate)
     # create subset of observations with 10 variables
     small = prostate[1:5,]
     # print linear regression
    -lm(lpsa ~ .,data =small)
    -
    ## 
    -## Call:
    -## lm(formula = lpsa ~ ., data = small)
    -## 
    -## Coefficients:
    -## (Intercept)       lcavol      lweight          age         lbph  
    -##     9.60615      0.13901     -0.79142      0.09516           NA  
    -##         svi          lcp      gleason        pgg45    trainTRUE  
    -##          NA           NA     -2.08710           NA           NA
    +lm(lpsa ~ .,data =small)
    +
    
    +Call:
    +lm(formula = lpsa ~ ., data = small)
    +
    +Coefficients:
    +(Intercept)       lcavol      lweight          age         lbph  
    +    9.60615      0.13901     -0.79142      0.09516           NA  
    +        svi          lcp      gleason        pgg45    trainTRUE  
    +         NA           NA     -2.08710           NA           NA  

    Regularized Regression Concept (Resource)

    -

    +

    Regularized Regression - LASSO Regression

    -
  • [caret package] predict(model,test) = use the model to predict on test set \(\rightarrow\) similar to all other caret algorithms
  • +
  • caret package predict(model,test) = use the model to predict on test set \(\rightarrow\) similar to all other caret algorithms
  • example: lars package
  • -
    # load lars package
    +
    # load lars package
     library(lars)
     # perform lasso regression
     lasso.fit <- lars(as.matrix(x), y, type="lasso", trace=TRUE)
    @@ -2374,12 +2451,12 @@ 

    Regularized Regression - LASSO Regression

    plot(lasso.fit, breaks=FALSE, cex = 0.75) # add legend legend("topleft", covnames, pch=8, lty=1:length(covnames), - col=1:length(covnames), cex = 0.6)
    -

    -
    # plots the cross validation curve
    -lasso.cv <- cv.lars(as.matrix(x), y, K=10, type="lasso", trace=TRUE)
    -

    -

    \(\pagebreak\)

    + col=1:length(covnames), cex = 0.6)
    +

    +
    # plots the cross validation curve
    +lasso.cv <- cv.lars(as.matrix(x), y, K=10, type="lasso", trace=TRUE)
    +

    +

    \(\pagebreak\)

    @@ -2437,7 +2514,7 @@

    Example - Majority Vote

  • each has 70% accuracy
  • majority vote accuracy (mva) = probability that the majority of the models (at least 3 of the 5) predict correctly at the same time, given that each model is correct 70% of the time
  • -

    \[\begin{aligned} +

    \[\begin{aligned} \mbox{majority vote accuracy} & = p(3~correct,~2~wrong) + p(4~correct,~1~wrong) \\ &\qquad+ p(5~correct) \\ & = {5 \choose 3} \times(0.7)^3(0.3)^2 + {5 \choose 4} \times(0.7)^4(0.3)^1 + {5 \choose 5} (0.7)^5 \\ @@ -2450,7 +2527,7 @@

    Example - Majority Vote

    Example - Model Ensembling

    -
    # set up data
    +
    # set up data
     inBuild <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
     validation <- Wage[-inBuild,]; buildData <- Wage[inBuild,]
     inTrain <- createDataPartition(y=buildData$wage,p=0.7, list=FALSE)
    @@ -2483,11 +2560,11 @@ 

    Example - Model Ensembling

    # validation data set RMSE Errors validation = c(sqrt(sum((glm.pred.val-validation$wage)^2)), sqrt(sum((rf.pred.val-validation$wage)^2)), - sqrt(sum((comb.pred.val-validation$wage)^2))))
    -
    ##                  glm        rf  combined
    -## test        858.7074  888.0702  849.3771
    -## validation 1061.0891 1086.2027 1057.8264
    -

    \(\pagebreak\)

    + sqrt(sum((comb.pred.val-validation$wage)^2))))
    +
                     glm        rf  combined
    +test        858.7074  888.5536  849.4654
    +validation 1061.0891 1085.4773 1056.5813
    +

    \(\pagebreak\)

    @@ -2530,8 +2607,8 @@

    Forecasting

  • approaches
  • ts(googOpen, frequency=12) = convert data to a time series with frequency observations per time unit
  • decompose(ts) = decomposes time series into trend, seasonal, and irregular components by using moving averages @@ -2607,25 +2684,25 @@

    R Commands and Examples

  • quandl package is also used for finance-related predictions
  • example: decomposed time series
  • -
    # load quantmod package
    +
    # load quantmod package
     library(quantmod);
     # specify to and from dates
     from.dat <- as.Date("01/01/00", format="%m/%d/%y")
     to.dat <- as.Date("3/2/15", format="%m/%d/%y")
     # get data for AAPL from Google Finance for the specified dates
    -getSymbols("AAPL", src="google", from = from.dat, to = to.dat)
    -
    ## [1] "AAPL"
    -
    # convert the retrieved daily data to monthly data
    +getSymbols("AAPL", src="google", from = from.dat, to = to.dat)
    +
    [1] "AAPL"
    +
    # convert the retrieved daily data to monthly data
     mAAPL <- to.monthly(AAPL)
     # extract the closing price and convert it to yearly time series (12 observations per year)
     ts <- ts(Cl(mAAPL), frequency = 12)
     # plot the decomposed parts of the time series
    -plot(decompose(ts),xlab="Years")
    -

    +plot(decompose(ts),xlab="Years")
    +

    -
    # load forecast library
    +
    # load forecast library
     library(forecast)
     # find the number of rows (years)
     rows <- ceiling(length(ts)/12)
    @@ -2636,24 +2713,24 @@ 

    R Commands and Examples

    # plot the training set plot(ts.train) # add the moving average in red -lines(ma(ts.train,order=3),col="red")
    -

    -
    # compute the exponential smoothing average
    +lines(ma(ts.train,order=3),col="red")
    +

    +
    # compute the exponential smoothing average
     ets <- ets(ts.train,model="MMM")
     # construct a forecasting model using the exponential smoothing function
     fcast <- forecast(ets)
     # plot forecast and add actual data in red
    -plot(fcast); lines(ts.test,col="red")
    -

    -
    # print the accuracy of the forecast model
    -accuracy(fcast,ts.test)
    -
    ##                      ME      RMSE       MAE       MPE     MAPE      MASE
    -## Training set  0.1188298  2.825883  1.646959  -0.61217 10.68901 0.1924329
    -## Test set     -7.8132889 16.736910 15.079222 -13.64900 20.31005 1.7618772
    -##                    ACF1 Theil's U
    -## Training set 0.09773823        NA
    -## Test set     0.84664431  3.360515
    -

    \(\pagebreak\)

    +plot(fcast); lines(ts.test,col="red")
    +

    +
    # print the accuracy of the forecast model
    +accuracy(fcast,ts.test)
    +
                        ME      RMSE       MAE        MPE     MAPE      MASE
    +Training set 0.3738734  2.576067  1.438703  0.2377946 10.38942 0.1759433
    +Test set     0.7292502 18.507031 15.347659 -3.6663982 19.01227 1.8769111
    +                  ACF1 Theil's U
    +Training set 0.1026281        NA
    +Test set     0.8661382  3.200942
    +

    \(\pagebreak\)

    @@ -2700,7 +2777,7 @@

    R Commands and Examples

  • cl_predict function in clue package provides similar functionality
  • -
    # load iris data
    +
    # load iris data
     data(iris)
     # create training and test sets
     inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
    @@ -2715,8 +2792,8 @@ 

    R Commands and Examples

    ggtitle("Clusters Classification") p2 <- qplot(Petal.Width,Petal.Length,colour=Species,data=training) + ggtitle("Species Classification (Truth)") -grid.arrange(p1, p2, ncol = 2)
    -

    +grid.arrange(p1, p2, ncol = 2)
    +

    -
    # tabulate the results from clustering and actual species
    -table(kMeans1$cluster,training$Species)
    -
    ##    
    -##     setosa versicolor virginica
    -##   1     35          0         0
    -##   2      0          0        27
    -##   3      0         35         8
    +
    # tabulate the results from clustering and actual species
    +table(kMeans1$cluster,training$Species)
    +
       
    +    setosa versicolor virginica
    +  1      0          2        27
    +  2     35          0         0
    +  3      0         33         8
    -
    # build classification trees using the k-means cluster
    -clustering <- train(clusters ~.,data=subset(training,select=-c(Species)),method="rpart")
    +
    # build classification trees using the k-means cluster
    +clustering <- train(clusters ~.,data=subset(training,select=-c(Species)),method="rpart")
    -
    # tabulate the prediction results on training set vs truth
    -table(predict(clustering,training),training$Species)
    -
    ##    
    -##     setosa versicolor virginica
    -##   1     35          0         0
    -##   2      0          0        29
    -##   3      0         35         6
    +
    # tabulate the prediction results on training set vs truth
    +table(predict(clustering,training),training$Species)
    +
       
    +    setosa versicolor virginica
    +  1      0          0        24
    +  2     35          0         0
    +  3      0         35        11
    -
    # tabulate the prediction results on test set vs truth
    -table(predict(clustering,testing),testing$Species)
    -
    ##    
    -##     setosa versicolor virginica
    -##   1     15          0         0
    -##   2      0          1        12
    -##   3      0         14         3
    +
    # tabulate the prediction results on test set vs truth
    +table(predict(clustering,testing),testing$Species)
    +
       
    +    setosa versicolor virginica
    +  1      0          0        10
    +  2     15          0         0
    +  3      0         15         5
    + + diff --git a/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.pdf b/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.pdf index eee0c0c..54eb951 100644 Binary files a/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.pdf and b/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.pdf differ