diff --git a/8_PREDMACHLEARN/.directory b/8_PREDMACHLEARN/.directory
new file mode 100644
index 0000000..f88ed35
--- /dev/null
+++ b/8_PREDMACHLEARN/.directory
@@ -0,0 +1,4 @@
+[Dolphin]
+Timestamp=2017,2,16,14,12,56
+Version=3
+ViewMode=1
diff --git a/8_PREDMACHLEARN/Practical Machine Learning Course Notes.Rmd b/8_PREDMACHLEARN/Practical Machine Learning Course Notes.Rmd
index 96fae3d..ace370a 100644
--- a/8_PREDMACHLEARN/Practical Machine Learning Course Notes.Rmd
+++ b/8_PREDMACHLEARN/Practical Machine Learning Course Notes.Rmd
@@ -13,6 +13,10 @@ header-includes:
\usepackage{graphicx}
\usepackage{mathtools}
---
+```{r setup, include=FALSE, error=TRUE, message=FALSE}
+knitr::opts_chunk$set(echo = TRUE, comment = NA, error = TRUE)
+```
+
$\pagebreak$
@@ -427,7 +431,8 @@ folds$test[[1]]
```{r}
# returns the arguments of the default train function
-args(train.default)
+# args(train.default) <- this doesn't work anymore in caret
+args(train)
```
* `train` function has a large set of parameters, below are the default options
@@ -774,8 +779,11 @@ testing <- spam[-inTrain,]
preProc <- preProcess(log10(training[,-58]+1),method="pca",pcaComp=2)
# calculate PCs for training data
trainPC <- predict(preProc,log10(training[,-58]+1))
+# add variable `type` to trainPC
+type <- training$type
+trainPC <- data.frame(trainPC, type) # join variable `type`
# run model on outcome and principal components
-modelFit <- train(training$type ~ .,method="glm",data=trainPC)
+modelFit <- train(type ~ .,method="glm",data=trainPC)
# calculate PCs for test data
testPC <- predict(preProc,log10(testing[,-58]+1))
# compare results
@@ -789,7 +797,7 @@ confusionMatrix(testing$type,predict(modelFit,testPC))
```{r message = FALSE, warning = FALSE}
# construct model
-modelFit <- train(training$type ~ .,method="glm",preProcess="pca",data=training)
+modelFit <- train(type ~ .,method="glm",preProcess="pca",data=training)
# print results of model
confusionMatrix(testing$type,predict(modelFit,testing))
```
@@ -1321,6 +1329,7 @@ pred.lda
```{r message = F, warning = F}
+# package needed: klaR
# using the same data from iris, run naive Bayes on training data
nb <- train(Species ~ ., data=training,method="nb")
# predict test outcomes using naive Bayes model
diff --git a/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html b/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html
index 1c9b791..826225d 100644
--- a/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html
+++ b/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html
@@ -8,38 +8,58 @@
+
\(\pagebreak\)
# load data
+# load data
library(kernlab); data(spam); set.seed(333)
# picking a small subset (10 values) from spam data set
smallSpam <- spam[sample(dim(spam)[1],size=10),]
# label spam = 2 and ham = 1
spamLabel <- (smallSpam$type=="spam")*1 + 1
# plot the capitalAve values for the dataset with colors differentiated by spam/ham (2 vs 1)
-plot(smallSpam$capitalAve,col=spamLabel)
-
-# first rule (over-fitting to capture all variation)
+plot(smallSpam$capitalAve,col=spamLabel)
+
+# first rule (over-fitting to capture all variation)
rule1 <- function(x){
prediction <- rep(NA,length(x))
prediction[x > 2.7] <- "spam"
@@ -237,12 +311,12 @@ In Sample vs Out of Sample Errors
return(prediction)
}
# tabulate results of prediction algorithm 1 (in sample error -> no error in this case)
-table(rule1(smallSpam$capitalAve),smallSpam$type)
-##
-## nonspam spam
-## nonspam 5 0
-## spam 0 5
-# second rule (simple, setting a threshold)
+table(rule1(smallSpam$capitalAve),smallSpam$type)
+
+ nonspam spam
+ nonspam 5 0
+ spam 0 5
+# second rule (simple, setting a threshold)
rule2 <- function(x){
prediction <- rep(NA,length(x))
prediction[x > 2.8] <- "spam"
@@ -250,32 +324,32 @@ In Sample vs Out of Sample Errors
return(prediction)
}
# tabulate results of prediction algorithm 2 (in sample error -> 10% in this case)
-table(rule2(smallSpam$capitalAve),smallSpam$type)
-##
-## nonspam spam
-## nonspam 5 1
-## spam 0 4
-# tabulate out of sample error for algorithm 1
-table(rule1(spam$capitalAve),spam$type)
-##
-## nonspam spam
-## nonspam 2141 588
-## spam 647 1225
-# tabulate out of sample error for algorithm 2
-table(rule2(spam$capitalAve),spam$type)
-##
-## nonspam spam
-## nonspam 2224 642
-## spam 564 1171
-# accuracy and total correct for algorithm 1 and 2
+table(rule2(smallSpam$capitalAve),smallSpam$type)
+
+ nonspam spam
+ nonspam 5 1
+ spam 0 4
+# tabulate out of sample error for algorithm 1
+table(rule1(spam$capitalAve),spam$type)
+
+ nonspam spam
+ nonspam 2141 588
+ spam 647 1225
+# tabulate out of sample error for algorithm 2
+table(rule2(spam$capitalAve),spam$type)
+
+ nonspam spam
+ nonspam 2224 642
+ spam 564 1171
+# accuracy and total correct for algorithm 1 and 2
rbind("Rule 1" = c(Accuracy = mean(rule1(spam$capitalAve)==spam$type),
"Total Correct" = sum(rule1(spam$capitalAve)==spam$type)),
"Rule 2" = c(Accuracy = mean(rule2(spam$capitalAve)==spam$type),
- "Total Correct" = sum(rule2(spam$capitalAve)==spam$type)))
-## Accuracy Total Correct
-## Rule 1 0.7315801 3366
-## Rule 2 0.7378831 3395
-\(\pagebreak\)
+ "Total Correct" = sum(rule2(spam$capitalAve)==spam$type)))
+ Accuracy Total Correct
+Rule 1 0.7315801 3366
+Rule 2 0.7378831 3395
+\(\pagebreak\)
\(\pagebreak\)
+\(\pagebreak\)
\(\pagebreak\)
+\(\pagebreak\)
\(\pagebreak\)
+\(\pagebreak\)
\(\pagebreak\)
+\(\pagebreak\)
caret Package (\(\rightarrow\) preProcess(), createDataPartition(), createResample(), createTimeSlices(), train(), predict(), confusionMatrix())
caret package
+createDataPartition(y=data$var, times=1, p=0.75, list=FALSE)
\(\rightarrow\) creates data partitions using given variable
y=data$var
= specifies what outcome/variable to split the data on
times=1
= specifies number of partitions to create (number of data splits performed)
+# load packages and data
library(caret)
# create training set indexes with 75% of data
inTrain <- createDataPartition(y=spam$type,p=0.75, list=FALSE)
@@ -602,18 +676,18 @@ Data Slicing
# subset spam data (the rest) to test
testing <- spam[-inTrain,]
# dimension of original and training dataset
-rbind("original dataset" = dim(spam),"training set" = dim(training))
-## [,1] [,2]
-## original dataset 4601 58
-## training set 3451 58
+rbind("original dataset" = dim(spam),"training set" = dim(training))
+ [,1] [,2]
+original dataset 4601 58
+training set 3451 58
-createFolds(y=data$var, k=10, list=TRUE, returnTrain=TRUE)
= slices the data into \(k\) folds for cross validation and returns \(k\) lists of indices
+createFolds(y=data$var, k=10, list=TRUE, returnTrain=TRUE)
= slices the data into \(k\) folds for cross validation and returns \(k\) lists of indices
y=data$var
= specifies what outcome/variable to split the data on
k=10
= specifies number of folds to create (see K Fold Cross Validation)
-- each training set has approximately \(\frac{k-1}{k}\) of the data (in this case 90%)
-- each test set has approximately \(\frac{1}{k}\) of the data (in this case 10%)
+- each training set has approximately \(\frac{k-1}{k}\) of the data (in this case 90%)
+- each test set has approximately \(\frac{1}{k}\) of the data (in this case 10%)
list=TRUE
= returns \(k\) lists of indices that correspond to the cross-validated sets
@@ -631,39 +705,39 @@ Data Slicing
- example
-# create 10 folds for cross validation and return the training set indices
+# create 10 folds for cross validation and return the training set indices
folds <- createFolds(y=spam$type,k=10,list=TRUE,returnTrain=TRUE)
# structure of the training set indices
-str(folds)
-## List of 10
-## $ Fold01: int [1:4141] 1 2 3 4 6 7 8 9 10 12 ...
-## $ Fold02: int [1:4140] 1 2 3 4 5 7 8 9 10 11 ...
-## $ Fold03: int [1:4141] 1 3 4 5 6 7 8 9 10 11 ...
-## $ Fold04: int [1:4141] 2 3 4 5 6 7 8 9 10 11 ...
-## $ Fold05: int [1:4141] 1 2 3 4 5 6 7 8 9 10 ...
-## $ Fold06: int [1:4141] 1 2 3 4 5 6 7 8 9 10 ...
-## $ Fold07: int [1:4141] 1 2 3 4 5 6 8 9 11 12 ...
-## $ Fold08: int [1:4141] 1 2 3 4 5 6 7 8 9 10 ...
-## $ Fold09: int [1:4141] 1 2 4 5 6 7 10 11 12 13 ...
-## $ Fold10: int [1:4141] 1 2 3 5 6 7 8 9 10 11 ...
-# return the test set indices instead
+str(folds)
+List of 10
+ $ Fold01: int [1:4141] 1 2 3 4 6 7 8 9 10 12 ...
+ $ Fold02: int [1:4140] 1 2 3 4 5 7 8 9 10 11 ...
+ $ Fold03: int [1:4141] 1 3 4 5 6 7 8 9 10 11 ...
+ $ Fold04: int [1:4141] 2 3 4 5 6 7 8 9 10 11 ...
+ $ Fold05: int [1:4141] 1 2 3 4 5 6 7 8 9 10 ...
+ $ Fold06: int [1:4141] 1 2 3 4 5 6 7 8 9 10 ...
+ $ Fold07: int [1:4141] 1 2 3 4 5 6 8 9 11 12 ...
+ $ Fold08: int [1:4141] 1 2 3 4 5 6 7 8 9 10 ...
+ $ Fold09: int [1:4141] 1 2 4 5 6 7 10 11 12 13 ...
+ $ Fold10: int [1:4141] 1 2 3 5 6 7 8 9 10 11 ...
+# return the test set indices instead
# note: returnTrain = FALSE is unnecessary as it is the default behavior
folds.test <- createFolds(y=spam$type,k=10,list=TRUE,returnTrain=FALSE)
-str(folds.test)
-## List of 10
-## $ Fold01: int [1:460] 15 16 18 40 45 62 68 81 82 102 ...
-## $ Fold02: int [1:459] 1 41 55 58 67 75 117 123 151 175 ...
-## $ Fold03: int [1:461] 3 14 66 69 70 80 90 112 115 135 ...
-## $ Fold04: int [1:460] 5 19 25 65 71 83 85 88 91 93 ...
-## $ Fold05: int [1:460] 6 10 17 21 26 56 57 104 107 116 ...
-## $ Fold06: int [1:459] 7 8 13 39 52 54 76 89 99 106 ...
-## $ Fold07: int [1:461] 4 23 27 29 32 33 34 38 49 51 ...
-## $ Fold08: int [1:460] 2 9 30 31 36 37 43 46 47 48 ...
-## $ Fold09: int [1:461] 12 20 24 44 53 59 60 64 84 98 ...
-## $ Fold10: int [1:460] 11 22 28 35 42 61 72 86 92 118 ...
-# return first 10 elements of the first training set
-folds[[1]][1:10]
-## [1] 1 2 3 4 6 7 8 9 10 12
+str(folds.test)
+List of 10
+ $ Fold01: int [1:460] 15 16 18 40 45 62 68 81 82 102 ...
+ $ Fold02: int [1:459] 1 41 55 58 67 75 117 123 151 175 ...
+ $ Fold03: int [1:461] 3 14 66 69 70 80 90 112 115 135 ...
+ $ Fold04: int [1:460] 5 19 25 65 71 83 85 88 91 93 ...
+ $ Fold05: int [1:460] 6 10 17 21 26 56 57 104 107 116 ...
+ $ Fold06: int [1:459] 7 8 13 39 52 54 76 89 99 106 ...
+ $ Fold07: int [1:461] 4 23 27 29 32 33 34 38 49 51 ...
+ $ Fold08: int [1:460] 2 9 30 31 36 37 43 46 47 48 ...
+ $ Fold09: int [1:461] 12 20 24 44 53 59 60 64 84 98 ...
+ $ Fold10: int [1:460] 11 22 28 35 42 61 72 86 92 118 ...
+# return first 10 elements of the first training set
+folds[[1]][1:10]
+ [1] 1 2 3 4 6 7 8 9 10 12
createResample(y=data$var, times=10, list=TRUE)
= create 10 resamplings from the given data with replacement
@@ -674,21 +748,21 @@ Data Slicing
times=10
= number of samples to create
-# create 10 resamples
+# create 10 resamples
resamples <- createResample(y=spam$type,times=10,list=TRUE)
# structure of the resamples (note some samples are repeated)
-str(resamples)
-## List of 10
-## $ Resample01: int [1:4601] 1 4 4 4 7 8 12 13 13 14 ...
-## $ Resample02: int [1:4601] 3 3 5 7 10 12 12 13 13 14 ...
-## $ Resample03: int [1:4601] 1 2 2 3 4 5 8 10 11 12 ...
-## $ Resample04: int [1:4601] 1 3 3 4 7 8 8 9 10 14 ...
-## $ Resample05: int [1:4601] 2 4 5 6 7 7 8 8 9 12 ...
-## $ Resample06: int [1:4601] 3 6 6 7 8 9 12 13 13 14 ...
-## $ Resample07: int [1:4601] 1 2 2 5 5 6 7 8 9 10 ...
-## $ Resample08: int [1:4601] 2 2 3 4 4 7 7 8 8 9 ...
-## $ Resample09: int [1:4601] 1 4 7 8 8 9 12 13 15 15 ...
-## $ Resample10: int [1:4601] 1 3 4 4 7 7 9 9 10 11 ...
+str(resamples)
+List of 10
+ $ Resample01: int [1:4601] 1 4 4 4 7 8 12 13 13 14 ...
+ $ Resample02: int [1:4601] 3 3 5 7 10 12 12 13 13 14 ...
+ $ Resample03: int [1:4601] 1 2 2 3 4 5 8 10 11 12 ...
+ $ Resample04: int [1:4601] 1 3 3 4 7 8 8 9 10 14 ...
+ $ Resample05: int [1:4601] 2 4 5 6 7 7 8 8 9 12 ...
+ $ Resample06: int [1:4601] 3 6 6 7 8 9 12 13 13 14 ...
+ $ Resample07: int [1:4601] 1 2 2 5 5 6 7 8 9 10 ...
+ $ Resample08: int [1:4601] 2 2 3 4 4 7 7 8 8 9 ...
+ $ Resample09: int [1:4601] 1 4 7 8 8 9 12 13 15 15 ...
+ $ Resample10: int [1:4601] 1 3 4 4 7 7 9 9 10 11 ...
createTimeSlices(y=data, initialWindow=20, horizon=10)
= creates training sets with specified window length and the corresponding test sets
@@ -701,32 +775,30 @@ Data Slicing
-# create time series data
+# create time series data
tme <- 1:1000
# create time slices
folds <- createTimeSlices(y=tme,initialWindow=20,horizon=10)
# name of lists
-names(folds)
-## [1] "train" "test"
-# first training set
-folds$train[[1]]
-## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
-# first test set
-folds$test[[1]]
-## [1] 21 22 23 24 25 26 27 28 29 30
+names(folds)
+[1] "train" "test"
+# first training set
+folds$train[[1]]
+ [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+# first test set
+folds$test[[1]]
+ [1] 21 22 23 24 25 26 27 28 29 30
train(y ~ x, data=df, method="glm")
= function to apply the machine learning algorithm to construct model from training data
# returns the arguments of the default train function
-args(train.default)
-## function (x, y, method = "rf", preProcess = NULL, ..., weights = NULL,
-## metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric ==
-## "RMSE", FALSE, TRUE), trControl = trainControl(), tuneGrid = NULL,
-## tuneLength = 3)
-## NULL
+# returns the arguments of the default train function
+# args(train.default) <- this doesn't work anymore in caret
+args(train)
function (x, ...)
+NULL
train
function has a large set of parameters, below are the default options
tuneLength=3
# returns the default arguments for the trainControl object
-args(trainControl)
-## function (method = "boot", number = ifelse(grepl("cv", method),
-## 10, 25), repeats = ifelse(grepl("cv", method), 1, number),
-## p = 0.75, initialWindow = NULL, horizon = 1, fixedWindow = TRUE,
-## verboseIter = FALSE, returnData = TRUE, returnResamp = "final",
-## savePredictions = FALSE, classProbs = FALSE, summaryFunction = defaultSummary,
-## selectionFunction = "best", preProcOptions = list(thresh = 0.95,
-## ICAcomp = 3, k = 5), index = NULL, indexOut = NULL, timingSamps = 0,
-## predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5,
-## alpha = 0.05, method = "gls", complete = TRUE), trim = FALSE,
-## allowParallel = TRUE)
-## NULL
+# returns the default arguments for the trainControl object
+args(trainControl)
function (method = "boot", number = ifelse(grepl("cv", method),
+ 10, 25), repeats = ifelse(grepl("cv", method), 1, number),
+ p = 0.75, search = "grid", initialWindow = NULL, horizon = 1,
+ fixedWindow = TRUE, skip = 0, verboseIter = FALSE, returnData = TRUE,
+ returnResamp = "final", savePredictions = FALSE, classProbs = FALSE,
+ summaryFunction = defaultSummary, selectionFunction = "best",
+ preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5,
+ freqCut = 95/5, uniqueCut = 10, cutoff = 0.9), sampling = NULL,
+ index = NULL, indexOut = NULL, indexFinal = NULL, timingSamps = 0,
+ predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5,
+ alpha = 0.05, method = "gls", complete = TRUE), trim = FALSE,
+ allowParallel = TRUE)
+NULL
trainControl
creates an object that sets many options for how the model will be applied to the training data
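For example, a minimal sketch (assuming the caret package and the spam training set created above) of passing a trainControl object to train in order to switch from the default bootstrap to 10-fold cross validation:
# set up 10-fold cross validation instead of the default bootstrap resampling
ctrl <- trainControl(method="cv", number=10)
# pass the control object to train through the trControl argument
modelFit <- train(type ~ ., data=training, method="glm", trControl=ctrl)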
# load relevant libraries
+# load relevant libraries
library(ISLR); library(ggplot2);
# load wage data
data(Wage)
@@ -823,8 +897,8 @@ Plotting Predictors
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
# plot relationships between the predictors and outcome
-featurePlot(x=training[,c("age","education","jobclass")], y = training$wage,plot="pairs")
-
+featurePlot(x=training[,c("age","education","jobclass")], y = training$wage,plot="pairs")
+
qplot(age, wage, color=education, data=training)
= can be used to separate data by a factor variable (by coloring the points differently)
@@ -832,9 +906,9 @@ Plotting Predictors
# qplot plus linear regression lines
-qplot(age,wage,colour=education,data=training)+geom_smooth(method='lm',formula=y~x)
-
+# qplot plus linear regression lines
+qplot(age,wage,colour=education,data=training)+geom_smooth(method='lm',formula=y~x)
+
cut2(variable, g=3)
= creates a new factor variable by cutting the specified variable into n groups (3 in this case) based on percentiles
@@ -846,7 +920,7 @@ Plotting Predictors
# load Hmisc and gridExtra packages
+# load Hmisc and gridExtra packages
library(Hmisc);library(gridExtra);
# cut the wage variable
cutWage <- cut2(training$wage,g=3)
@@ -857,8 +931,8 @@ Plotting Predictors
p2 <- qplot(cutWage,age, data=training,fill=cutWage,
geom=c("boxplot","jitter"))
# plot the two graphs side by side
-grid.arrange(p1,p2,ncol=2)
-
+grid.arrange(p1,p2,ncol=2)
+
table(cutVariable, data$var2)
= tabulates the cut factor variable vs another variable in the dataset (ie; builds a contingency table using cross-classifying factors)
prop.table(table, margin=1)
= converts a table to a proportion table
@@ -867,22 +941,22 @@ Plotting Predictors
# tabulate the cutWage and jobclass variables
+# tabulate the cutWage and jobclass variables
t <- table(cutWage,training$jobclass)
# print table
-t
-##
-## cutWage 1. Industrial 2. Information
-## [ 20.1, 92.7) 445 256
-## [ 92.7,119.1) 376 347
-## [119.1,318.3] 269 409
-# convert to proportion table based on the rows
-prop.table(t,1)
-##
-## cutWage 1. Industrial 2. Information
-## [ 20.1, 92.7) 0.6348074 0.3651926
-## [ 92.7,119.1) 0.5200553 0.4799447
-## [119.1,318.3] 0.3967552 0.6032448
+t
+
+cutWage 1. Industrial 2. Information
+ [ 20.1, 92.7) 445 256
+ [ 92.7,119.1) 376 347
+ [119.1,318.3] 269 409
+# convert to proportion table based on the rows
+prop.table(t,1)
+
+cutWage 1. Industrial 2. Information
+ [ 20.1, 92.7) 0.6348074 0.3651926
+ [ 92.7,119.1) 0.5200553 0.4799447
+ [119.1,318.3] 0.3967552 0.6032448
qplot(var1, color=var2, data=training, geom="density")
= produces density plot for the given numeric and factor variables
@@ -893,20 +967,20 @@ Plotting Predictors
# produce density plot
-qplot(wage,colour=education,data=training,geom="density")
-
+# produce density plot
+qplot(wage,colour=education,data=training,geom="density")
+
the preProcess function can be saved as an object and applied to the train and test sets using the predict function
+# load spam data
data(spam)
# create train and test sets
inTrain <- createDataPartition(y=spam$type,p=0.75, list=FALSE)
@@ -939,10 +1013,10 @@ Preprocessing (
testCapAveS <- predict(preObj,testing[,-58])$capitalAve
# compare results for capitalAve variable
rbind(train = c(mean = mean(trainCapAveS), std = sd(trainCapAveS)),
- test = c(mean(testCapAveS), sd(testCapAveS)))
-## mean std
-## train 6.097035e-18 1.000000
-## test 7.548133e-02 1.633866
+ test = c(mean(testCapAveS), sd(testCapAveS)))
+ mean std
+train 6.097035e-18 1.000000
+test 7.548133e-02 1.633866
preProcess(data, method="BoxCox")
= applies BoxCox transformations to continuous data to help normalize the variables through maximum likelihood
-# set up BoxCox transforms
+# set up BoxCox transforms
preObj <- preProcess(training[,-58],method=c("BoxCox"))
# perform preprocessing on training data
trainCapAveS <- predict(preObj,training[,-58])$capitalAve
# plot histogram and QQ Plot
# Note: the transformation definitely helped to
# normalize the data but it does not produce perfect result
-par(mfrow=c(1,2)); hist(trainCapAveS); qqnorm(trainCapAveS)
-
+par(mfrow=c(1,2)); hist(trainCapAveS); qqnorm(trainCapAveS)
+
preProcess(data, method="knnImpute")
= impute/estimate the missing data using k nearest neighbors (knn) imputation
@@ -966,7 +1040,7 @@ Preprocessing (
- Note: most prediction algorithms are not built to handle missing data
-# Make some values NA
+# Make some values NA
training$capAve <- training$capitalAve
selectNA <- rbinom(dim(training)[1],size=1,prob=0.05)==1
training$capAve[selectNA] <- NA
@@ -977,10 +1051,10 @@ Preprocessing (
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth-mean(capAveTruth))/sd(capAveTruth)
# compute differences between imputed values and true values
-quantile(capAve - capAveTruth)
-## 0% 25% 50% 75% 100%
-## -1.656344e+00 2.377772e-05 1.286900e-03 1.881653e-03 3.174413e-01
-\(\pagebreak\)
+quantile(capAve - capAveTruth)
+ 0% 25% 50% 75% 100%
+-1.656344e+00 2.377772e-05 1.286900e-03 1.880821e-03 3.174413e-01
+\(\pagebreak\)
dummyVars(outcome~var, data=training)
= creates a dummy variable object that can be used through predict
function to create dummy variables
predict(dummyObj, newdata=training)
= creates appropriate columns to represent the factor variable with appropriate 0s and 1s
# setting up data
+# setting up data
inTrain <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]
# create a dummy variable object
dummies <- dummyVars(wage ~ jobclass,data=training)
# create the dummy variable columns
-head(predict(dummies,newdata=training))
-## jobclass.1. Industrial jobclass.2. Information
-## 231655 1 0
-## 86582 0 1
-## 161300 1 0
-## 155159 0 1
-## 11443 0 1
-## 376662 0 1
+head(predict(dummies,newdata=training))
+ jobclass.1. Industrial jobclass.2. Information
+231655 1 0
+86582 0 1
+161300 1 0
+155159 0 1
+11443 0 1
+376662 0 1
nzv
= TRUE, those variables should be thrown out
# print nearZeroVar table
-nearZeroVar(training,saveMetrics=TRUE)
-## freqRatio percentUnique zeroVar nzv
-## year 1.017647 0.33301618 FALSE FALSE
-## age 1.231884 2.85442436 FALSE FALSE
-## sex 0.000000 0.04757374 TRUE TRUE
-## maritl 3.329571 0.23786870 FALSE FALSE
-## race 8.480583 0.19029496 FALSE FALSE
-## education 1.393750 0.23786870 FALSE FALSE
-## region 0.000000 0.04757374 TRUE TRUE
-## jobclass 1.070936 0.09514748 FALSE FALSE
-## health 2.526846 0.09514748 FALSE FALSE
-## health_ins 2.209160 0.09514748 FALSE FALSE
-## logwage 1.011765 18.83920076 FALSE FALSE
-## wage 1.011765 18.83920076 FALSE FALSE
+# print nearZeroVar table
+nearZeroVar(training,saveMetrics=TRUE)
freqRatio percentUnique zeroVar nzv
+year 1.017647 0.33301618 FALSE FALSE
+age 1.231884 2.85442436 FALSE FALSE
+sex 0.000000 0.04757374 TRUE TRUE
+maritl 3.329571 0.23786870 FALSE FALSE
+race 8.480583 0.19029496 FALSE FALSE
+education 1.393750 0.23786870 FALSE FALSE
+region 0.000000 0.04757374 TRUE TRUE
+jobclass 1.070936 0.09514748 FALSE FALSE
+health 2.526846 0.09514748 FALSE FALSE
+health_ins 2.209160 0.09514748 FALSE FALSE
+logwage 1.011765 18.83920076 FALSE FALSE
+wage 1.011765 18.83920076 FALSE FALSE
gam()
function can also be used and it allows for smoothing of multiple variables with different values for each variable (a gam() sketch follows the splines example below)
- Note: the same basis must be created for the test set using the predict
function
+# load splines package
library(splines)
# create polynomial function
bsBasis <- bs(training$age,df=3)
@@ -1095,17 +1169,17 @@ Creating Splines (Polynomial Functions)
# plot all age vs wage data
plot(training$age,training$wage,pch=19,cex=0.5)
# plot the fitted polynomial function
-points(training$age,predict(lm1,newdata=training),col="red",pch=19,cex=0.5)
-
-# predict on test values
-head(predict(bsBasis,age=testing$age))
-## 1 2 3
-## [1,] 0.0000000 0.00000000 0.000000000
-## [2,] 0.2368501 0.02537679 0.000906314
-## [3,] 0.4163380 0.32117502 0.082587862
-## [4,] 0.4308138 0.29109043 0.065560908
-## [5,] 0.3625256 0.38669397 0.137491189
-## [6,] 0.3063341 0.42415495 0.195763821
+points(training$age,predict(lm1,newdata=training),col="red",pch=19,cex=0.5)
+
+# predict on test values
+head(predict(bsBasis,age=testing$age))
+ 1 2 3
+[1,] 0.0000000 0.00000000 0.000000000
+[2,] 0.2368501 0.02537679 0.000906314
+[3,] 0.4163380 0.32117502 0.082587862
+[4,] 0.4308138 0.29109043 0.065560908
+[5,] 0.3625256 0.38669397 0.137491189
+[6,] 0.3063341 0.42415495 0.195763821
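As a complement to the splines example above, a minimal sketch of the gam() smoothing mentioned earlier (this block and its use of the mgcv package are my illustration; it assumes the Wage training set created above):
# fit a generalized additive model with a smooth term on age (mgcv package assumed)
library(mgcv)
gamFit <- gam(wage ~ s(age), data=training)
# overlay the smoothed fit on the raw age vs wage data
plot(training$age, training$wage, pch=19, cex=0.5)
points(training$age, predict(gamFit, newdata=training), col="red", pch=19, cex=0.5)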
\(\pagebreak\)
+\(\pagebreak\)
prcomp
Functionpr<-prcomp(data)
= performs PCA on all variables and returns a prcomp
object that contains information about standard deviations and rotations
pr$rotation
= returns eigenvectors for the linear combinations of all variables (coefficients that variables are multiplied by to come up with the principal components) \(\rightarrow\) how the principal components are created
pr$rotation
= returns eigenvectors for the linear combinations of all variables (coefficients that variables are multiplied by to come up with the principal components) \(\rightarrow\) how the principal components are created
transformation of the variables and adding 1 before performing PCA
prcomp
Function# load spam data
+# load spam data
data(spam)
# perform PCA on dataset
prComp <- prcomp(log10(spam[,-58]+1))
# print out the eigenvector/rotations first 5 rows and PCs
-head(prComp$rotation[, 1:5], 5)
-## PC1 PC2 PC3 PC4 PC5
-## make 0.019370409 0.0427855959 -0.01631961 0.02798232 -0.014903314
-## address 0.010827343 0.0408943785 0.07074906 -0.01407049 0.037237531
-## all 0.040923168 0.0825569578 -0.03603222 0.04563653 0.001222215
-## num3d 0.006486834 -0.0001333549 0.01234374 -0.01005991 -0.001282330
-## our 0.036963221 0.0941456085 -0.01871090 0.05098463 -0.010582039
-# create new variable that marks spam as 2 and nospam as 1
+head(prComp$rotation[, 1:5], 5)
+ PC1 PC2 PC3 PC4 PC5
+make 0.019370409 0.0427855959 -0.01631961 0.02798232 -0.014903314
+address 0.010827343 0.0408943785 0.07074906 -0.01407049 0.037237531
+all 0.040923168 0.0825569578 -0.03603222 0.04563653 0.001222215
+num3d 0.006486834 -0.0001333549 0.01234374 -0.01005991 -0.001282330
+our 0.036963221 0.0941456085 -0.01871090 0.05098463 -0.010582039
+# create new variable that marks spam as 2 and nospam as 1
typeColor <- ((spam$type=="spam")*1 + 1)
# plot the first two principal components
-plot(prComp$x[,1],prComp$x[,2],col=typeColor,xlab="PC1",ylab="PC2")
-
+plot(prComp$x[,1],prComp$x[,2],col=typeColor,xlab="PC1",ylab="PC2")
+
caret Package
+# create train and test sets
inTrain <- createDataPartition(y=spam$type,p=0.75, list=FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
@@ -1209,117 +1283,120 @@ caret
Package
preProc <- preProcess(log10(training[,-58]+1),method="pca",pcaComp=2)
# calculate PCs for training data
trainPC <- predict(preProc,log10(training[,-58]+1))
+# add variable `type` to trainPC
+type <- training$type
+trainPC <- data.frame(trainPC, type) # join variable `type`
# run model on outcome and principal components
-modelFit <- train(training$type ~ .,method="glm",data=trainPC)
+modelFit <- train(type ~ .,method="glm",data=trainPC)
# calculate PCs for test data
testPC <- predict(preProc,log10(testing[,-58]+1))
# compare results
-confusionMatrix(testing$type,predict(modelFit,testPC))
-## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction nonspam spam
-## nonspam 656 41
-## spam 82 371
-##
-## Accuracy : 0.893
-## 95% CI : (0.8737, 0.9103)
-## No Information Rate : 0.6417
-## P-Value [Acc > NIR] : < 2.2e-16
-##
-## Kappa : 0.7724
-## Mcnemar's Test P-Value : 0.0003101
-##
-## Sensitivity : 0.8889
-## Specificity : 0.9005
-## Pos Pred Value : 0.9412
-## Neg Pred Value : 0.8190
-## Prevalence : 0.6417
-## Detection Rate : 0.5704
-## Detection Prevalence : 0.6061
-## Balanced Accuracy : 0.8947
-##
-## 'Positive' Class : nonspam
-##
+confusionMatrix(testing$type,predict(modelFit,testPC))
+Confusion Matrix and Statistics
+
+ Reference
+Prediction nonspam spam
+ nonspam 656 41
+ spam 82 371
+
+ Accuracy : 0.893
+ 95% CI : (0.8737, 0.9103)
+ No Information Rate : 0.6417
+ P-Value [Acc > NIR] : < 2.2e-16
+
+ Kappa : 0.7724
+ Mcnemar's Test P-Value : 0.0003101
+
+ Sensitivity : 0.8889
+ Specificity : 0.9005
+ Pos Pred Value : 0.9412
+ Neg Pred Value : 0.8190
+ Prevalence : 0.6417
+ Detection Rate : 0.5704
+ Detection Prevalence : 0.6061
+ Balanced Accuracy : 0.8947
+
+ 'Positive' Class : nonspam
+
- alternatively, PCA can be directly performed with the
train
method
train(outcome ~ ., method="glm", preProcess="pca", data=training)
= performs PCA first on the training set and then runs the specified model
-- effectively the same procedures as above (
preProcess
\(\rightarrow\) predict
)
+- effectively the same procedures as above (
preProcess
\(\rightarrow\) predict
)
- Note: in both cases, the PCs were able to achieve 90+% accuracy
-# construct model
-modelFit <- train(training$type ~ .,method="glm",preProcess="pca",data=training)
+# construct model
+modelFit <- train(type ~ .,method="glm",preProcess="pca",data=training)
# print results of model
-confusionMatrix(testing$type,predict(modelFit,testing))
-## Confusion Matrix and Statistics
-##
-## Reference
-## Prediction nonspam spam
-## nonspam 668 29
-## spam 59 394
-##
-## Accuracy : 0.9235
-## 95% CI : (0.9066, 0.9382)
-## No Information Rate : 0.6322
-## P-Value [Acc > NIR] : < 2.2e-16
-##
-## Kappa : 0.8379
-## Mcnemar's Test P-Value : 0.001992
-##
-## Sensitivity : 0.9188
-## Specificity : 0.9314
-## Pos Pred Value : 0.9584
-## Neg Pred Value : 0.8698
-## Prevalence : 0.6322
-## Detection Rate : 0.5809
-## Detection Prevalence : 0.6061
-## Balanced Accuracy : 0.9251
-##
-## 'Positive' Class : nonspam
-##
-\(\pagebreak\)
+confusionMatrix(testing$type,predict(modelFit,testing))
+Confusion Matrix and Statistics
+
+ Reference
+Prediction nonspam spam
+ nonspam 668 29
+ spam 59 394
+
+ Accuracy : 0.9235
+ 95% CI : (0.9066, 0.9382)
+ No Information Rate : 0.6322
+ P-Value [Acc > NIR] : < 2.2e-16
+
+ Kappa : 0.8379
+ Mcnemar's Test P-Value : 0.001992
+
+ Sensitivity : 0.9188
+ Specificity : 0.9314
+ Pos Pred Value : 0.9584
+ Neg Pred Value : 0.8698
+ Prevalence : 0.6322
+ Detection Rate : 0.5809
+ Detection Prevalence : 0.6061
+ Balanced Accuracy : 0.9251
+
+ 'Positive' Class : nonspam
+
+\(\pagebreak\)
lm<-lm(y ~ x, data=train)
= runs a linear model of outcome y on predictor x \(\rightarrow\) univariate regression
+lm<-lm(y ~ x, data=train)
= runs a linear model of outcome y on predictor x \(\rightarrow\) univariate regression
summary(lm)
= returns summary of the linear regression model, which will include coefficients, standard errors, \(t\) statistics, and p values
summary(lm)
= returns summary of the linear regression model, which will include coefficients, standard errors, \(t\) statistics, and p values
lm(y ~ x1+x2+x3, data=train)
= run linear model of outcome y on predictors x1, x2, and x3
lm(y ~ ., data=train)
= run linear model of outcome y on all predictors
predict(lm, newdata=df)
= use the constructed linear model to predict outcomes (\(\hat Y_i\)) for the new values
+predict(lm, newdata=df)
= use the constructed linear model to predict outcomes (\(\hat Y_i\)) for the new values
newdata
data frame must have the same variables (factors must have the same levels) as the training data
newdata=test
= predict outcomes for the test set based on linear regression model from the training# load data
+# load data
data(faithful)
# create train and test sets
inTrain <- createDataPartition(y=faithful$waiting, p=0.5, list=FALSE)
@@ -1338,31 +1415,31 @@ R Commands and Examples
# build linear model
lm1 <- lm(eruptions ~ waiting,data=trainFaith)
# print summary of linear model
-summary(lm1)
-##
-## Call:
-## lm(formula = eruptions ~ waiting, data = trainFaith)
-##
-## Residuals:
-## Min 1Q Median 3Q Max
-## -1.24867 -0.36292 0.00002 0.35768 1.19858
-##
-## Coefficients:
-## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -2.165648 0.227486 -9.52 <2e-16 ***
-## waiting 0.079396 0.003146 25.24 <2e-16 ***
-## ---
-## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Residual standard error: 0.5013 on 135 degrees of freedom
-## Multiple R-squared: 0.8251, Adjusted R-squared: 0.8238
-## F-statistic: 636.9 on 1 and 135 DF, p-value: < 2.2e-16
-# predict eruptions for new waiting time
+summary(lm1)
+
+Call:
+lm(formula = eruptions ~ waiting, data = trainFaith)
+
+Residuals:
+ Min 1Q Median 3Q Max
+-1.30246 -0.40746 0.03955 0.40465 1.19221
+
+Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) -1.858966 0.237636 -7.823 1.33e-12 ***
+waiting 0.075444 0.003287 22.952 < 2e-16 ***
+---
+Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
+Residual standard error: 0.5272 on 135 degrees of freedom
+Multiple R-squared: 0.796, Adjusted R-squared: 0.7945
+F-statistic: 526.8 on 1 and 135 DF, p-value: < 2.2e-16
+# predict eruptions for new waiting time
newdata <- data.frame(waiting=80)
-predict(lm1,newdata)
-## 1
-## 4.186
-# create 1 x 2 panel plot
+predict(lm1,newdata)
+ 1
+4.176566
+# create 1 x 2 panel plot
par(mfrow=c(1,2))
# plot train data with the regression line
plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",
@@ -1371,30 +1448,30 @@ R Commands and Examples
# plot test data with the regression line
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue",xlab="Waiting",
ylab="Duration", main = "Test")
-lines(testFaith$waiting,predict(lm1,newdata=testFaith),lwd=3)
-
-# Calculate RMSE on training and test sets
+lines(testFaith$waiting,predict(lm1,newdata=testFaith),lwd=3)
+
+# Calculate RMSE on training and test sets
c(trainRMSE = sqrt(sum((lm1$fitted-trainFaith$eruptions)^2)),
- testRMSE = sqrt(sum((predict(lm1,newdata=testFaith)-testFaith$eruptions)^2)))
-## trainRMSE testRMSE
-## 5.824859 5.788547
+ testRMSE = sqrt(sum((predict(lm1,newdata=testFaith)-testFaith$eruptions)^2)))
+trainRMSE testRMSE
+ 6.125867 5.388723
pi<-predict(lm, newdata=test, interval="prediction")
= returns 3 columns for fit
(predicted value, same as before), lwr
(lower bound of prediction interval), and upr
(upper bound of prediction interval)
matlines(x, pi, type="l")
= plots three lines, one for the linear fit and two for upper/lower prediction interval bounds
-# calculate prediction interval
+# calculate prediction interval
pred1 <- predict(lm1,newdata=testFaith,interval="prediction")
# plot data points (eruptions, waiting)
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue")
# plot fit line and prediction interval
-matlines(testFaith$waiting,pred1,type="l",,col=c(1,2,2),lty = c(1,1,1), lwd=3)
-
+matlines(testFaith$waiting,pred1,type="l",,col=c(1,2,2),lty = c(1,1,1), lwd=3)
+
-lm <- train(y ~ x, method="lm", data=train)
= run linear model on the training data \(\rightarrow\) identical to lm
function
+lm <- train(y ~ x, method="lm", data=train)
= run linear model on the training data \(\rightarrow\) identical to lm
function
-summary(lm$finalModel)
= returns summary of the linear regression model, which will include coefficients, standard errors, \(t\) statistics, and p values \(\rightarrow\) identical to summary(lm)
for a lm
object
+summary(lm$finalModel)
= returns summary of the linear regression model, which will include coefficients, standard errors, \(t\) statistics, and p values \(\rightarrow\) identical to summary(lm)
for a lm
object
train(y ~ ., method="lm", data=train)
= run linear model on all predictors in training data
- multiple predictors (dummy/indicator variables) are created for factor variables
@@ -1409,7 +1486,7 @@ R Commands and Examples
-# create train and test sets
+# create train and test sets
inTrain <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
training <- Wage[inTrain,]; testing <- Wage[-inTrain,]
# fit linear model for age jobclass and education
@@ -1419,14 +1496,14 @@ R Commands and Examples
# set up 2 x 2 panel plot
par(mfrow = c(2, 2))
# construct diagnostic plots for model
-plot(finMod,pch=19,cex=0.5,col="#00000010")
-
+plot(finMod,pch=19,cex=0.5,col="#00000010")
+
- plotting residuals by fitted values and coloring with a variable not used in the model helps spot a trend in that variable.
-# plot fitted values by residuals
-qplot(finMod$fitted, finMod$residuals, color=race, data=training)
-
+# plot fitted values by residuals
+qplot(finMod$fitted, finMod$residuals, color=race, data=training)
+
- plotting residuals by index (ie; row numbers) can be helpful in showing missing variables
@@ -1437,24 +1514,24 @@ R Commands and Examples
-# plot residual by index
-plot(finMod$residuals,pch=19,cex=0.5)
-
+# plot residual by index
+plot(finMod$residuals,pch=19,cex=0.5)
+
- here the residuals increase linearly with the index, and the highest residuals are concentrated in the higher indexes, so there must be a missing variable
-\(\pagebreak\)
+\(\pagebreak\)
\[\hat{p}_{mk} = \frac{\sum_{i}^m \mathbb{1}(y_i = k)}{N_m}\]
+\[\hat{p}_{mk} = \frac{\sum_{i}^m \mathbb{1}(y_i = k)}{N_m}\]
\(N_m\) is the size of the group
\(N_m\) is the size of the group
example
# set margin and seed
+# set margin and seed
par(mar=c(1,1,1,1), mfrow = c(1, 2)); set.seed(1234);
# simulate data
x = rep(1:4,each=4); y = rep(1:4,4)
# plot first scenario
plot(x,y,xaxt="n",yaxt="n",cex=3,col=c(rep("blue",15),rep("red",1)),pch=19)
# plot second scenario
-plot(x,y,xaxt="n",yaxt="n",cex=3,col=c(rep("blue",8),rep("red",8)),pch=19)
-
+plot(x,y,xaxt="n",yaxt="n",cex=3,col=c(rep("blue",8),rep("red",8)),pch=19)
+
- left graph
-- Misclassification: \(\frac{1}{16} = 0.06\)
-- Gini: \(1 - [(\frac{1}{16})^2 + (\frac{15}{16})^2] = 0.12\)
-- Information:\(-[\frac{1}{16} \times \log_2 (\frac{1}{16})+ \frac{15}{16} \times \log_2(\frac{1}{16})] = 0.34\)
+- Misclassification: \(\frac{1}{16} = 0.06\)
+- Gini: \(1 - [(\frac{1}{16})^2 + (\frac{15}{16})^2] = 0.12\)
+- Information: \(-[\frac{1}{16} \times \log_2 (\frac{1}{16})+ \frac{15}{16} \times \log_2(\frac{15}{16})] = 0.34\)
- right graph
-- Misclassification: \(\frac{8}{16} = 0.5\)
-- Gini: \(1 - [(\frac{8}{16})^2 + (\frac{8}{16})^2] = 0.5\)
-- Information:\(-[\frac{8}{16} \times \log_2 (\frac{8}{16})+ \frac{8}{16} \times \log_2(\frac{8}{16})] = 1\)
+- Misclassification: \(\frac{8}{16} = 0.5\)
+- Gini: \(1 - [(\frac{8}{16})^2 + (\frac{8}{16})^2] = 0.5\)
+- Information:\(-[\frac{8}{16} \times \log_2 (\frac{8}{16})+ \frac{8}{16} \times \log_2(\frac{8}{16})] = 1\)
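A quick numeric check of the left-graph values above (a small sketch I added; the vector p of class proportions is the only input):
# class proportions for the left graph: 1 misclassified point out of 16
p <- c(1, 15)/16
# misclassification error, Gini index, and information (in bits)
round(c(misclassification = 1 - max(p), gini = 1 - sum(p^2), information = -sum(p * log2(p))), 2)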
Constructing Trees with caret Package
[rattle package] fancyRpartPlot(tree$finalModel)
= produces more readable, better formatted classification tree diagrams
+# load iris data set
data(iris)
# create test/train data sets
inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
@@ -1554,31 +1631,31 @@ Constructing Trees with caret
Package
# fit classification tree as a model
modFit <- train(Species ~ .,method="rpart",data=training)
# print the classification tree
-print(modFit$finalModel)
-## n= 105
-##
-## node), split, n, loss, yval, (yprob)
-## * denotes terminal node
-##
-## 1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)
-## 2) Petal.Length< 2.45 35 0 setosa (1.00000000 0.00000000 0.00000000) *
-## 3) Petal.Length>=2.45 70 35 versicolor (0.00000000 0.50000000 0.50000000)
-## 6) Petal.Width< 1.65 34 1 versicolor (0.00000000 0.97058824 0.02941176) *
-## 7) Petal.Width>=1.65 36 2 virginica (0.00000000 0.05555556 0.94444444) *
-# plot the classification tree
-rattle::fancyRpartPlot(modFit$finalModel)
-
-# predict on test values
-predict(modFit,newdata=testing)
-## [1] setosa setosa setosa setosa setosa setosa
-## [7] setosa setosa setosa setosa setosa setosa
-## [13] setosa setosa setosa versicolor versicolor versicolor
-## [19] versicolor versicolor versicolor versicolor versicolor versicolor
-## [25] versicolor versicolor versicolor versicolor versicolor versicolor
-## [31] virginica virginica virginica virginica virginica virginica
-## [37] versicolor virginica virginica versicolor versicolor virginica
-## [43] virginica virginica virginica
-## Levels: setosa versicolor virginica
+print(modFit$finalModel)
+n= 105
+
+node), split, n, loss, yval, (yprob)
+ * denotes terminal node
+
+1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)
+ 2) Petal.Length< 2.45 35 0 setosa (1.00000000 0.00000000 0.00000000) *
+ 3) Petal.Length>=2.45 70 35 versicolor (0.00000000 0.50000000 0.50000000)
+ 6) Petal.Width< 1.65 34 1 versicolor (0.00000000 0.97058824 0.02941176) *
+ 7) Petal.Width>=1.65 36 2 virginica (0.00000000 0.05555556 0.94444444) *
+# plot the classification tree
+rattle::fancyRpartPlot(modFit$finalModel)
+
+# predict on test values
+predict(modFit,newdata=testing)
+ [1] setosa setosa setosa setosa setosa setosa
+ [7] setosa setosa setosa setosa setosa setosa
+[13] setosa setosa setosa versicolor versicolor versicolor
+[19] versicolor versicolor versicolor versicolor versicolor versicolor
+[25] versicolor versicolor versicolor versicolor versicolor versicolor
+[31] virginica virginica virginica virginica virginica virginica
+[37] versicolor virginica virginica versicolor versicolor virginica
+[43] virginica virginica virginica
+Levels: setosa versicolor virginica
@@ -1600,7 +1677,7 @@ Bagging
-# load data
+# load data
library(ElemStatLearn); data(ozone,package="ElemStatLearn")
# reorder rows based on ozone variable
ozone <- ozone[order(ozone$ozone),]
@@ -1622,8 +1699,8 @@ Bagging
# plot each prediction model
for(i in 1:10){lines(1:155,ll[i,],col="grey",lwd=2)}
# plot the average in red
-lines(1:155,apply(ll,2,mean),col="red",lwd=2)
-
+lines(1:155,apply(ll,2,mean),col="red",lwd=2)
+
Bagging Algorithms
@@ -1648,7 +1725,7 @@ Bagging Algorithms
example
-# load relevant package and data
+# load relevant package and data
library(party); data(ozone,package="ElemStatLearn")
# reorder rows based on ozone variable
ozone <- ozone[order(ozone$ozone),]
@@ -1667,9 +1744,9 @@ Bagging Algorithms
# plot the first fit
points(ozone$ozone,predict(treebag$fits[[1]]$fit,predictors),pch=19,col="red")
# plot the aggregated predictions
-points(ozone$ozone,predict(treebag,predictors),pch=19,col="blue")
-
-\(\pagebreak\)
+points(ozone$ozone,predict(treebag,predictors),pch=19,col="blue")
+
+\(\pagebreak\)
@@ -1680,7 +1757,7 @@ Random Forest
one of the most used/accurate algorithms along with boosting
-
+
- process
@@ -1707,7 +1784,7 @@ R Commands and Examples
rf$finalModel$prox
= returns matrix of proximities
ntree=500
= specify number of trees that should be constructed
-do.trace=TRUE
= prints logs as the trees are being built \(\rightarrow\) useful for indicating progress to the user
+do.trace=TRUE
= prints logs as the trees are being built \(\rightarrow\) useful for indicating progress to the user
- Note:
randomForest()
function can be used to perform the random forest algorithm (syntax is the same as train
) and is much faster (see the sketch after this list)
getTree(rf$finalModel, k=2)
= return specific tree from random forest model
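A minimal sketch of calling randomForest() directly (my illustration; it assumes the randomForest package and the iris training set created in the example below):
# fit a random forest directly with the randomForest package (faster than train)
library(randomForest)
set.seed(1234)
rfFit <- randomForest(Species ~ ., data=training, ntree=500, do.trace=FALSE)
# out-of-bag confusion matrix of the fitted forest
rfFit$confusion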
@@ -1725,7 +1802,7 @@ R Commands and Examples
example
-# load data
+# load data
data(iris)
# create train/test data sets
inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
@@ -1734,38 +1811,38 @@ R Commands and Examples
# apply random forest
modFit <- train(Species~ .,data=training,method="rf",prox=TRUE)
# return the second tree (first 6 rows)
-head(getTree(modFit$finalModel,k=2))
-## left daughter right daughter split var split point status prediction
-## 1 2 3 4 0.70 1 0
-## 2 0 0 0 0.00 -1 1
-## 3 4 5 4 1.75 1 0
-## 4 6 7 3 5.30 1 0
-## 5 0 0 0 0.00 -1 3
-## 6 8 9 3 4.95 1 0
-# compute cluster centers
+head(getTree(modFit$finalModel,k=2))
+ left daughter right daughter split var split point status prediction
+1 2 3 3 2.60 1 0
+2 0 0 0 0.00 -1 1
+3 4 5 4 1.65 1 0
+4 6 7 3 5.25 1 0
+5 8 9 3 4.85 1 0
+6 0 0 0 0.00 -1 2
+# compute cluster centers
irisP <- classCenter(training[,c(3,4)], training$Species, modFit$finalModel$prox)
# convert irisP to data frame and add Species column
irisP <- as.data.frame(irisP); irisP$Species <- rownames(irisP)
# plot data points
p <- qplot(Petal.Width, Petal.Length, col=Species,data=training)
# add the cluster centers
-p + geom_point(aes(x=Petal.Width,y=Petal.Length,col=Species),size=5,shape=4,data=irisP)
-
-# predict outcome for test data set using the random forest model
+p + geom_point(aes(x=Petal.Width,y=Petal.Length,col=Species),size=5,shape=4,data=irisP)
+
+# predict outcome for test data set using the random forest model
pred <- predict(modFit,testing)
# logic value for whether or not the rf algorithm predicted correctly
testing$predRight <- pred==testing$Species
# tabulate results
-table(pred,testing$Species)
-##
-## pred setosa versicolor virginica
-## setosa 15 0 0
-## versicolor 0 15 2
-## virginica 0 0 13
-# plot data points with the incorrect classification highlighted
-qplot(Petal.Width,Petal.Length,colour=predRight,data=testing,main="newdata Predictions")
-
-\(\pagebreak\)
+table(pred,testing$Species)
+
+pred setosa versicolor virginica
+ setosa 15 0 0
+ versicolor 0 14 1
+ virginica 0 1 14
+# plot data points with the incorrect classification highlighted
+qplot(Petal.Width,Petal.Length,colour=predRight,data=testing,main="newdata Predictions")
+
+\(\pagebreak\)
@@ -1774,28 +1851,28 @@ Boosting
boosting = one of the most widely used and accurate prediction models, along with random forest
boosting can be done with any set of classifiers, and a well-known approach is gradient boosting
a more detailed tutorial can be found here
-process: take a group of weak predictors \(\rightarrow\) weight them and add them up \(\rightarrow\) result in a stronger predictor
+ process: take a group of weak predictors \(\rightarrow\) weight them and add them up \(\rightarrow\) result in a stronger predictor
-- start with a set of classifiers \(h_1, \ldots, h_k\)
+
- start with a set of classifiers \(h_1, \ldots, h_k\)
- examples: all possible trees, all possible regression models, all possible cutoffs (divide data into different parts)
-- calculate a weighted sum of classifiers as the prediction value \[f(x) = \sum_i \alpha_i h_i(x)\] where \(\alpha_i\) = coefficient/weight and \(h_i(x)\) = value of classifier
+
- calculate a weighted sum of classifiers as the prediction value \[f(x) = \sum_i \alpha_i h_i(x)\] where \(\alpha_i\) = coefficient/weight and \(h_i(x)\) = value of classifier (see the toy sketch after this list)
- goal = minimize error (on training set)
-- select one \(h\) at each step (iterative)
+- select one \(h\) at each step (iterative)
- calculate weights based on errors
-- up-weight missed classifications and select next \(h\)
+- up-weight missed classifications and select next \(h\)
example
-
+
- we start with space with blue + and red - and the goal is to classify all the object correctly
- only straight lines will be used for classification
-

+

- from the above, we can see that a group of weak predictors (lines in this case), can be combined and weighed to become a much stronger predictor
@@ -1816,7 +1893,7 @@ R Commands and Examples
predict
function can be used to apply the model to test data, similar to the rest of the algorithms in caret
package
example
-# load data
+# load data
data(Wage)
# remove log wage variable (we are trying to predict wage)
Wage <- subset(Wage,select=-c(logwage))
@@ -1826,37 +1903,35 @@ R Commands and Examples
# run the gbm model
modFit <- train(wage ~ ., method="gbm",data=training,verbose=FALSE)
# print model summary
-print(modFit)
-## Stochastic Gradient Boosting
-##
-## 2102 samples
-## 10 predictor
-##
-## No pre-processing
-## Resampling: Bootstrapped (25 reps)
-##
-## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
-##
-## Resampling results across tuning parameters:
-##
-## interaction.depth n.trees RMSE Rsquared RMSE SD Rsquared SD
-## 1 50 35.64972 0.3317422 1.495165 0.02312183
-## 1 100 34.95593 0.3429594 1.503223 0.02319651
-## 1 150 34.84473 0.3451634 1.496016 0.02324176
-## 2 50 34.91119 0.3462101 1.490004 0.02425481
-## 2 100 34.74433 0.3487227 1.480423 0.02278780
-## 2 150 34.74823 0.3487136 1.472941 0.02314265
-## 3 50 34.83480 0.3467828 1.493846 0.02292988
-## 3 100 34.85342 0.3449018 1.482881 0.02373242
-## 3 150 34.99413 0.3401694 1.544378 0.02498133
-##
-## Tuning parameter 'shrinkage' was held constant at a value of 0.1
-##
-## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
-## RMSE was used to select the optimal model using the smallest value.
-## The final values used for the model were n.trees = 100,
-## interaction.depth = 2, shrinkage = 0.1 and n.minobsinnode = 10.
-\(\pagebreak\)
+print(modFit)
+Stochastic Gradient Boosting
+
+2102 samples
+ 10 predictors
+
+No pre-processing
+Resampling: Bootstrapped (25 reps)
+Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
+Resampling results across tuning parameters:
+
+ interaction.depth n.trees RMSE Rsquared
+ 1 50 35.24716 0.3061446
+ 1 100 34.74137 0.3165154
+ 1 150 34.70108 0.3179425
+ 2 50 34.67703 0.3199690
+ 2 100 34.58430 0.3226554
+ 2 150 34.67104 0.3202411
+ 3 50 34.59842 0.3219575
+ 3 100 34.73009 0.3183701
+ 3 150 34.97108 0.3107605
+
+Tuning parameter 'shrinkage' was held constant at a value of 0.1
+
+Tuning parameter 'n.minobsinnode' was held constant at a value of 10
+RMSE was used to select the optimal model using the smallest value.
+The final values used for the model were n.trees = 100,
+ interaction.depth = 2, shrinkage = 0.1 and n.minobsinnode = 10.
+\(\pagebreak\)
@@ -1869,34 +1944,34 @@ Model Based Prediction
can be reasonably accurate on real problems
this approach does make additional assumptions about the data, which can lead to model failure/reduced accuracy if they are too far off
-goal = build parameter-based model (based on probabilities) for conditional distribution \(P(Y = k~|~X = x)\), or the probability of the outcome \(Y\) is equal to a particular value \(k\) given a specific set of predictor variables \(x\)
+ goal = build parameter-based model (based on probabilities) for conditional distribution \(P(Y = k~|~X = x)\), or the probability of the outcome \(Y\) is equal to a particular value \(k\) given a specific set of predictor variables \(x\)
-- Note: \(X\) is the data for the model (observations for all predictor variables), which is also known as the design matrix
+- Note: \(X\) is the data for the model (observations for all predictor variables), which is also known as the design matrix
typical approach/process
-- start with the quantity \(P(Y = k~|~X = x)\)
-- apply Bayes’ Theorem such that \[ P(Y = k ~|~ X=x) = \frac{P(X=x~|~Y=k)P(Y=k)}{\sum_{\ell=1}^K P(X=x ~|~Y = \ell) P(Y=\ell)}\] where the denominator is simply the sum of probabilities for the predictor variables are the set specified in \(x\) for all outcomes of \(Y\)
-- assume the term \(P(X=x~|~Y=k)\) in the numerator follows a parameter-based probability distribution, or \(f_k(x)\)
+
- start with the quantity \(P(Y = k~|~X = x)\)
+- apply Bayes’ Theorem such that \[ P(Y = k ~|~ X=x) = \frac{P(X=x~|~Y=k)P(Y=k)}{\sum_{\ell=1}^K P(X=x ~|~Y = \ell) P(Y=\ell)}\] where the denominator is simply the sum of probabilities for the predictor variables are the set specified in \(x\) for all outcomes of \(Y\)
+- assume the term \(P(X=x~|~Y=k)\) in the numerator follows a parameter-based probability distribution, or \(f_k(x)\)
-- common choice = Gaussian distribution \[f_k(x) = \frac{1}{\sigma_k \sqrt{2 \pi}}e^{-\frac{(x-\mu_k)^2}{2\sigma_k^2}}\]
+- common choice = Gaussian distribution \[f_k(x) = \frac{1}{\sigma_k \sqrt{2 \pi}}e^{-\frac{(x-\mu_k)^2}{2\sigma_k^2}}\]
-- assume the probability for the outcome \(Y\) to take on value of \(k\), or \(P(Y=k)\), is determined from the data to be some known quantity \(\pi_k\)
+
- assume the probability for the outcome \(Y\) to take on value of \(k\), or \(P(Y=k)\), is determined from the data to be some known quantity \(\pi_k\)
-- Note: \(P(Y=k)\) is known as the prior probability
+- Note: \(P(Y=k)\) is known as the prior probability
-- so the quantity \(P(Y = k~|~X = x)\) can be rewritten as \[P(Y = k ~|~ X=x) = \frac{f_k(x) \pi_k}{\sum_{\ell = 1}^K f_{\ell}(x) \pi_{\ell}}\]
-- estimate the parameters (\(\mu_k\), \(\sigma_k^2\)) for the function \(f_k(x)\) from the data
-- calculate \(P(Y = k~|~X = x)\) using the parameters
-- the outcome \(Y\) is where the value of \(P(Y = k ~|~ X = x)\) is the highest
+- so the quantity \(P(Y = k~|~X = x)\) can be rewritten as \[P(Y = k ~|~ X=x) = \frac{f_k(x) \pi_k}{\sum_{\ell = 1}^K f_{\ell}(x) \pi_{\ell}}\]
+- estimate the parameters (\(\mu_k\), \(\sigma_k^2\)) for the function \(f_k(x)\) from the data
+- calculate \(P(Y = k~|~X = x)\) using the parameters
+- the predicted outcome \(Y\) is the value of \(k\) for which \(P(Y = k ~|~ X = x)\) is the highest (a toy numeric sketch of these steps follows this list)
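A toy numeric sketch of the process above for one predictor and two classes (the priors, means, and standard deviations below are made-up values for illustration):
# estimated priors pi_k and Gaussian parameters (mu_k, sigma_k) for two classes
pi_k <- c(0.5, 0.5); mu <- c(0, 2); sigma <- c(1, 1)
# a new observation x
x <- 1.2
# Gaussian densities f_k(x) and the posterior P(Y = k | X = x)
f_k <- dnorm(x, mean=mu, sd=sigma)
post <- pi_k * f_k / sum(pi_k * f_k)
# predicted class = argmax_k of the posterior
which.max(post)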
prediction models that leverage this approach
-- linear discriminant analysis = assumes \(f_k(x)\) is multivariate Gaussian distribution with same covariance for each predictor variables
+
- linear discriminant analysis = assumes \(f_k(x)\) is multivariate Gaussian distribution with same covariance for each predictor variables
- effectively drawing lines through “covariate space”
-- quadratic discriminant analysis = assumes \(f_k(x)\) is multivariate Gaussian distribution with different covariance for predictor variables
+
- quadratic discriminant analysis = assumes \(f_k(x)\) is multivariate Gaussian distribution with different covariance for predictor variables
- effectively drawing curves through “covariate space”
@@ -1910,35 +1985,36 @@ Model Based Prediction
Linear Discriminant Analysis
-- to compare the probability for outcome \(Y = k\) versus probability for outcome \(Y = k\), we can look at the ratio of \[\frac{P(Y=k~|~X=x)}{P(Y=j~|~X=x)}\]
-- take the log of the ratio and apply Bayes’ Theorem, we get \[\log \frac{P(Y = k~|~X=x)}{P(Y = j~|~X=x)} = \log \frac{f_k(x)}{f_j(x)} + \log \frac{\pi_k}{\pi_j}\] which is effectively the log ratio of probability density functions plus the log ratio of prior probabilities
+
- to compare the probability for outcome \(Y = k\) versus the probability for outcome \(Y = j\), we can look at the ratio of \[\frac{P(Y=k~|~X=x)}{P(Y=j~|~X=x)}\]
+- take the log of the ratio and apply Bayes’ Theorem, we get \[\log \frac{P(Y = k~|~X=x)}{P(Y = j~|~X=x)} = \log \frac{f_k(x)}{f_j(x)} + \log \frac{\pi_k}{\pi_j}\] which is effectively the log ratio of probability density functions plus the log ratio of prior probabilities
-- Note: \(\log\) = monotone transformation, which means taking the \(\log\) of a quantity does not affect the implication of the ratio since the \(\log(ratio)\) is directly correlated with the ratio
+- Note: \(\log\) = monotone transformation, which means taking the \(\log\) of a quantity does not affect the implication of the ratio since the \(\log(ratio)\) is directly correlated with the ratio
-- if we substitute \(f_k(x)\) and \(f_l(x)\) with Gaussian probability density functions \[f(x) = \frac{1}{\sigma \sqrt{2 \pi}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\] so the ratio can be simplified to \[\log \frac{P(Y = k~|~X=x)}{P(Y = j~|~X=x)} = \log \frac{\pi_k}{\pi_j} - \frac{1}{2}(\mu_k + \mu_j)^T \Sigma^{-1}(\mu_k + \mu_j) + x^T \Sigma^{-1} (\mu_k - \mu_j)\] where \(\Sigma^{-1}\) = covariance matrix for the predictor variables, \(x^T\) = set of predictor variables, and \(\mu_k\) / \(\mu_j\) = mean of \(k\), \(j\) respectively
-- as annotated above, the log-ratio is effectively an equation of a line for a set of predictor variables \(x\) \(\rightarrow\) the first two terms are constants and the last term is in the form of \(X \beta\)
+
- if we substitute \(f_k(x)\) and \(f_j(x)\) with Gaussian probability density functions \[f(x) = \frac{1}{\sigma \sqrt{2 \pi}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\] the ratio can be simplified to \[\log \frac{P(Y = k~|~X=x)}{P(Y = j~|~X=x)} = \log \frac{\pi_k}{\pi_j} - \frac{1}{2}(\mu_k + \mu_j)^T \Sigma^{-1}(\mu_k - \mu_j) + x^T \Sigma^{-1} (\mu_k - \mu_j)\] where \(\Sigma\) = covariance matrix for the predictor variables, \(x^T\) = set of predictor variables, and \(\mu_k\) / \(\mu_j\) = mean of \(k\), \(j\) respectively
+- as annotated above, the log-ratio is effectively an equation of a line for a set of predictor variables \(x\) \(\rightarrow\) the first two terms are constants and the last term is in the form of \(X \beta\)
- Note: the lines are also known as decision boundaries
-- therefore, we can classify values based on which side of the line the value is located (\(k\) vs \(j\))
-- discriminant functions are used to determine value of \(k\), the functions are in the form of \[\delta_k(x) = x^T \Sigma^{-1} \mu_k - \frac{1}{2}\mu_k \Sigma^{-1}\mu_k + log(\mu_k)\]
+
- therefore, we can classify values based on which side of the line the value is located (\(k\) vs \(j\))
+- discriminant functions are used to determine value of \(k\), the functions are in the form of \[\delta_k(x) = x^T \Sigma^{-1} \mu_k - \frac{1}{2}\mu_k \Sigma^{-1}\mu_k + log(\mu_k)\]
-- plugging in the set of predictor variables, \(x^T\), into the discriminant function, we can find the value of \(k\) that maximizes the function \(\delta_k(x)\)
+- plugging in the set of predictor variables, \(x^T\), into the discriminant function, we can find the value of \(k\) that maximizes the function \(\delta_k(x)\)
- the terms of the discriminant function can be estimated using maximum likelihood
-the predicted value for the outcome is therefore \(\hat{Y}(x) = argmax_k \delta_k(x)\)
+the predicted value for the outcome is therefore \(\hat{Y}(x) = argmax_k \delta_k(x)\)
- example
-- classify a group of values into 3 groups using 2 variables (\(x\), \(y\) coordinates)
+- classify a group of values into 3 groups using 2 variables (\(x\), \(y\) coordinates)
- 3 lines are drawn to split the data into 3 Gaussian distributions
-- each line splits the data into two groups \(\rightarrow\) 1 vs 2, 2 vs 3, 1 vs 3
-- each side of the line represents a region where the probability of one group (1, 2, or 3) is the highest
+- each line splits the data into two groups \(\rightarrow\) 1 vs 2, 2 vs 3, 1 vs 3
+- each side of the line represents a region where the probability of one group (1, 2, or 3) is the highest
+
- the result is represented in the following graph
-
+
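- to make the discriminant machinery above concrete, here is a minimal sketch (not from the notes) that fits LDA directly with `MASS::lda`, the engine that `caret`'s `method="lda"` wraps; the `iris` split mirrors the `caret` example below

```r
# a minimal sketch (not from the notes): fit LDA directly with MASS::lda
library(caret); library(MASS)
data(iris); set.seed(333)
inTrain <- createDataPartition(y=iris$Species, p=0.7, list=FALSE)
training <- iris[inTrain,]; testing <- iris[-inTrain,]
# estimate the class priors pi_k, class means mu_k, and pooled covariance Sigma
lda.fit <- lda(Species ~ ., data=training)
# predict() picks the class k that maximizes the discriminant delta_k(x);
# $posterior holds P(Y = k | X = x) for each class
lda.pred <- predict(lda.fit, testing)
table(lda.pred$class, testing$Species)
```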
- R Commands
@@ -1947,7 +2023,7 @@ Linear Discriminant Analysis
- example:
caret
package
-# load data
+# load data
data(iris)
# create training and test sets
inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
@@ -1958,27 +2034,27 @@ Linear Discriminant Analysis
# predict test outcomes using LDA model
pred.lda <- predict(lda,testing)
# print results
-pred.lda
-## [1] setosa setosa setosa setosa setosa setosa
-## [7] setosa setosa setosa setosa setosa setosa
-## [13] setosa setosa setosa versicolor versicolor versicolor
-## [19] versicolor versicolor versicolor versicolor versicolor versicolor
-## [25] versicolor versicolor versicolor versicolor versicolor versicolor
-## [31] virginica virginica virginica virginica virginica virginica
-## [37] virginica virginica virginica virginica virginica virginica
-## [43] virginica virginica virginica
-## Levels: setosa versicolor virginica
+pred.lda
+ [1] setosa setosa setosa setosa setosa setosa
+ [7] setosa setosa setosa setosa setosa setosa
+[13] setosa setosa setosa versicolor versicolor versicolor
+[19] versicolor versicolor versicolor versicolor versicolor versicolor
+[25] versicolor versicolor versicolor versicolor versicolor versicolor
+[31] virginica virginica virginica virginica virginica virginica
+[37] virginica virginica virginica virginica virginica virginica
+[43] virginica virginica virginica
+Levels: setosa versicolor virginica
Naive Bayes
-- for predictors \(X_1,\ldots,X_m\), we want to model \(P(Y = k ~|~ X_1,\ldots,X_m)\)
-- by applying Bayes’ Theorem, we get \[P(Y = k ~|~ X_1,\ldots,X_m) = \frac{\pi_k P(X_1,\ldots,X_m~|~ Y=k)}{\sum_{\ell = 1}^K P(X_1,\ldots,X_m ~|~ Y=k) \pi_{\ell}}\]
-- since the denominator is just a sum (constant), we can rewrite the quantity as \[P(Y = k ~|~ X_1,\ldots,X_m) \propto \pi_k P(X_1,\ldots,X_m~|~ Y=k)\] or the probability is proportional to the numerator
+
- for predictors \(X_1,\ldots,X_m\), we want to model \(P(Y = k ~|~ X_1,\ldots,X_m)\)
+- by applying Bayes’ Theorem, we get \[P(Y = k ~|~ X_1,\ldots,X_m) = \frac{\pi_k P(X_1,\ldots,X_m~|~ Y=k)}{\sum_{\ell = 1}^K \pi_{\ell} P(X_1,\ldots,X_m ~|~ Y=\ell)}\]
+- since the denominator is just a sum (constant), we can rewrite the quantity as \[P(Y = k ~|~ X_1,\ldots,X_m) \propto \pi_k P(X_1,\ldots,X_m~|~ Y=k)\] or the probability is proportional to the numerator
- Note: since the denominator does not depend on \(k\), maximizing the numerator is the same as maximizing the probability \(P(Y = k ~|~ X_1,\ldots,X_m)\) itself
-- \(\pi_k P(X_1,\ldots,X_m~|~ Y=k)\) can be rewritten as \[\begin{aligned}
+
- \(\pi_k P(X_1,\ldots,X_m~|~ Y=k)\) can be rewritten as \[\begin{aligned}
\pi_k P(X_1,\ldots,X_m~|~ Y=k) & = \pi_k P(X_1 ~|~ Y = k)P(X_2,\ldots,X_m ~|~ X_1,Y=k) \\
& = \pi_k P(X_1 ~|~ Y = k) P(X_2 ~|~ X_1, Y=k) P(X_3,\ldots,X_m ~|~ X_1,X_2, Y=k) \\
& = \pi_k P(X_1 ~|~ Y = k) P(X_2 ~|~ X_1, Y=k)\ldots P(X_m~|~X_1\ldots,X_{m-1},Y=k) \\
@@ -1986,7 +2062,7 @@
Naive Bayes
- this is effectively indicating that each of the predictors may be dependent on other predictors
-- however, if we make the assumption that all predictor variables are independent to each other, the quantity can be simplified to \[ \pi_k P(X_1,\ldots,X_m~|~ Y=k) \approx \pi_k P(X_1 ~|~ Y = k) P(X_2 ~|~ Y = k)\ldots P(X_m ~|~,Y=k)\] which is effectively the product of the prior probability for \(k\) and the probability of variables \(X_1,\ldots,X_m\) given that \(Y = k\)
+
- however, if we make the assumption that all predictor variables are independent of each other, the quantity can be simplified to \[ \pi_k P(X_1,\ldots,X_m~|~ Y=k) \approx \pi_k P(X_1 ~|~ Y = k) P(X_2 ~|~ Y = k)\ldots P(X_m ~|~ Y=k)\] which is effectively the product of the prior probability for \(k\) and the probabilities of variables \(X_1,\ldots,X_m\) given that \(Y = k\)
- Note: the assumption is naive in that it is unlikely the predictors are completely independent of each other, but this model still produces useful results, particularly with a large number of binary/categorical variables
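- as a concrete illustration of the independence assumption, the following hedged sketch (not from the notes, and assuming Gaussian class-conditional densities for the `iris` measurements) computes the naive Bayes posterior for a single observation by multiplying univariate densities

```r
# a minimal sketch: naive Bayes posterior for one observation, by hand
data(iris)
x.star  <- iris[1, 1:4]                       # observation to classify
classes <- levels(iris$Species)
post <- sapply(classes, function(k) {
  Xk    <- iris[iris$Species == k, 1:4]
  prior <- mean(iris$Species == k)            # pi_k
  # "naive" step: product of univariate Gaussian densities P(X_i | Y = k)
  dens  <- prod(mapply(dnorm, as.numeric(x.star), colMeans(Xk), apply(Xk, 2, sd)))
  prior * dens
})
post / sum(post)                              # P(Y = k | X = x*) for each class
```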
@@ -2000,21 +2076,22 @@ Naive Bayes
- example:
caret
package
-# using the same data from iris, run naive Bayes on training data
+# package needed: klaR
+# using the same data from iris, run naive Bayes on training data
nb <- train(Species ~ ., data=training,method="nb")
# predict test outcomes using naive Bayes model
pred.nb <- predict(nb,testing)
# print results
-pred.nb
-## [1] setosa setosa setosa setosa setosa setosa
-## [7] setosa setosa setosa setosa setosa setosa
-## [13] setosa setosa setosa versicolor versicolor versicolor
-## [19] versicolor versicolor versicolor versicolor versicolor versicolor
-## [25] versicolor versicolor versicolor versicolor versicolor versicolor
-## [31] virginica virginica virginica virginica virginica virginica
-## [37] virginica virginica virginica virginica virginica virginica
-## [43] virginica virginica virginica
-## Levels: setosa versicolor virginica
+pred.nb
+ [1] setosa setosa setosa setosa setosa setosa
+ [7] setosa setosa setosa setosa setosa setosa
+[13] setosa setosa setosa versicolor versicolor versicolor
+[19] versicolor versicolor versicolor versicolor versicolor versicolor
+[25] versicolor versicolor versicolor versicolor versicolor versicolor
+[31] virginica virginica virginica virginica virginica virginica
+[37] versicolor virginica virginica virginica virginica virginica
+[43] virginica virginica virginica
+Levels: setosa versicolor virginica
Compare Results for LDA and Naive Bayes
@@ -2022,22 +2099,22 @@ Compare Results for LDA and Naive Bayes
- linear discriminant analysis and naive Bayes generally produce similar results for small data sets
- for our example data from
iris
data set, we can compare the prediction results from the two models
-# tabulate the prediction results from LDA and naive Bayes
-table(pred.lda,pred.nb)
-## pred.nb
-## pred.lda setosa versicolor virginica
-## setosa 15 0 0
-## versicolor 0 15 0
-## virginica 0 0 15
-# create logical variable that returns TRUE for when predictions from the two models match
+# tabulate the prediction results from LDA and naive Bayes
+table(pred.lda,pred.nb)
+ pred.nb
+pred.lda setosa versicolor virginica
+ setosa 15 0 0
+ versicolor 0 15 0
+ virginica 0 1 14
+# create logical variable that returns TRUE for when predictions from the two models match
equalPredictions <- (pred.lda==pred.nb)
# plot the comparison
-qplot(Petal.Width,Sepal.Width,colour=equalPredictions,data=testing)
-
+qplot(Petal.Width,Sepal.Width,colour=equalPredictions,data=testing)
+
- as we can see from above, only one data point, which is located in between the two classes, is predicted differently by the two models
-\(\pagebreak\)
+\(\pagebreak\)
@@ -2049,7 +2126,7 @@ Model Selection
the error for the prediction model on the test set decreases at first and then increases as the number of predictors used approaches the total number of predictors available
-
+
- this is expected, since as more predictors are used, the model is more likely to overfit the training data
- goal in selecting models = avoid overfitting on training data and minimize error on test data
@@ -2077,7 +2154,7 @@ Example: Training vs Test Error for Combination of Predictors
- to demonstrate the behavior of training and test errors, the
prostate
dataset from Elements of Statistical Learning is used
- all combinations of predictors are used to produce prediction models, and Residual Squared Error (RSS) is calculated for all models on both the training and test sets
-# load data and set seed
+# load data and set seed
data(prostate); set.seed(1)
# define outcome y and predictors x
covnames <- names(prostate[-(9:10)])
@@ -2133,8 +2210,8 @@ Example: Training vs Test Error for Combination of Predictors
# plot line through the minimum test RSS data points in blue
lines((1:p), minrss, col="red", lwd=1.7)
# add legend
-legend("topright", c("Train", "Test"), col=c("blue", "red"), pch=1)
-
+legend("topright", c("Train", "Test"), col=c("blue", "red"), pch=1)
+
- from the above, we can clearly see that the test RSS error approaches its minimum at around 3 predictors and increases slightly as more predictors are used
@@ -2161,16 +2238,16 @@ Split Samples
Decompose Expected Prediction Error
-- the outcome \(Y_i\) can be modeled by \[Y_i = f(X_i) + \epsilon_i\] where \(\epsilon_i\) = error term
-- the expected prediction error is defined as \[EPE(\lambda) = E\left[\left(Y - \hat{f}_{\lambda}(X)\right)^2\right]\] where \(\lambda\) = specific set of tuning parameters
-- estimates from the model constructed with training data can be denoted as \(\hat{f}_{\lambda}(x^*)\) where \(X = x^*\) is the new data point that we would like to predict at
-- the expected prediction error is as follows \[\begin{aligned}
+
- the outcome \(Y_i\) can be modeled by \[Y_i = f(X_i) + \epsilon_i\] where \(\epsilon_i\) = error term
+- the expected prediction error is defined as \[EPE(\lambda) = E\left[\left(Y - \hat{f}_{\lambda}(X)\right)^2\right]\] where \(\lambda\) = specific set of tuning parameters
+- estimates from the model constructed with training data can be denoted as \(\hat{f}_{\lambda}(x^*)\) where \(X = x^*\) is the new data point that we would like to predict at
+- the expected prediction error is as follows \[\begin{aligned}
E\left[\left(Y - \hat{f}_{\lambda}(x^*)\right)^2\right] & = \sigma^2 + \left(E[\hat{f}_{\lambda}(x^*)] - f(x^*)\right)^2 + E\left[\left(\hat{f}_{\lambda}(x^*) - E[\hat{f}_{\lambda}(x^*)]\right)^2\right]\\
& = \mbox{Irreducible Error} + \mbox{Bias}^2 + \mbox{Variance}\\
\end{aligned} \]
- goal of prediction model = minimize overall expected prediction error
-- irreducible error = noise inherent to the data collection process \(\rightarrow\) cannot be reduced
+- irreducible error = noise inherent to the data collection process \(\rightarrow\) cannot be reduced
- bias/variance = can be traded off in order to find the optimal model (least error)
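- the decomposition can be checked numerically; the following is a hedged simulation sketch (the true function, noise level, and cubic fit are illustrative assumptions, not from the notes)

```r
# simulate many training sets, refit the model, and decompose the error at x* = 0.5
set.seed(1)
f <- function(x) sin(2*pi*x)                  # assumed true f(X)
sigma <- 0.3                                  # sd of the irreducible noise
x.star <- 0.5; n <- 50; B <- 2000
preds <- replicate(B, {
  x <- runif(n); y <- f(x) + rnorm(n, sd=sigma)
  predict(lm(y ~ poly(x, 3)), data.frame(x = x.star))
})
bias2    <- (mean(preds) - f(x.star))^2       # (E[f.hat(x*)] - f(x*))^2
variance <- var(preds)                        # E[(f.hat(x*) - E[f.hat(x*)])^2]
c(irreducible = sigma^2, bias2 = bias2, variance = variance,
  total = sigma^2 + bias2 + variance)
```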
@@ -2186,9 +2263,9 @@ Hard Thresholding
hard thresholding can help estimate the coefficients/model by taking subsets of predictors and building models
process
-- model the outcome as \[Y_i = f(X_i) + \epsilon_i\] where \(\epsilon_i\) = error term
-- assume the prediction estimate has a linear form \[\hat{f}_{\lambda}(x) = x'\beta\] where only \(\lambda\) coefficients for the set of predictors \(x\) are nonzero
-- after setting the value of \(\lambda\), compute models using all combinations of \(\lambda\) variables to find which variables’ coefficients should be set to be zero
+- model the outcome as \[Y_i = f(X_i) + \epsilon_i\] where \(\epsilon_i\) = error term
+- assume the prediction estimate has a linear form \[\hat{f}_{\lambda}(x) = x'\beta\] where only \(\lambda\) coefficients for the set of predictors \(x\) are nonzero
+- after setting the value of \(\lambda\), compute models using all combinations of \(\lambda\) variables to find which variables’ coefficients should be set to zero
problem
@@ -2199,43 +2276,43 @@ Hard Thresholding
- as we can see from the results below, some of the coefficients have values of
NA
-# load prostate data
+# load prostate data
data(prostate)
# create a small subset of only 5 observations (fewer observations than the 10 variables)
small = prostate[1:5,]
# print linear regression
-lm(lpsa ~ .,data =small)
-##
-## Call:
-## lm(formula = lpsa ~ ., data = small)
-##
-## Coefficients:
-## (Intercept) lcavol lweight age lbph
-## 9.60615 0.13901 -0.79142 0.09516 NA
-## svi lcp gleason pgg45 trainTRUE
-## NA NA -2.08710 NA NA
+lm(lpsa ~ .,data =small)
+
+Call:
+lm(formula = lpsa ~ ., data = small)
+
+Coefficients:
+(Intercept) lcavol lweight age lbph
+ 9.60615 0.13901 -0.79142 0.09516 NA
+ svi lcp gleason pgg45 trainTRUE
+ NA NA -2.08710 NA NA
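- the brute-force subset search described above can be carried out with an off-the-shelf tool; this is a hedged sketch assuming the `leaps` package (not used in the notes) and the same `prostate` data from `ElemStatLearn`

```r
# exhaustive search over subsets of predictors (hard thresholding by subset size)
library(ElemStatLearn); library(leaps)
data(prostate)
subsets <- regsubsets(lpsa ~ . - train, data=prostate, nvmax=8)
# for each model size: which coefficients are kept (TRUE) vs forced to zero (FALSE)
summary(subsets)$which
# BIC by model size, one way to choose how many coefficients to keep nonzero
summary(subsets)$bic
```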
Regularized Regression Concept (Resource)
- regularized regression = fit a regression model and adjust for the large coefficients in an attempt to help with the bias/variance trade-off or model selection
-- when running regressions unconstrained (without specifying any criteria for coefficients), the model may be susceptible to high variance (coefficients explode \(\rightarrow\) very large values) if there are variables that are highly correlated
+- when running regressions unconstrained (without specifying any criteria for coefficients), the model may be susceptible to high variance (coefficients explode \(\rightarrow\) very large values) if there are variables that are highly correlated
- controlling/regularizing coefficients may slightly increase bias (lose a bit of prediction capability) but will reduce variance and improve the prediction error
- however, this approach may be very demanding computationally and generally does not perform as well as random forest/boosting
-- Penalized Residual Sum of Squares (PRSS) is calculated by adding a penalty term to the prediction squared error \[PRSS(\beta) = \sum_{j=1}^n (Y_j - \sum_{i=1}^m \beta_{1i} X_{ij})^2 + P(\lambda; \beta)\]
+
- Penalized Residual Sum of Squares (PRSS) is calculated by adding a penalty term to the prediction squared error \[PRSS(\beta) = \sum_{j=1}^n (Y_j - \sum_{i=1}^m \beta_{1i} X_{ij})^2 + P(\lambda; \beta)\]
- penalty shrinks coefficients if their values become too large
- penalty term is generally used to reduce complexity and variance for the model, while respecting the structure of the data/relationship
- example: co-linear variables
-- given a linear model, \[Y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \epsilon\] where \(X_1\) and \(X_2\) are nearly perfectly correlated (co-linear)
-- the model can then be approximated by this model by omitting \(X_2\), so the model becomes \[Y = \beta_0 + (\beta_1 + \beta_2)X_1 + \epsilon\]
-- with the above model, we can get a good estimate of \(Y\)
+
- given a linear model, \[Y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \epsilon\] where \(X_1\) and \(X_2\) are nearly perfectly correlated (co-linear)
+- the model can then be approximated by omitting \(X_2\), so the model becomes \[Y = \beta_0 + (\beta_1 + \beta_2)X_1 + \epsilon\]
+- with the above model, we can get a good estimate of \(Y\)
-- the estimate of \(Y\) will be biased
+- the estimate of \(Y\) will be biased
- but the variance of the prediction may be reduced
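- a hedged simulation sketch of the co-linear case above (the data-generating choices are illustrative assumptions; `lm.ridge` from `MASS` is previewed here and covered in the next section)

```r
# two nearly identical predictors: unconstrained lm() coefficients become unstable,
# while a small ridge penalty shrinks them back toward a shared value
library(MASS); set.seed(1)
n  <- 100
x1 <- rnorm(n)
x2 <- x1 + rnorm(n, sd=0.01)                  # x2 nearly collinear with x1
y  <- 2 + 3*x1 + rnorm(n)                     # truth only involves x1
coef(lm(y ~ x1 + x2))                         # effect split erratically between x1, x2
coef(lm.ridge(y ~ x1 + x2, lambda=5))         # penalty pulls both toward a shared value
```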
@@ -2244,39 +2321,39 @@ Regularized Regression Concept (
Regularized Regression - Ridge Regression
-- the penalized residual sum of squares (PRSS) takes the form of \[PRSS(\beta_j) = \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2 + \lambda \sum_{j=1}^p \beta_j^2\]
-- this is equivalent to solving the equation \[PRSS(\beta_j) = \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2\] subject to constraint \(\sum_{j=1}^p \beta_j^2 \leq s\) where \(s\) is inversely proportional to \(\lambda\)
+
- the penalized residual sum of squares (PRSS) takes the form of \[PRSS(\beta_j) = \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2 + \lambda \sum_{j=1}^p \beta_j^2\]
+- this is equivalent to solving the equation \[PRSS(\beta_j) = \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2\] subject to constraint \(\sum_{j=1}^p \beta_j^2 \leq s\) where \(s\) is inversely proportional to \(\lambda\)
-- if the coefficients \(\beta_j\) are large in value, the term \(\sum_{j=1}^p \beta_j^2\) will cause the overall PRSS value to increase, leading to worse models
+- if the coefficients \(\beta_j\) are large in value, the term \(\sum_{j=1}^p \beta_j^2\) will cause the overall PRSS value to increase, leading to worse models
- the presence of the term thus requires some of the coefficients to be small
-- inclusion of \(\lambda\) makes the problem non-singular even if \(X^TX\) is not invertible
+
- inclusion of \(\lambda\) makes the problem non-singular even if \(X^TX\) is not invertible
- this means that even in cases where there are more predictors than observations, the coefficients of the predictors can still be estimated
-- \(\lambda\) = tuning parameter
+
- \(\lambda\) = tuning parameter
- controls size of coefficients or the amount of regularization
-- as \(\lambda \rightarrow 0\), the result approaches the least square solution
-- as \(\lambda \rightarrow \infty\), all of the coefficients receive large penalties and the conditional coefficients \(\hat{\beta}_{\lambda=\infty}^{ridge}\) approaches zero collectively
-- \(\lambda\) should be carefully chosen through cross-validation/other techniques to find the optimal tradeoff of bias for variance
+- as \(\lambda \rightarrow 0\), the result approaches the least squares solution
+- as \(\lambda \rightarrow \infty\), all of the coefficients receive large penalties and the ridge coefficients \(\hat{\beta}_{\lambda=\infty}^{ridge}\) approach zero collectively
+- \(\lambda\) should be carefully chosen through cross-validation/other techniques to find the optimal tradeoff of bias for variance
- Note: it is important to realize that all coefficients (though they may be shrunk to very small values) will still be included in the model when applying ridge regression
- R Commands
-- [
MASS
package] ridge<-lm.ridge(outcome ~ predictors, data=training, lambda=5)
= perform ridge regression with given outcome and predictors using the provided \(\lambda\) value
+ - [
MASS
package] ridge<-lm.ridge(outcome ~ predictors, data=training, lambda=5)
= perform ridge regression with given outcome and predictors using the provided \(\lambda\) value
- Note: the predictors are centered and scaled first before the regression is run
lambda=5
= tuning parameter
ridge$xm
= returns column/predictor mean from the data
ridge$scale
= returns the scaling performed on the predictors for the ridge regression
-- Note: all the variables are divided by the biased standard deviation \(\sum (X_i - \bar X_i) / n\)
+- Note: all the variables are divided by the biased standard deviation \(\sqrt{\sum_i (X_i - \bar X)^2 / n}\)
-ridge$coef
= returns the conditional coefficients, \(\beta_j\) from the ridge regression
+ridge$coef
= returns the conditional coefficients, \(\beta_j\) from the ridge regression
ridge$ym
= return mean of outcome
-- [
caret
package] train(outcome ~ predictors, data=training, method="ridge", lambda=5)
= perform ridge regression with given outcome and predictors
+ caret
package train(outcome ~ predictors, data=training, method="ridge", lambda=5)
= perform ridge regression with given outcome and predictors
preProcess=c("center", "scale")
= centers and scales the predictors before the model is built
@@ -2284,7 +2361,7 @@ Regularized Regression - Ridge Regression
lambda=5
= tuning parameter
-- [
caret
package] train(outcome ~ predictors, data=training, method="foba", lambda=5, k=4)
= perform ridge regression with variable selection
+ caret
package train(outcome ~ predictors, data=training, method="foba", lambda=5, k=4)
= perform ridge regression with variable selection
lambda=5
= tuning parameter
k=4
= number of variables that should be retained
@@ -2292,26 +2369,26 @@ Regularized Regression - Ridge Regression
- this means that
length(predictors)-k
variables will be eliminated
-- [
caret
package] predict(model,test)
= use the model to predict on test set \(\rightarrow\) similar to all other caret
algorithms
+caret
package predict(model,test)
= use the model to predict on test set \(\rightarrow\) similar to all other caret
algorithms
-- example: ridge coefficient paths vs \(\lambda\)
+
- example: ridge coefficient paths vs \(\lambda\)
-- using the same
prostate
dataset, we will run ridge regressions with different values of \(\lambda\) and find the optimum \(\lambda\) value that minimizes test RSS
+- using the same
prostate
dataset, we will run ridge regressions with different values of \(\lambda\) and find the optimum \(\lambda\) value that minimizes test RSS
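- a hedged sketch (not the notes' own plotting code) of how a coefficient-path plot like the one shown below can be produced with `MASS::lm.ridge` on the `prostate` training rows

```r
# refit ridge regression over a grid of lambda values and plot each coefficient's path
library(MASS); library(ElemStatLearn)
data(prostate)
train.dat <- subset(prostate, train == TRUE, select = -train)
lambdas <- seq(0, 50, by = 0.5)
ridge <- lm.ridge(lpsa ~ ., data = train.dat, lambda = lambdas)
# coef(ridge) has one row per lambda; drop the intercept column before plotting
matplot(lambdas, coef(ridge)[, -1], type = "l", lty = 1,
        xlab = expression(lambda), ylab = "ridge coefficient")
```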
-

+

Regularized Regression - LASSO Regression
- LASSO (least absolute shrinkage and selection operator) was introduced by Tibshirani (Journal of the Royal Statistical Society 1996)
- similar to ridge, with slightly different penalty term
-- the penalized residual sum of squares (PRSS) takes the form of \[PRSS(\beta_j) = \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2 + \lambda \sum_{j=1}^p |\beta_j|\]
-- this is equivalent to solving the equation \[PRSS(\beta_j) = \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2\] subject to constraint \(\sum_{j=1}^p |\beta_j| \leq s\) where \(s\) is inversely proportional to \(\lambda\)
-- \(\lambda\) = tuning parameter
+
- the penalized residual sum of squares (PRSS) takes the form of \[PRSS(\beta_j) = \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2 + \lambda \sum_{j=1}^p |\beta_j|\]
+- this is equivalent to solving the equation \[PRSS(\beta_j) = \sum_{i=1}^N \left(y_i - \beta_0 - \sum_{j=1}^p x_{ij}\beta_j \right)^2\] subject to constraint \(\sum_{j=1}^p |\beta_j| \leq s\) where \(s\) is inversely proportional to \(\lambda\)
+- \(\lambda\) = tuning parameter
- controls size of coefficients or the amount of regularization
-- large values of \(\lambda\) will set some coefficient equal to zero
+
- large values of \(\lambda\) will set some coefficients exactly to zero
- Note: LASSO effectively performs model selection (chooses a subset of predictors) while shrinking other coefficients, whereas ridge only shrinks the coefficients
@@ -2323,7 +2400,7 @@ Regularized Regression - LASSO Regression
- Note: the predictors are centered and scaled first before the regression is run
as.matrix(x)
= the predictors must be in matrix/dataframe format
trace=TRUE
= prints progress of the lasso regression
-lasso$lambda
= return the \(\lambda\)s used for each step of the lasso regression
+lasso$lambda
= return the \(\lambda\)s used for each step of the lasso regression
plot(lasso)
= prints plot that shows the progression of the coefficients as they are set to zero one by one
predict.lars(lasso, test)
= use the lasso model to predict on test data
@@ -2338,14 +2415,14 @@ Regularized Regression - LASSO Regression
- [
enet
package] lasso<-enet(predictors, outcome, lambda = 0)
= perform elastic net regression on given predictors and outcome
-lambda=0
= default value for \(\lambda\)
+lambda=0
= default value for \(\lambda\)
- Note: lasso regression is a special case of elastic net regression, and forcing
lambda=0
tells the function to fit a lasso regression
plot(lasso)
= prints plot that shows the progression of the coefficients as they are set to zero one by one
predict.enet(lasso, test)
= use the lasso model to predict on test data
-- [
caret
package] train(outcome ~ predictors, data=training, method="lasso")
= perform lasso regression with given outcome and predictors
+ caret
package train(outcome ~ predictors, data=training, method="lasso")
= perform lasso regression with given outcome and predictors
- Note: outcome and predictors must be in the same dataframe
preProcess=c("center", "scale")
= centers and scales the predictors before the model is built
@@ -2353,7 +2430,7 @@ Regularized Regression - LASSO Regression
- Note: this is generally a good idea for building lasso regressions
-- [
caret
package] train(outcome~predictors,data=train,method="relaxo",lambda=5,phi=0.3)
= perform relaxed lasso regression on given predictors and outcome
+ caret
package train(outcome~predictors,data=train,method="relaxo",lambda=5,phi=0.3)
= perform relaxed lasso regression on given predictors and outcome
lambda=5
= tuning parameter
phi=0.3
= relaxation parameter
@@ -2362,11 +2439,11 @@ Regularized Regression - LASSO Regression
phi=0
computes the OLS estimates on the set of variables selected by the Lasso
-[caret
package] predict(model,test)
= use the model to predict on test set \(\rightarrow\) similar to all other caret
algorithms
+caret
package predict(model,test)
= use the model to predict on test set \(\rightarrow\) similar to all other caret
algorithms
example: lars
package
-# load lars package
+# load lars package
library(lars)
# perform lasso regression
lasso.fit <- lars(as.matrix(x), y, type="lasso", trace=TRUE)
@@ -2374,12 +2451,12 @@ Regularized Regression - LASSO Regression
plot(lasso.fit, breaks=FALSE, cex = 0.75)
# add legend
legend("topleft", covnames, pch=8, lty=1:length(covnames),
- col=1:length(covnames), cex = 0.6)
-
-# plots the cross validation curve
-lasso.cv <- cv.lars(as.matrix(x), y, K=10, type="lasso", trace=TRUE)
-
-\(\pagebreak\)
+ col=1:length(covnames), cex = 0.6)
+
+# plots the cross validation curve
+lasso.cv <- cv.lars(as.matrix(x), y, K=10, type="lasso", trace=TRUE)
+
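- as a hedged follow-up (not in the notes), the cross-validation object above can be used to pick the amount of shrinkage and extract the corresponding coefficients; `x.test` below is a hypothetical matrix of new predictor values

```r
# choose the fraction s with the smallest cross-validated error
best.s <- lasso.cv$index[which.min(lasso.cv$cv)]
# coefficients at that amount of shrinkage (some are exactly zero)
predict(lasso.fit, s=best.s, type="coefficients", mode="fraction")$coefficients
# predictions on new data at the same s (x.test is hypothetical)
# predict(lasso.fit, newx=as.matrix(x.test), s=best.s, type="fit", mode="fraction")$fit
```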
+\(\pagebreak\)
@@ -2437,7 +2514,7 @@ Example - Majority Vote
each has 70% accuracy
majority vote accuracy (mva) = probability that the majority of the models (at least 3 of the 5) are correct at the same time
-\[\begin{aligned}
+\[\begin{aligned}
\mbox{majority vote accuracy} & = p(3~correct,~2~wrong) + p(4~correct,~1~wrong) \\
&\qquad+ p(5~correct) \\
& = {5 \choose 3} \times(0.7)^3(0.3)^2 + {5 \choose 4} \times(0.7)^4(0.3)^1 + {5 \choose 5} (0.7)^5 \\
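- a one-line check of the calculation above in R (a sketch, using the same 5 independent classifiers with 70% accuracy each)

```r
# P(at least 3 of 5 independent classifiers are correct), each correct with prob 0.7
p <- 0.7
sum(dbinom(3:5, size=5, prob=p))                                   # ~0.837
# the same sum written out term by term
choose(5,3)*p^3*(1-p)^2 + choose(5,4)*p^4*(1-p) + choose(5,5)*p^5
```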
@@ -2450,7 +2527,7 @@ Example - Majority Vote
Example - Model Ensembling
-# set up data
+# set up data
inBuild <- createDataPartition(y=Wage$wage,p=0.7, list=FALSE)
validation <- Wage[-inBuild,]; buildData <- Wage[inBuild,]
inTrain <- createDataPartition(y=buildData$wage,p=0.7, list=FALSE)
@@ -2483,11 +2560,11 @@ Example - Model Ensembling
# validation data set RMSE Errors
validation = c(sqrt(sum((glm.pred.val-validation$wage)^2)),
sqrt(sum((rf.pred.val-validation$wage)^2)),
- sqrt(sum((comb.pred.val-validation$wage)^2))))
-## glm rf combined
-## test 858.7074 888.0702 849.3771
-## validation 1061.0891 1086.2027 1057.8264
-\(\pagebreak\)
+ sqrt(sum((comb.pred.val-validation$wage)^2))))
+ glm rf combined
+test 858.7074 888.5536 849.4654
+validation 1061.0891 1085.4773 1056.5813
+\(\pagebreak\)
@@ -2530,8 +2607,8 @@ Forecasting
approaches
-simple moving averages = prediction will be made for a time point by averaging together values from a number of prior periods \[ Y_{t}=\frac{1}{2*k+1}\sum_{j=-k}^k {y_{t+j}}\]
-- exponential smoothing/exponential moving average = weight time points that are closer to point of prediction than those that are further away \[\hat{y}_{t+1} = \alpha y_t + (1-\alpha)\hat{y}_{t-1}\]
+
simple moving averages = prediction for a time point is made by averaging together values from a window of neighboring periods \[ Y_{t}=\frac{1}{2*k+1}\sum_{j=-k}^k {y_{t+j}}\]
+- exponential smoothing/exponential moving average = weight time points that are closer to the point of prediction more heavily than those that are further away \[\hat{y}_{t+1} = \alpha y_t + (1-\alpha)\hat{y}_{t}\]
- Note: many different methods of exponential smoothing are available, more information can be found here
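- a minimal sketch (not from the notes) of the two formulas on a toy series; the series values and `alpha` are arbitrary assumptions

```r
y <- c(10, 12, 11, 13, 15, 14, 16, 18, 17, 19)
# centered simple moving average with k = 1 (window of 2k + 1 = 3 points)
k <- 1
sma <- sapply((k+1):(length(y)-k), function(t) mean(y[(t-k):(t+k)]))
# simple exponential smoothing: y.hat[t+1] = alpha*y[t] + (1-alpha)*y.hat[t]
alpha <- 0.5
y.hat <- numeric(length(y)); y.hat[1] <- y[1]
for (t in 1:(length(y)-1)) y.hat[t+1] <- alpha*y[t] + (1-alpha)*y.hat[t]
cbind(y, round(y.hat, 2))
```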
@@ -2568,7 +2645,7 @@ R Commands and Examples
ts(googOpen, frequency=12)
= convert data to a time series with frequency
observations per time unit
-frequency=12
= number of observations per unit time (12 in this case because there are 12 months in each year \(\rightarrow\) converts data into yearly time series)
+frequency=12
= number of observations per unit time (12 in this case because there are 12 months in each year \(\rightarrow\) converts data into yearly time series)
decompose(ts)
= decomposes time series into trend, seasonal, and irregular components by using moving averages
@@ -2607,25 +2684,25 @@ R Commands and Examples
Quandl
package is also used for finance-related predictions
example: decomposed time series
-# load quantmod package
+# load quantmod package
library(quantmod);
# specify to and from dates
from.dat <- as.Date("01/01/00", format="%m/%d/%y")
to.dat <- as.Date("3/2/15", format="%m/%d/%y")
# get data for AAPL from Google Finance for the specified dates
-getSymbols("AAPL", src="google", from = from.dat, to = to.dat)
-## [1] "AAPL"
-# convert the retrieved daily data to monthly data
+getSymbols("AAPL", src="google", from = from.dat, to = to.dat)
+[1] "AAPL"
+# convert the retrieved daily data to monthly data
mAAPL <- to.monthly(AAPL)
# extract the closing price and convert it to yearly time series (12 observations per year)
ts <- ts(Cl(mAAPL), frequency = 12)
# plot the decomposed parts of the time series
-plot(decompose(ts),xlab="Years")
-
+plot(decompose(ts),xlab="Years")
+
- example: forecast
-# load forecast library
+# load forecast library
library(forecast)
# find the number of rows (years)
rows <- ceiling(length(ts)/12)
@@ -2636,24 +2713,24 @@ R Commands and Examples
# plot the training set
plot(ts.train)
# add the moving average in red
-lines(ma(ts.train,order=3),col="red")
-
-# compute the exponential smoothing average
+lines(ma(ts.train,order=3),col="red")
+
+# compute the exponential smoothing average
ets <- ets(ts.train,model="MMM")
# construct a forecasting model using the exponential smoothing function
fcast <- forecast(ets)
# plot forecast and add actual data in red
-plot(fcast); lines(ts.test,col="red")
-
-# print the accuracy of the forecast model
-accuracy(fcast,ts.test)
-## ME RMSE MAE MPE MAPE MASE
-## Training set 0.1188298 2.825883 1.646959 -0.61217 10.68901 0.1924329
-## Test set -7.8132889 16.736910 15.079222 -13.64900 20.31005 1.7618772
-## ACF1 Theil's U
-## Training set 0.09773823 NA
-## Test set 0.84664431 3.360515
-\(\pagebreak\)
+plot(fcast); lines(ts.test,col="red")
+
+# print the accuracy of the forecast model
+accuracy(fcast,ts.test)
+ ME RMSE MAE MPE MAPE MASE
+Training set 0.3738734 2.576067 1.438703 0.2377946 10.38942 0.1759433
+Test set 0.7292502 18.507031 15.347659 -3.6663982 19.01227 1.8769111
+ ACF1 Theil's U
+Training set 0.1026281 NA
+Test set 0.8661382 3.200942
+\(\pagebreak\)
@@ -2700,7 +2777,7 @@ R Commands and Examples
cl_predict
function in clue
package provides similar functionality
-# load iris data
+# load iris data
data(iris)
# create training and test sets
inTrain <- createDataPartition(y=iris$Species,p=0.7, list=FALSE)
@@ -2715,8 +2792,8 @@ R Commands and Examples
ggtitle("Clusters Classification")
p2 <- qplot(Petal.Width,Petal.Length,colour=Species,data=training) +
ggtitle("Species Classification (Truth)")
-grid.arrange(p1, p2, ncol = 2)
-
+grid.arrange(p1, p2, ncol = 2)
+
- as we can see, there are three clear groups that emerge from the data
@@ -2724,51 +2801,57 @@ R Commands and Examples
- we can compare the results from the clustering and Species classification by tabulating the values
-# tabulate the results from clustering and actual species
-table(kMeans1$cluster,training$Species)
-##
-## setosa versicolor virginica
-## 1 35 0 0
-## 2 0 0 27
-## 3 0 35 8
+# tabulate the results from clustering and actual species
+table(kMeans1$cluster,training$Species)
+
+ setosa versicolor virginica
+ 1 0 2 27
+ 2 35 0 0
+ 3 0 33 8
- with the clusters determined, a classification model can be trained on the training data using all predictors, with the k-means clusters as the outcome
-# build classification trees using the k-means cluster
-clustering <- train(clusters ~.,data=subset(training,select=-c(Species)),method="rpart")
+# build classification trees using the k-means cluster
+clustering <- train(clusters ~.,data=subset(training,select=-c(Species)),method="rpart")
- we can compare the prediction results on training set vs truth
-# tabulate the prediction results on training set vs truth
-table(predict(clustering,training),training$Species)
-##
-## setosa versicolor virginica
-## 1 35 0 0
-## 2 0 0 29
-## 3 0 35 6
+# tabulate the prediction results on training set vs truth
+table(predict(clustering,training),training$Species)
+
+ setosa versicolor virginica
+ 1 0 0 24
+ 2 35 0 0
+ 3 0 35 11
- similarly, we can compare the prediction results on test set vs truth
-# tabulate the prediction results on test set vs truth
-table(predict(clustering,testing),testing$Species)
-##
-## setosa versicolor virginica
-## 1 15 0 0
-## 2 0 1 12
-## 3 0 14 3
+# tabulate the prediction results on test set vs truth
+table(predict(clustering,testing),testing$Species)
+
+ setosa versicolor virginica
+ 1 0 0 10
+ 2 15 0 0
+ 3 0 15 5
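- a hedged sketch of the `cl_predict` alternative mentioned above (assuming `cl_predict` accepts the `kmeans` fit and a data frame of the same predictors used for clustering)

```r
# assign test observations directly to the nearest k-means cluster, skipping the tree
library(clue)
testClusters <- cl_predict(kMeans1, subset(testing, select=-c(Species)))
# compare the direct cluster assignments with the true species labels
table(testClusters, testing$Species)
```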
+
+
diff --git a/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.pdf b/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.pdf
index eee0c0c..54eb951 100644
Binary files a/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.pdf and b/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.pdf differ