R-RerF/man/RerF.Rd at master · MrAE/R-RerF · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RerF.R
\name{RerF}
\alias{RerF}
\title{RerF forest Generator}
\usage{
RerF(X, Y, FUN = RandMatBinary, paramList = list(p = NA, d = NA,
  sparsity = NA, prob = NA), min.parent = 1L, trees = 500L,
  max.depth = 0, bagging = 0.2, replacement = TRUE,
  stratify = TRUE, rank.transform = FALSE, store.oob = FALSE,
  store.impurity = FALSE, progress = FALSE, rotate = FALSE,
  num.cores = 0L, seed = sample(0:1e+08, 1), cat.map = NULL,
  rfPack = FALSE)
}
\arguments{
\item{X}{an n by d numeric matrix (preferable) or data frame. The rows correspond to observations and columns correspond to features.}

\item{Y}{an n length vector of class labels.  Class labels must be integer or numeric and be within the range 1 to the number of classes.}

\item{FUN}{a function that creates the random projection matrix. If NULL and cat.map is NULL, then RandMat is used. If NULL and cat.map is not NULL, then RandMatCat is used, which adjusts the sampling of features when categorical features have been one-of-K encoded. If a custom function is to be used, then it must return a matrix in sparse representation, in which each nonzero is an array of the form (row.index, column.index, value). See RandMat or RandMatCat for details.}

\item{paramList}{parameters in a named list to be used by FUN. If left unchanged,
default values will be populated, see \code{\link[rerf]{defaults}} for details.}

\item{min.parent}{the minimum splittable node size.  A node size < min.parent will be a leaf node. (min.parent = 1)}

\item{trees}{the number of trees in the forest. (trees=500)}

\item{max.depth}{the longest allowable distance from the root of a tree to a leaf node (i.e. the maximum allowed height for a tree).  If max.depth=0, the tree will be allowed to grow without bound.  (max.depth=0)}

\item{bagging}{a non-zero value means a random sample of X will be used during tree creation.  If replacement = FALSE the bagging value determines the percentage of samples to leave out-of-bag.  If replacement = TRUE the non-zero bagging value is ignored. (bagging=.2)}

\item{replacement}{if TRUE then n samples are chosen, with replacement, from X. (replacement=TRUE)}

\item{stratify}{if TRUE then class sample proportions are maintained during the random sampling.  Ignored if replacement = FALSE. (stratify = FALSE).}

\item{rank.transform}{if TRUE then each feature is rank-transformed (i.e. smallest value becomes 1 and largest value becomes n) (rank.transform=FALSE)}

\item{store.oob}{if TRUE then the samples omitted during the creation of a tree are stored as part of the tree.  This is required to run OOBPredict(). (store.oob=FALSE)}

\item{store.impurity}{if TRUE then the decrease in impurity is stored for each split. This is required to run FeatureImportance() (store.impurity=FALSE)}

\item{progress}{if TRUE then a pipe is printed after each tree is created.  This is useful for large datasets. (progress=FALSE)}

\item{rotate}{if TRUE then the data matrix X is uniformly randomly rotated for each tree. (rotate=FALSE)}

\item{num.cores}{the number of cores to use while training. If num.cores=0 then 1 less than the number of cores reported by the OS are used. (num.cores=0)}

\item{seed}{the seed to use for training the forest.  For two runs to match you must use the same seed for each run AND you must also use the same number of cores for each run. (seed=sample((0:100000000,1)))}

\item{cat.map}{a list specifying which columns in X correspond to the same one-of-K encoded feature. Each element of cat.map is a numeric vector specifying the K column indices of X corresponding to the same categorical feature after one-of-K encoding. All one-of-K encoded features in X must come after the numeric features. The K encoded columns corresponding to the same categorical feature must be placed contiguously within X. The reason for specifying cat.map is to adjust for the fact that one-of-K encoding cateogorical features results in a dilution of numeric features, since a single categorical feature is expanded to K binary features. If cat.map = NULL, then RerF assumes all features are numeric (i.e. none of the features have been one-of-K encoded).}

\item{rfPack}{boolean flag to determine whether to pack a random forest in order to improve prediction speed.  This flag is only applicable when training a forest with the "rf" option.  (rfPack = FALSE)}
}
\value{
forest
}
\description{
Creates a decision forest based on an input matrix and class vector.  This is the main function in the rerf package.
}
\examples{
### Train RerF on numeric data ###
library(rerf)
forest <- RerF(as.matrix(iris[, 1:4]), iris[[5L]], num.cores = 1L)

### Train RerF on one-of-K encoded categorical data ###
df1 <- as.data.frame(Titanic)
nc <- ncol(df1)
df2 <- df1[NULL, -nc]
for (i in which(df1$Freq != 0L)) {
  df2 <- rbind(df2, df1[rep(i, df1$Freq[i]), -nc])
}
n <- nrow(df2) # number of observations
p <- ncol(df2) - 1L # number of features
num.categories <- apply(df2[, 1:p], 2, function(x) length(unique(x)))
p.enc <- sum(num.categories) # number of features after one-of-K encoding
X <- matrix(0, nrow = n, ncol = p.enc) # initialize training data matrix X
cat.map <- vector("list", p)
col.idx <- 0L
# one-of-K encode each categorical feature and store in X
for (j in 1:p) {
  cat.map[[j]] <- (col.idx + 1L):(col.idx + num.categories[j])
  # convert categorical feature to K dummy variables
  X[, cat.map[[j]]] <- dummies::dummy(df2[[j]])
  col.idx <- col.idx + num.categories[j]
}
Y <- df2$Survived

# specifying the cat.map in RerF allows training to
# be aware of which dummy variables correspond
# to the same categorical feature
forest <- RerF(X, Y, num.cores = 1L, cat.map = cat.map)
\dontrun{
# takes longer than 5s to run.
# adding a continuous feature along with the categorical features
# must be prepended to the categorical features.
set.seed(1234)
xp <- runif(nrow(X))
Xp <- cbind(xp, X)
cat.map1 <- lapply(cat.map, function(x) x + 1)
forestW <- RerF(Xp, Y, num.cores = 1L, cat.map = cat.map1)
}

### Train a random rotation ensemble of CART decision trees (see Blaser and Fryzlewicz 2016)
forest <- RerF(as.matrix(iris[, 1:4]), iris[[5L]],
  num.cores = 1L,
  FUN = RandMatRF, paramList = list(p = 4, d = 2), rotate = TRUE
)
}