SVRBOD/Script 3 at master · xcmarvin/SVRBOD · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
library(caret)
library(e1071)
library(hydroGOF)

# Data import and normalisation -------------------------------------------
trainData <- read.csv(filename, header = TRUE)
testData <- read.csv(filename, header = TRUE)
fullData <- rbind(trainData, testData)
minY <- min(fullData$y)
maxY <- max(fullData$y)
preFullData <- preProcess(fullData, method = "range")
newFullData <- predict(preFullData, fullData)
newTrainData <- newFullData[1:528, ]
newTestData <- newFullData[529:658, ]

# Model performance for original training set -----------------------------
output <- function(trainData, testData, newTrainData, newTestData) {
  model <- svm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 + x24 + x25 + x26 + x27 + x28 + x29 + x30 + x31 + x32 + x33 + x34 + x35 + x36 + x37 + x38 + x39 + x40, newTrainData, cost = 4, epsilon = 0.04, gamma = 0.03, scale = FALSE)
  trainY <- predict(model, newTrainData)
  testY <- predict(model, newTestData)
  trainY <- trainY * (maxY - minY) + minY
  testY <- testY * (maxY - minY) + minY
  trainError <- trainY - trainData$y
  testError <- testY - testData$y
  trainOutput <- data.frame(trainData$y, trainY, trainError)
  testOutput <- data.frame(testData$y, testY, testError)
  lowTrainOutput <- subset(trainOutput, trainData$y < 150)
  highTrainOutput <- subset(trainOutput, trainData$y >= 150)
  lowTestOutput <- subset(testOutput, testData$y < 150)
  highTestOutput <- subset(testOutput, testData$y >= 150)
  trainmae <- c(mean(abs(trainOutput$trainError)), mean(abs(lowTrainOutput$trainError)), mean(abs(highTrainOutput$trainError)))
  testmae <- c(mean(abs(testOutput$testError)), mean(abs(lowTestOutput$testError)), mean(abs(highTestOutput$testError)))
  trainbias <- c(mean(trainOutput$trainError), mean(lowTrainOutput$trainError), mean(highTrainOutput$trainError))
  testbias <- c(mean(testOutput$testError), mean(lowTestOutput$testError), mean(highTestOutput$testError))
  trainr2 <- c((cor(trainOutput)[2,1])^2, (cor(lowTrainOutput)[2,1])^2, (cor(highTrainOutput)[2,1])^2)
  testr2 <- c((cor(testOutput)[2,1])^2, (cor(lowTestOutput)[2,1])^2, (cor(highTestOutput)[2,1])^2)
  trainnse <- c(NSE(trainY, trainData$y), NSE(lowTrainOutput$trainY, lowTrainOutput$trainData.y), NSE(highTrainOutput$trainY, highTrainOutput$trainData.y))
  testnse <- c(NSE(testY, testData$y), NSE(lowTestOutput$testY, lowTestOutput$testData.y), NSE(highTestOutput$testY, highTestOutput$testData.y))
  positiveLowCoeff <- 0
  negativeLowCoeff <- 0
  positiveHighCoeff <- 0
  negativeHighCoeff <- 0
  countPositiveLowCoeff <- 0
  countNegativeLowCoeff <- 0
  countPositiveHighCoeff <- 0
  countNegativeHighCoeff <- 0
  listCoefs <- matrix(nrow = length(model$index), ncol = 2)
  for (i in 1:length(model$index)) {
    if (trainData$y[model$index[i]] < 150) {
      if (model$coefs[i] > 0) {
        positiveLowCoeff <- positiveLowCoeff + model$coefs[i]
        countPositiveLowCoeff <- countPositiveLowCoeff + 1
        listCoefs[i, 1] <- trainData$y[model$index[i]]
        listCoefs[i, 2] <- 1
      } else {
        negativeLowCoeff <- negativeLowCoeff + model$coefs[i]
        countNegativeLowCoeff <- countNegativeLowCoeff + 1
        listCoefs[i, 1] <- trainData$y[model$index[i]]
        listCoefs[i, 2] <- -1
      }
    } else {
      if (model$coefs[i] > 0) {
        positiveHighCoeff <- positiveHighCoeff + model$coefs[i]
        countPositiveHighCoeff <- countPositiveHighCoeff + 1
        listCoefs[i, 1] <- trainData$y[model$index[i]]
        listCoefs[i, 2] <- 1
      } else {
        negativeHighCoeff <- negativeHighCoeff + model$coefs[i]
        countNegativeHighCoeff <- countNegativeHighCoeff + 1
        listCoefs[i, 1] <- trainData$y[model$index[i]]
        listCoefs[i, 2] <- -1
      }
    }
  }
  sumOfAlpha <- c(positiveLowCoeff, negativeLowCoeff, positiveHighCoeff, negativeHighCoeff)
  numberOfSV <- c(countPositiveLowCoeff, countNegativeLowCoeff, countPositiveHighCoeff, countNegativeHighCoeff)
  list(trainOutput = trainOutput, testOutput = testOutput, trainmae = trainmae, testmae = testmae, trainbias = trainbias, testbias = testbias, trainr2 = trainr2, testr2 = testr2, trainnse = trainnse, testnse = testnse, listCoefs = listCoefs, sumOfAlpha = sumOfAlpha, numberOfSV = numberOfSV)
}

modelStatResult <- output(trainData, testData, newTrainData, newTestData)

# Model performances for different training sets --------------------------
highData <- subset(newTrainData, y >= (150 - minY) / (maxY - minY))

diffOutput <- function(highData, testData, newTestData, n) {
  for (i in 1:n) {
    newTrainData <- rbind(newTrainData, highData)
  }
  model <- svm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + x22 + x23 + x24 + x25 + x26 + x27 + x28 + x29 + x30 + x31 + x32 + x33 + x34 + x35 + x36 + x37 + x38 + x39 + x40, newTrainData, cost = 4, epsilon = 0.04, gamma = 0.03, scale = FALSE)
  trainY <- predict(model, newTrainData)
  testY <- predict(model, newTestData)
  actualTrainY <- newTrainData$y
  actualTestY <- testData$y
  trainY <- trainY * (maxY - minY) + minY
  testY <- testY * (maxY - minY) + minY
  actualTrainY <- actualTrainY * (maxY - minY) + minY
  trainError <- trainY - actualTrainY
  testError <- testY - actualTestY
  trainOutput <- data.frame(actualTrainY, trainY, trainError)
  testOutput <- data.frame(actualTestY, testY, testError)
  lowTrainOutput <- subset(trainOutput, actualTrainY < 150)
  highTrainOutput <- subset(trainOutput, actualTrainY >= 150)
  lowTestOutput <- subset(testOutput, testData$y < 150)
  highTestOutput <- subset(testOutput, testData$y >= 150)
  trainmae <- c(mean(abs(trainOutput$trainError)), mean(abs(lowTrainOutput$trainError)), mean(abs(highTrainOutput$trainError)))
  testmae <- c(mean(abs(testOutput$testError)), mean(abs(lowTestOutput$testError)), mean(abs(highTestOutput$testError)))
  trainbias <- c(mean(trainOutput$trainError), mean(lowTrainOutput$trainError), mean(highTrainOutput$trainError))
  testbias <- c(mean(testOutput$testError), mean(lowTestOutput$testError), mean(highTestOutput$testError))
  trainr2 <- c((cor(trainOutput)[2,1])^2, (cor(lowTrainOutput)[2,1])^2, (cor(highTrainOutput)[2,1])^2)
  testr2 <- c((cor(testOutput)[2,1])^2, (cor(lowTestOutput)[2,1])^2, (cor(highTestOutput)[2,1])^2)
  trainnse <- c(NSE(trainY, actualTrainY), NSE(lowTrainOutput$trainY, lowTrainOutput$actualTrainY), NSE(highTrainOutput$trainY, highTrainOutput$actualTrainY))
  testnse <- c(NSE(testY, actualTestY), NSE(lowTestOutput$testY, lowTestOutput$actualTestY), NSE(highTestOutput$testY, highTestOutput$actualTestY))
  positiveLowCoeff <- 0
  negativeLowCoeff <- 0
  positiveHighCoeff <- 0
  negativeHighCoeff <- 0
  countPositiveLowCoeff <- 0
  countNegativeLowCoeff <- 0
  countPositiveHighCoeff <- 0
  countNegativeHighCoeff <- 0
  listCoefs <- matrix(, nrow = length(model$index), ncol = 2)
  for (i in 1:length(model$index)) {
    if (actualTrainY[model$index[i]] < 150) {
      if (model$coefs[i] > 0) {
        positiveLowCoeff <- positiveLowCoeff + model$coefs[i]
        countPositiveLowCoeff <- countPositiveLowCoeff + 1
        listCoefs[i, 1] <- actualTrainY[model$index[i]]
        listCoefs[i, 2] <- 1
      } else {
        negativeLowCoeff <- negativeLowCoeff + model$coefs[i]
        countNegativeLowCoeff <- countNegativeLowCoeff + 1
        listCoefs[i, 1] <- actualTrainY[model$index[i]]
        listCoefs[i, 2] <- -1
      }
    } else {
      if (model$coefs[i] > 0) {
        positiveHighCoeff <- positiveHighCoeff + model$coefs[i]
        countPositiveHighCoeff <- countPositiveHighCoeff + 1
        listCoefs[i, 1] <- actualTrainY[model$index[i]]
        listCoefs[i, 2] <- 1
      } else {
        negativeHighCoeff <- negativeHighCoeff + model$coefs[i]
        countNegativeHighCoeff <- countNegativeHighCoeff + 1
        listCoefs[i, 1] <- actualTrainY[model$index[i]]
        listCoefs[i, 2] <- -1
      }
    }
  }
  sumOfAlpha <- c(positiveLowCoeff, negativeLowCoeff, positiveHighCoeff, negativeHighCoeff)
  numberOfSV <- c(countPositiveLowCoeff, countNegativeLowCoeff, countPositiveHighCoeff, countNegativeHighCoeff)
  list(trainOutput = trainOutput, testOutput = testOutput, trainmae = trainmae, testmae = testmae, trainbias = trainbias, testbias = testbias, trainr2 = trainr2, testr2 = testr2, trainnse = trainnse, testnse = testnse, listCoefs = listCoefs, sumOfAlpha = sumOfAlpha, numberOfSV = numberOfSV)
}

modelStatResult10 <- diffOutput(highData, testData, newTestData, 3)
modelStatResult20 <- diffOutput(highData, testData, newTestData, 7)
modelStatResult30 <- diffOutput(highData, testData, newTestData, 13)
modelStatResult40 <- diffOutput(highData, testData, newTestData, 20)
modelStatResult50 <- diffOutput(highData, testData, newTestData, 31)

# Store results for training and test performances in csv files -----------
write.table(modelStatResult$trainOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult$listCoefs, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult$testOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult10$trainOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult10$listCoefs, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult10$testOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult20$trainOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult20$listCoefs, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult20$testOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult30$trainOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult30$listCoefs, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult30$testOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult40$trainOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult40$listCoefs, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult40$testOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult50$trainOutput, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult50$listCoefs, file = filename, sep = ",", row.names = FALSE)
write.table(modelStatResult50$testOutput, file = filename, sep = ",", row.names = FALSE)

combModelStatMAE <- cbind(modelStatResult$trainmae, modelStatResult10$trainmae, modelStatResult20$trainmae, modelStatResult30$trainmae, modelStatResult40$trainmae, modelStatResult50$trainmae, modelStatResult$testmae, modelStatResult10$testmae, modelStatResult20$testmae, modelStatResult30$testmae, modelStatResult40$testmae, modelStatResult50$testmae)
rownames(combModelStatMAE) <- c("overall", "low", "high")
colnames(combModelStatMAE) <- c("train", "train10", "train20", "train30", "train40", "train50", "test", "test10", "test20", "test30", "test40", "test50")
write.table(combModelStatMAE, file = filename, sep = ",")

combModelStatBias <- cbind(modelStatResult$trainbias, modelStatResult10$trainbias, modelStatResult20$trainbias, modelStatResult30$trainbias, modelStatResult40$trainbias, modelStatResult50$trainbias, modelStatResult$testbias, modelStatResult10$testbias, modelStatResult20$testbias, modelStatResult30$testbias, modelStatResult40$testbias, modelStatResult50$testbias)
rownames(combModelStatBias) <- c("overall", "low", "high")
colnames(combModelStatBias) <- c("train", "train10", "train20", "train30", "train40", "train50", "test", "test10", "test20", "test30", "test40", "test50")
write.table(combModelStatBias, file = filename, sep = ",")

combModelStatR2 <- cbind(modelStatResult$trainr2, modelStatResult10$trainr2, modelStatResult20$trainr2, modelStatResult30$trainr2, modelStatResult40$trainr2, modelStatResult50$trainr2, modelStatResult$testr2, modelStatResult10$testr2, modelStatResult20$testr2, modelStatResult30$testr2, modelStatResult40$testr2, modelStatResult50$testr2)
rownames(combModelStatR2) <- c("overall", "low", "high")
colnames(combModelStatR2) <- c("train", "train10", "train20", "train30", "train40", "train50", "test", "test10", "test20", "test30", "test40", "test50")
write.table(combModelStatR2, file = filename, sep = ",")

combModelStatNSE <- cbind(modelStatResult$trainnse, modelStatResult10$trainnse, modelStatResult20$trainnse, modelStatResult30$trainnse, modelStatResult40$trainnse, modelStatResult50$trainnse, modelStatResult$testnse, modelStatResult10$testnse, modelStatResult20$testnse, modelStatResult30$testnse, modelStatResult40$testnse, modelStatResult50$testnse)
rownames(combModelStatNSE) <- c("overall", "low", "high")
colnames(combModelStatNSE) <- c("train", "train10", "train20", "train30", "train40", "train50", "test", "test10", "test20", "test30", "test40", "test50")
write.table(combModelStatNSE, file = filename, sep = ",")

combModelSV <- cbind(modelStatResult$sumOfAlpha, modelStatResult10$sumOfAlpha, modelStatResult20$sumOfAlpha, modelStatResult30$sumOfAlpha, modelStatResult40$sumOfAlpha, modelStatResult50$sumOfAlpha, modelStatResult$numberOfSV, modelStatResult10$numberOfSV, modelStatResult20$numberOfSV, modelStatResult30$numberOfSV, modelStatResult40$numberOfSV, modelStatResult50$numberOfSV)
rownames(combModelSV) <- c("positivelowcoeff", "negativelowcoeff", "positivehighcoeff", "negativehighcoeff")
colnames(combModelSV) <- c("trainsum", "train10sum", "train20sum", "train30sum", "train40sum", "train50sum", "traincount", "train10count", "train20count", "train30count", "train40count", "train50count")
write.table(combModelSV, file = filename, sep = ",")