# arma_garch.py
import csv
import os
import sys
import datetime
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
import sklearn as sk
import subprocess
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
from arch import arch_model
from numpy import genfromtxt
from operator import add, sub
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics import mean_squared_error
p = print
#The arma_garch class performs anomaly detection on the given data, computes evaluation scores and plots the results.
class arma_garch():

    #Fits an ARIMA(p,d,q)-GARCH(i,j) model to each series in data and then forecasts the last 5 points of each.
    def ARIMA_GARCH(self, data, dataType, dataSize):
        X = data
        #Perform an Augmented Dickey-Fuller test to check for stationarity.
        result = smt.adfuller(X[0])
        pvalue = result[1]
        if pvalue < 0.2:
            print('p-value = ' + str(pvalue) + ' The series is likely stationary.')
            differentiation = "None"
        else:
            print('p-value = ' + str(pvalue) + ' The series is likely NON-stationary.')
            #differentiation = "Once"
            differentiation = "None"
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print('Critical Values:')
        for key, value in result[4].items():
            print('\t%s: %.3f' % (key, value))
        #Start a subprocess running ./arima_garch.r, an R script that uses the rugarch package to perform anomaly detection with an ARMA-GARCH model.
        nr_of_series = len(data)
        print("Starting subprocess")
        subprocess.call(["Rscript", "--vanilla", "./arima_garch.r", str(dataType), str(dataSize), str(nr_of_series), differentiation])
        print("Ended subprocess")
        #Load the results that the R program wrote to file.
        forecasts_mean = genfromtxt('forecasts_mean_' + dataType + "_" + dataSize + '.csv', skip_header=0, delimiter=' ', dtype=float)
        forecasts_variance = genfromtxt('forecasts_variance_' + dataType + "_" + dataSize + '.csv', skip_header=0, delimiter=' ', dtype=float)
        #Correct for NaNs and zeros by carrying the previous value forward.
        for row in range(len(forecasts_mean)):
            for col in range(len(forecasts_mean[row])):
                if forecasts_mean[row][col] == 0 and col > 0:
                    forecasts_mean[row][col] = forecasts_mean[row][col-1]
                if np.isnan(forecasts_variance[row][col]):
                    if col > 0:
                        forecasts_variance[row][col] = forecasts_variance[row][col-1]
                    else:
                        forecasts_variance[row][col] = 1
        return forecasts_mean, forecasts_variance
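
    #Hypothetical alternative (a sketch, not the pipeline this class uses): the
    #`arch` package imported above can fit an ARMA-GARCH in pure Python instead
    #of shelling out to the R script. The model choices below are assumptions.
    @staticmethod
    def arch_forecast_sketch(series, horizon=5):
        #AR(1) conditional mean with a GARCH(1,1) conditional variance.
        model = arch_model(series, mean='AR', lags=1, vol='GARCH', p=1, q=1)
        fitted = model.fit(disp='off')
        forecast = fitted.forecast(horizon=horizon)
        #forecast.mean and forecast.variance are DataFrames; the last row holds
        #the horizon-step-ahead forecasts made from the final observation.
        return forecast.mean.values[-1], forecast.variance.values[-1]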
    #Expects a matrix of datapoints (one series per row).
    def runAnomalyDetection(self, D, D_truth, D_unpolluted, dataType, dataSize):
        data = D
        α = 4
        FP = 0.
        FN = 0.
        TP = 0.
        TN = 0.
        FN_ALO = 0.
        TP_ALO = 0.
        FN_LS = 0.
        TP_LS = 0.
        FN_TC = 0.
        TP_TC = 0.
        FN_LTO = 0.
        TP_LTO = 0.
        FN_SALO = 0.
        TP_SALO = 0.
        reaction_counter = 0.
        tot_reaction = 0.
        startup_period = int(0.1*len(D[0]))
        print("Running ARIMA-GARCH-AD")
        guessed_anomalies = np.zeros((len(D), len(D[0])))
        forecasts_mean, forecasts_variance = self.ARIMA_GARCH(D, dataType, dataSize)
        all_preds = np.zeros((len(D), len(D[0])))
        all_residuals = np.zeros((len(D), len(D[0])))
        for i in range(len(D)):
            #Make guesses: flag points outside the control band around the forecast.
            for t in range(startup_period, len(D[i])):
                UCL = forecasts_mean[i][t-startup_period] + α * forecasts_variance[i][t-startup_period]
                LCL = forecasts_mean[i][t-startup_period] - α * forecasts_variance[i][t-startup_period]
                if UCL < D[i][t] or D[i][t] < LCL:
                    guessed_anomalies[i][t] = 1 #This is an anomaly guess
            #Evaluate guesses
            in_anomaly_window = False
            anomaly_window_type = 0
            has_flagged_anomaly_window = False
            for t in range(1, len(D[i])):
                #Exiting an anomaly window. Check if it has been flagged.
                if D_truth[i][t] == 0 and in_anomaly_window:
                    in_anomaly_window = False
                    if not has_flagged_anomaly_window:
                        FN += 1 #Failed to flag the entire anomaly window.
                        if int(anomaly_window_type) == 1:
                            FN_ALO += 1
                        elif int(anomaly_window_type) == 2:
                            FN_LS += 1
                        elif int(anomaly_window_type) == 3:
                            FN_TC += 1
                        elif int(anomaly_window_type) == 4:
                            FN_LTO += 1
                        elif int(anomaly_window_type) == 5:
                            FN_SALO += 1
                        else:
                            print("ERROR: ANOMALY WINDOW HAD NO TYPE!")
                            print(int(anomaly_window_type))
                    else:
                        has_flagged_anomaly_window = False
                    reaction_counter = 0.
                    anomaly_window_type = 0
                #Entering an anomaly window.
                if D_truth[i][t] != 0 and not in_anomaly_window:
                    in_anomaly_window = True
                    anomaly_window_type = D_truth[i][t]
                if guessed_anomalies[i][t] == 0 and not in_anomaly_window:
                    TN += 1 #Correct to not guess an anomaly.
                if guessed_anomalies[i][t] != 0 and in_anomaly_window and not has_flagged_anomaly_window:
                    TP += 1 #Correct guess, within anomaly window.
                    tot_reaction += reaction_counter #Add reaction counter to the average.
                    reaction_counter = 0.
                    has_flagged_anomaly_window = True
                    if int(anomaly_window_type) == 1:
                        TP_ALO += 1
                    elif int(anomaly_window_type) == 2:
                        TP_LS += 1
                    elif int(anomaly_window_type) == 3:
                        TP_TC += 1
                    elif int(anomaly_window_type) == 4:
                        TP_LTO += 1
                    elif int(anomaly_window_type) == 5:
                        TP_SALO += 1
                    else:
                        print("ERROR: ANOMALY WINDOW HAD NO TYPE!")
                        print(int(anomaly_window_type))
                if guessed_anomalies[i][t] != 0 and not in_anomaly_window:
                    FP += 1 #Erroneous guess, outside anomaly window.
                if guessed_anomalies[i][t] == 0 and in_anomaly_window and not has_flagged_anomaly_window:
                    reaction_counter += 1 #Failed to react to an anomaly present in the current timestep.
        print(" ")
        print("|" + "TP: " + str(TP) + "|TN: " + str(TN) + "|" + "FP: " + str(FP) + "|" + "FN: " + str(FN) + "|")
        if TP == 0:
            print("Something is wrong: no TPs.")
            TP = 0.0000001
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        avg_reaction = tot_reaction/(TP)
        F1 = 2 * (precision * recall) / (precision + recall)
        print("Anomalies: " + str(TP + FN))
        print("Guesses: " + str(TP + FP))
        print("precision: " + str(precision))
        print("recall: " + str(recall))
        print("F1: " + str(F1))
        print("Reaction: " + str(avg_reaction))
        print("TP_ALO: " + str(TP_ALO))
        print("FN_ALO: " + str(FN_ALO))
        print("TP_LS: " + str(TP_LS))
        print("FN_LS: " + str(FN_LS))
        print("TP_TC: " + str(TP_TC))
        print("FN_TC: " + str(FN_TC))
        print("TP_LTO: " + str(TP_LTO))
        print("FN_LTO: " + str(FN_LTO))
        print("TP_SALO: " + str(TP_SALO))
        print("FN_SALO: " + str(FN_SALO))
        with open(str(dataType) + "_" + str(dataSize) + '_result_arma.txt', 'a') as the_result_file:
            the_result_file.write("\n\n\n")
            the_result_file.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))
            the_result_file.write("\n")
            the_result_file.write("|" + "TP: " + str(TP) + "|TN: " + str(TN) + "|" + "FP: " + str(FP) + "|" + "FN: " + str(FN) + "|")
            the_result_file.write("\nAnomalies: " + str(TP + FN))
            the_result_file.write("\nGuesses: " + str(TP + FP))
            the_result_file.write("\nprecision: " + str(precision))
            the_result_file.write("\nrecall: " + str(recall))
            the_result_file.write("\nF1: " + str(F1))
            the_result_file.write("\nReaction: " + str(avg_reaction))
            the_result_file.write("\nTP_ALO: " + str(TP_ALO))
            the_result_file.write("\nFN_ALO: " + str(FN_ALO))
            the_result_file.write("\nTP_LS: " + str(TP_LS))
            the_result_file.write("\nFN_LS: " + str(FN_LS))
            the_result_file.write("\nTP_TC: " + str(TP_TC))
            the_result_file.write("\nFN_TC: " + str(FN_TC))
            the_result_file.write("\nTP_LTO: " + str(TP_LTO))
            the_result_file.write("\nFN_LTO: " + str(FN_LTO))
            the_result_file.write("\nTP_SALO: " + str(TP_SALO))
            the_result_file.write("\nFN_SALO: " + str(FN_SALO))
        return guessed_anomalies, forecasts_mean, forecasts_variance
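
    #Worked example of the scores above with hypothetical counts TP=8, FP=2,
    #FN=4: precision = 8/10 = 0.8, recall = 8/12 ≈ 0.667, and
    #F1 = 2*0.8*0.667/(0.8+0.667) ≈ 0.727.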
    #Diagnostic plots: the series itself plus ACF, PACF, QQ and probability plots.
    @staticmethod
    def tsplot(y, lags=None, figsize=(10, 8), style='bmh'):
        print("In tsplot")
        if not isinstance(y, pd.Series):
            y = pd.Series(y)
        with plt.style.context(style):
            fig = plt.figure(figsize=figsize)
            #mpl.rcParams['font.family'] = 'Ubuntu Mono'
            layout = (3, 2)
            ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
            acf_ax = plt.subplot2grid(layout, (1, 0))
            pacf_ax = plt.subplot2grid(layout, (1, 1))
            qq_ax = plt.subplot2grid(layout, (2, 0))
            pp_ax = plt.subplot2grid(layout, (2, 1))
            y.plot(ax=ts_ax)
            ts_ax.set_title('Time Series Analysis Plots')
            smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.5)
            smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.5)
            sm.qqplot(y, line='s', ax=qq_ax)
            qq_ax.set_title('QQ Plot')
            scs.probplot(y, sparams=(y.mean(), y.std()), plot=pp_ax)
            plt.tight_layout()
            plt.show()
        return
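
    #Example use (hypothetical): diagnostic plots for 500 points of white noise:
    #  arma_garch.tsplot(np.random.randn(500), lags=30)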
    #Finds parameters p, d, q for an ARIMA(p, d, q) that minimizes the MLE-BIC for the data series X.
    #The search is capped so that the model never tries more parameters than #obs/#params = 5 allows.
    @staticmethod
    def find_best_order(X):
        best_bic = np.inf
        best_order = None
        best_mdl = None
        max_p = 1
        max_d = 2
        max_q = 1
        #Grow the grid-search range as far as the data allows while convergence stays likely.
        while len(X) > 5*(max_p + max_d + max_q) and max_p < 10:
            max_p += 1
            max_q += 1
        p_rng = range(max_p) # [0, 1, 2,..., max_p-1]
        d_rng = range(max_d) # [0, 1]
        q_rng = range(max_q) # [0, 1, 2,..., max_q-1]
        for i in p_rng:
            for d in d_rng:
                for j in q_rng:
                    try:
                        tmp_mdl = smt.ARIMA(X, order=(i,d,j)).fit(method='mle', disp=0, trend='nc')
                        tmp_bic = tmp_mdl.bic
                        if tmp_bic < best_bic:
                            best_bic = tmp_bic
                            best_order = (i, d, j)
                            best_mdl = tmp_mdl
                    except Exception:
                        continue
        p('bic: {:6.5f} | order: {}'.format(best_bic, best_order))
        # e.g. bic: -11518.22902 | order: (4, 0, 4)
        # ARIMA model resid plot
        #_ = tsplot(best_mdl.resid, lags=30)
        print("best_order: " + str(best_order))
        return best_order
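
    #Example use (hypothetical; assumes the legacy statsmodels smt.ARIMA API
    #that the grid search above targets):
    #  order = arma_garch.find_best_order(np.random.randn(200))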
    #Plots the forecasts, control bands and guessed vs. true anomalies for up to 10 series.
    def plotAnomalyDetection(self, D, D_truth, D_unpolluted, D_guesses, D_means, D_vars):
        for row in range(min(len(D), 10)):
            plt.figure(1)
            plt.subplot(211)
            x = range(len(D_means[0]))
            line_mean, = plt.plot(D_means[row], 'g', label='Conditional Mean')
            plt.fill_between(x, (D_means[row] - 3*D_vars[row]), (D_means[row] + 3*D_vars[row]), facecolor='green', alpha=0.4)
            line_actual, = plt.plot(D[row, int(0.1*len(D[row])):], 'r', label='True Values')
            line_unpolluted, = plt.plot(D_unpolluted[row, int(0.1*len(D[row])):], label='True Values without Anomalies')
            #The dot handles only exist if the corresponding points occur, so default them to None.
            dot_wrong = dot_actual = dot_correct = None
            all_gi = np.argwhere(D_guesses[row] != 0) #all guesses
            all_ai = np.argwhere(D_truth[row] != 0) #all actual anomalies
            for gi in all_gi:
                dot_wrong, = plt.plot(gi-int(0.1*len(D[row])), D[row][gi], 'yo', label='Incorrect Guess')
                #plt.axvspan(ai-0.1, ai+0.1, facecolor='#990000', alpha=0.8)
            for ai in all_ai:
                dot_actual, = plt.plot(ai-int(0.1*len(D[row])), D[row][ai], 'ro', label='Anomaly')
                if ai in all_gi:
                    dot_correct, = plt.plot(ai-int(0.1*len(D[row])), D[row][ai], 'ko', label='Correct Guess')
            handles = [h for h in (line_mean, line_actual, line_unpolluted, dot_wrong, dot_actual, dot_correct) if h is not None]
            plt.legend(handles=handles)
            plt.subplot(212)
            line_variance, = plt.plot(D_vars[row], 'orange', label='Conditional Variance')
            plt.legend(handles=[line_variance])
            plt.show()
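
#A minimal driver sketch (hypothetical, not part of the original module). It
#assumes ./arima_garch.r is present and writes the forecasts_mean_*/
#forecasts_variance_* CSVs that ARIMA_GARCH reads back; the synthetic data,
#anomaly positions and the 'synthetic'/'small' labels are assumptions.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    n_series, n_points = 3, 200
    D = rng.randn(n_series, n_points)
    D_unpolluted = D.copy()
    D_truth = np.zeros((n_series, n_points))
    D_truth[:, 150:155] = 1  #a short type-1 (ALO) anomaly window per series
    D[:, 150:155] += 10.  #inject the corresponding spikes into the polluted data
    detector = arma_garch()
    guesses, means, variances = detector.runAnomalyDetection(D, D_truth, D_unpolluted, 'synthetic', 'small')
    detector.plotAnomalyDetection(D, D_truth, D_unpolluted, guesses, means, variances)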