-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy patharima.py
223 lines (192 loc) · 6.96 KB
/
arima.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# Orignal author: Siddhant Ray
import argparse
import time as t
import warnings
from datetime import datetime
from math import sqrt
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from generate_sequences import generate_ARIMA_delay_data
from sklearn.metrics import mean_squared_error
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.tsa.arima.model import ARIMA
NUMBOTTLECKS = 1
def run_arima():
delay_data = generate_ARIMA_delay_data(NUMBOTTLECKS)
targets, predictions = [], []
warnings.simplefilter("ignore", ConvergenceWarning)
# count = 0
# We want minimum 1023 for the first ARIMA prediction (size of the window)
# Make this 29990 -> 9990 for the 10000 history window ARIMA
for value in range(1023, int(delay_data.shape[0] / 116) + 29990):
# We want to predict the next value
# Fit the model
model = ARIMA(delay_data[:value], order=(1, 1, 2))
model_fit = model.fit()
yhat = model_fit.forecast(steps=1)
targets.append(delay_data[value])
predictions.append(yhat)
# count+=1
# print(count)
return targets, predictions
def evaluate_arima(targets, predictions):
mse = mean_squared_error(targets, predictions)
squared_error = np.square(targets - predictions)
return squared_error, mse
if __name__ == "__main__":
args = argparse.ArgumentParser()
args.add_argument("--run", type=bool, default=False)
args = args.parse_args()
if args.run:
print("Started ARIMA at:")
time = datetime.now()
print(time)
targets, predictions = run_arima()
## MSE calculation
mse = mean_squared_error(targets, predictions)
print(mse)
print("Finished ARIMA at:")
time = datetime.now()
print(time)
# Save the results
df = pd.DataFrame({"Targets": targets, "Predictions": predictions})
df.to_csv("memento_data/ARIMA_30000.csv", index=False)
else:
print("ARIMA load results from file")
df = pd.read_csv("memento_data/ARIMA_30000.csv")
targets = df["Targets"]
predictions = (
df["Predictions"].str.split(" ").str[4].str.split("\n").str[0].astype(float)
)
squared_error, mse = evaluate_arima(targets, predictions)
df = pd.DataFrame(
{
"Squared Error": squared_error,
"targets": targets,
"predictions": predictions,
}
)
df.to_csv("memento_data/ARIMA_evaluation_30000.csv", index=False)
print(df.head())
print(squared_error.values)
## Stats on the squared error
# Mean squared error
print(
np.mean(squared_error.values),
" Mean squared error",
)
# Median squared error
print(np.median(squared_error.values), " Median squared error")
# 90th percentile squared error
print(
np.quantile(squared_error.values, 0.90, method="closest_observation"),
" 90th percentile squared error",
)
# 99th percentile squared error
print(
np.quantile(squared_error.values, 0.99, method="closest_observation"),
" 99th percentile squared error",
)
# 99.9th percentile squared error
print(
np.quantile(squared_error.values, 0.999, method="closest_observation"),
" 99.9th percentile squared error",
)
# Standard deviation squared error
print(np.std(squared_error.values), " Standard deviation squared error")
## Df row where the squared error is the a certain value
print(
df[
df["Squared Error"]
== np.quantile(squared_error.values, 0.5, method="closest_observation")
],
"Values at median SE",
)
print(
df[
df["Squared Error"]
== np.quantile(squared_error.values, 0.90, method="closest_observation")
],
"Values at 90th percentile SE",
)
print(
df[
df["Squared Error"]
== np.quantile(squared_error.values, 0.99, method="closest_observation")
],
"Values at 99th percentile SE",
)
print(
df[
df["Squared Error"]
== np.quantile(
squared_error.values, 0.999, method="closest_observation"
)
],
"Values at 99.9th percentile SE",
)
print(
df[
df["Squared Error"]
== np.quantile(
squared_error.values, 0.9999, method="closest_observation"
)
],
"Values at 99.99th percentile SE",
)
# Plot the index vs squared error
# Set figure size
sns.set_theme("paper", "whitegrid", font_scale=1.5)
mpl.rcParams.update(
{
"text.usetex": True,
"font.family": "serif",
"text.latex.preamble": r"\usepackage{amsmath,amssymb}",
"lines.linewidth": 2,
"lines.markeredgewidth": 0,
"scatter.marker": ".",
"scatter.edgecolors": "none",
# Set image quality and reduce whitespace around saved figure.
"savefig.dpi": 300,
"savefig.bbox": "tight",
"savefig.pad_inches": 0.01,
}
)
# Plot the index vs squared error
plt.figure(figsize=(5, 5))
sns.lineplot(x=df.index, y=df["Squared Error"], color="red")
plt.xlabel("History Length")
plt.ylabel("Squared Error")
# plt.title("Squared Error on predictions vs History Length upto 10000")
plt.xlim(1023, 10000)
# place legend to the right, bbox is the box around the legend
# plt.legend(loc='upper right', bbox_to_anchor=(0.67, 1.02), ncol=1)
plt.savefig("SE_trend_arima_10000.pdf")
## Do the plots over a loop of xlims
xlims = [0, 6000, 12000, 18000, 24000, 30000]
for idx_xlim in range(len(xlims) - 1):
plt.figure(figsize=(10, 6))
sns.lineplot(
x=df.index, y=df["Squared Error"], color="red", label="Squared Error"
)
# label axes
plt.xlabel("History Length")
plt.ylabel("Squared Error")
# set xlim
plt.xlim(xlims[idx_xlim], xlims[idx_xlim + 1])
plt.title(
"Squared Error trend for xlims "
+ str(xlims[idx_xlim])
+ " to "
+ str(xlims[idx_xlim + 1])
)
plt.savefig(
"SE_trend_arima_xlim_"
+ str(xlims[idx_xlim])
+ "_"
+ str(xlims[idx_xlim + 1])
+ ".pdf"
)