-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_data.py
89 lines (65 loc) · 2.78 KB
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from datetime import timedelta
from optparse import Values
from re import I
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from datetime import date, timedelta, datetime
def get_mean_from_csv(PATH_TO_FILE):
    """Return the first difference of the per-column mean of a CSV file.

    The file is read with ``pandas.read_csv``, the ``countyFIPS`` and
    ``StateFIPS`` columns are dropped, and the mean of every remaining
    numeric column is taken over all rows.  The means are then differenced
    once, so each row holds the change from the previous column's mean.

    Args:
        PATH_TO_FILE: Path (or any object accepted by ``pandas.read_csv``)
            of the CSV file.  NOTE(review): presumably one column per date
            with cumulative counts -- confirm against the data source.

    Returns:
        A single-column DataFrame (column label ``0``) indexed by the
        remaining numeric column labels.  The first row is always NaN
        because it has no predecessor to difference against.

    Raises:
        KeyError: If ``countyFIPS`` or ``StateFIPS`` is missing from the file.
    """
    data = pd.read_csv(PATH_TO_FILE)
    data = data.drop(columns=['countyFIPS', 'StateFIPS'])
    # numeric_only=True skips text columns; without it pandas >= 2.0 raises
    # TypeError instead of silently ignoring them.
    column_means = data.mean(axis=0, numeric_only=True)
    return column_means.to_frame().diff()
def convert_to_weekly(data):
    """Aggregate a single-column daily frame into weekly means.

    The rows are grouped into Monday-to-Sunday bins (``W-MON``, left-closed,
    left-labelled) and every bin label is then moved back one day, so each
    weekly mean is labelled with the Sunday preceding its bin.  The row
    labelled ``2020-01-19`` is removed when present.

    Side effect: the single column of ``data`` is renamed to ``covid_mean``
    in place, so the caller's frame is modified.

    Args:
        data: DataFrame with exactly one column whose index holds values
            convertible to datetimes (e.g. ``YYYY-MM-DD`` strings).

    Returns:
        DataFrame with one ``covid_mean`` column and a DatetimeIndex named
        ``date``.

    Raises:
        ValueError: If ``data`` does not have exactly one column.
    """
    # Plain attribute assignment renames in place on every pandas version;
    # set_axis(..., inplace=True) was removed in pandas 2.0.
    data.columns = ['covid_mean']

    # Build a frame holding only the numeric column on a datetime index so
    # the resampler never sees the textual date labels.
    daily = pd.DataFrame(
        {'covid_mean': data['covid_mean'].to_numpy()},
        index=pd.DatetimeIndex(pd.to_datetime(data.index), name='date'),
    )
    weekly = daily.resample('W-MON', label='left', closed='left').mean()

    # Replacement for the removed ``loffset`` argument: shift each label
    # from the bin's Monday back to the preceding Sunday.
    weekly.index = (weekly.index - pd.Timedelta(days=1)).rename('date')

    # errors='ignore' keeps inputs that do not contain this week usable.
    # NOTE(review): presumably this is the partial first week of the data
    # set -- confirm.
    return weekly.drop(index=pd.Timestamp('2020-01-19'), errors='ignore')
def preprocess_data(df, y_col):
    """Fill missing values with column means and optionally split off a target.

    Missing entries in numeric columns are replaced by that column's mean.
    Non-numeric columns are left untouched.  The input frame is not modified.

    Args:
        df: DataFrame to clean.
        y_col: Label of the target column.

    Returns:
        ``(X, y)`` when ``y_col`` is a column of ``df``: ``X`` is the filled
        frame without ``y_col`` and ``y`` is the filled target Series.
        Otherwise the filled frame alone is returned, so callers must handle
        both return shapes.
    """
    # numeric_only=True: pandas >= 2.0 raises TypeError when asked for the
    # mean of text columns instead of skipping them.
    filled = df.fillna(df.mean(numeric_only=True))
    if y_col not in filled.columns:
        return filled
    return filled.drop(y_col, axis=1), filled[y_col]
# APPEND DATA FROM VALUES INTO DATAFRAME
def predict_dataframe(values, mode, ar_order):
    """Wrap ``values`` in a DataFrame indexed by consecutive weekly dates.

    The first index date is the first Sunday on or after a start date that
    depends on ``mode``:

    * ``0`` -- 365 days before (today + 7 days).
    * ``1`` -- 365 days before 2022-02-27.
    * ``2`` -- 2020-01-26 plus ``ar_order`` weeks.

    NOTE(review): the source indentation was lost, so it is ambiguous
    whether the ``ar_order`` shift applied to every mode or only to mode 2.
    It is applied to mode 2 only here, which lines the result up with the
    windows selected by ``get_data_for_comparison`` -- confirm.

    Args:
        values: Data accepted by ``pandas.DataFrame`` (e.g. a list or array).
        mode: Start-date selector, one of ``0``, ``1`` or ``2``.
        ar_order: Number of weeks the start date is shifted in mode 2.

    Returns:
        DataFrame holding ``values`` with a weekly DatetimeIndex named
        ``date``.

    Raises:
        ValueError: If ``mode`` is not 0, 1 or 2.
    """
    if mode == 0:
        past_date = date.today() + timedelta(days=7) - timedelta(days=365)
    elif mode == 1:
        past_date = date(2022, 2, 27) - timedelta(days=365)
    elif mode == 2:
        past_date = date(2020, 1, 26) + timedelta(days=7 * ar_order)
    else:
        # Previously an unknown mode fell through and failed later with an
        # unbound-variable error.
        raise ValueError(f"mode must be 0, 1 or 2, got {mode!r}")

    # freq="W" is anchored on Sundays, so a non-Sunday start rolls forward.
    idx = pd.date_range(past_date, periods=len(values), freq="W")
    df = pd.DataFrame(values)
    df['date'] = idx
    return df.set_index('date')
# GET CERTAIN DATA FROM DATAFRAME
def get_data_for_comparison(df, mode, ar_order):
    """Return the rows of ``df`` that fall inside a fixed date window.

    The window spans ``365 + 7 * ar_order`` days and is anchored by ``mode``:

    * ``1`` -- the window ends on 2022-02-27.
    * ``0`` -- the window starts on 2020-01-26.

    Both end points are included in the label-based slice.

    Args:
        df: DataFrame whose index supports slicing by ``YYYY-MM-DD`` strings
            (e.g. a sorted DatetimeIndex).
        mode: Window selector, ``0`` or ``1``.
        ar_order: Number of extra weeks added to the one-year window.

    Returns:
        The slice of ``df`` between the window's start and end dates.

    Raises:
        ValueError: If ``mode`` is neither 0 nor 1.
    """
    window = timedelta(days=365 + 7 * ar_order)
    if mode == 1:
        next_date = date(2022, 2, 27)
        past_date = next_date - window
    elif mode == 0:
        past_date = date(2020, 1, 26)
        next_date = past_date + window
    else:
        # Previously an unknown mode failed with an unbound-variable error.
        raise ValueError(f"mode must be 0 or 1, got {mode!r}")
    return df.loc[str(past_date):str(next_date)]
# MAKE VECTOR OUT OF DATAFRAME DATA
def make_vector(dataframe):
    """Flatten a DataFrame into one list, column by column.

    All values of the first column come first, followed by all values of
    the second column, and so on.

    Args:
        dataframe: DataFrame to flatten.

    Returns:
        A list with the values of every column concatenated in column
        order.  An empty frame yields an empty list.
    """
    vector_arr = []
    for column in dataframe.columns:
        # Fetch each column's array once rather than once per element.
        vector_arr.extend(dataframe[column].values)
    return vector_arr