-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
script for plotting time series for demo slides, assessing model
not for app
- Loading branch information
Showing
1 changed file
with
206 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
|
||
import matplotlib.dates as mdates | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
import pandas as pd | ||
import seaborn as sns | ||
|
||
# import os; os.chdir("/Users/Battrd/Documents/School&Work/Insight/parking") | ||
|
||
# show more columns in printout | ||
desired_width=320 | ||
pd.set_option('display.width', desired_width) | ||
# np.set_printoption(linewidth=desired_width) | ||
pd.set_option('display.max_columns',65) | ||
|
||
|
||
#%% data options | ||
test_size_per = 24*7*4 # number of observations per precinct; (hours/day)*(days)*(obs/hour) | ||
date_end = pd.to_datetime("2017-06-27") # used to truncate data set | ||
|
||
|
||
#%% load data | ||
proj_dir = "~/Documents/School&Work/Insight/parking" | ||
data = pd.read_csv(proj_dir + "/data_int/pv.csv") | ||
n_ts = len(data.ViolationPrecinct.unique()) | ||
data["datetime_rnd"] = pd.to_datetime(data.datetime_rnd) | ||
data = data[data.datetime_rnd <= date_end].reset_index(drop=True) | ||
|
||
|
||
#%% split into train and test | ||
n_precincts = len(data.ViolationPrecinct.unique()) | ||
test_size = test_size_per*n_precincts | ||
nrow = data.shape[0] | ||
nrow_per = nrow/n_precincts | ||
train_size_per = nrow_per - test_size_per # number of observations per precinct in training set | ||
train_size = nrow - test_size | ||
|
||
train = data.groupby('ViolationPrecinct').apply(lambda x: x.iloc[range(int(train_size_per)+1)]) | ||
train.reset_index(inplace=True, drop=True) | ||
train = train[['ViolationPrecinct','counts','dow','hour']] | ||
test = data.groupby('ViolationPrecinct').apply(lambda x: x.iloc[-test_size_per:]) | ||
|
||
|
||
#%% Create data frame of time series: observed, baseline, glm | ||
# baseline forecasts | ||
base_test_df = pd.read_csv("./data_int/model_00_baseline.csv") | ||
base_test_df.rename(columns={'counts_hat':'counts_hat_base'}, inplace=True) | ||
base_test_df.datetime_rnd = pd.to_datetime(base_test_df.datetime_rnd) | ||
|
||
# glm forecasts | ||
glm_test_df = pd.read_csv("./data_int/mvp_f_out2.csv") | ||
glm_test_df.rename(columns={'counts_hat':'counts_hat_glm'}, inplace=True) | ||
glm_test_df.datetime_rnd = pd.to_datetime(glm_test_df.datetime_rnd) | ||
|
||
# both forecasts and obs | ||
fore_obs_df = pd.merge(base_test_df, glm_test_df, on=['datetime_rnd','ViolationPrecinct','counts'], how='outer') | ||
|
||
|
||
#%% Plot Fits and Residuals | ||
# set up plots | ||
fig, ax = plt.subplots(2,2) # 2x2 grid | ||
sval = 1 # point size | ||
alpha_val = 0.5 # transparency of points | ||
|
||
# plot baseline vs observed | ||
plt.subplot(2,2,1) | ||
plt.scatter(fore_obs_df.counts, fore_obs_df.counts_hat_base, s=sval, alpha=alpha_val) | ||
|
||
# plot glm vs observed | ||
plt.subplot(2,2,2) | ||
plt.scatter(fore_obs_df.counts, fore_obs_df.counts_hat_glm, s=sval, alpha=alpha_val) | ||
|
||
# plot glm vs baseline | ||
plt.subplot(2,2,3) | ||
plt.scatter(fore_obs_df.counts_hat_base, fore_obs_df.counts_hat_glm, s=sval, alpha=alpha_val) | ||
|
||
# plot residual time series on top of each other | ||
ax = plt.subplot(2, 2, 4) | ||
for i in fore_obs_df['ViolationPrecinct'].unique(): | ||
td = fore_obs_df[fore_obs_df.ViolationPrecinct == i] | ||
t_glm_error = td.counts_hat_glm - td.counts | ||
t_dtrnd = td['datetime_rnd'].values | ||
ax.xaxis.set_major_formatter(mdates.DateFormatter('%a\n%b %d')) | ||
plt.plot(td.datetime_rnd, t_glm_error) | ||
# plt.plot(fore_obs_df.datetime_rnd, glm_error) | ||
|
||
plt.show() | ||
|
||
|
||
#%% Time Series full: Manhattan daily | ||
daily_tickets = data.groupby(data.datetime_rnd.dt.date)['counts'].sum() | ||
# daily_tickets.index = pd.DatetimeIndex(daily_tickets.index) | ||
# day_dict = {'weekday':[0,1,2,3,4,5],'sat':[6], 'sun':[7]} | ||
day_dict = {0:'weekday', 1:'weekday', 2:'weekday', 3:'weekday', 4:'weekday', 5:'sat', 6:'sun'} | ||
daily_tickets_dayType = pd.Series(pd.to_datetime(daily_tickets.index).dayofweek).replace(day_dict) | ||
|
||
fig = plt.figure(facecolor='white', figsize=[6,4]) # uses matplotlib.pyplot imported as plt | ||
ax = fig.add_subplot(111) | ||
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d\n%Y')) | ||
plt.plot(daily_tickets.index, daily_tickets, c='black', linewidth=0.5) | ||
for i in daily_tickets_dayType.unique(): | ||
t_index = list(daily_tickets_dayType == i) | ||
t_date = daily_tickets.index[t_index] | ||
t_tick = daily_tickets[list(t_index)] | ||
plt.scatter(t_date, t_tick) | ||
plt.title("Manhattan FY 2017") | ||
plt.ylabel('Tickets per Day') | ||
plt.savefig("./figures/dailyTimeSeries_Manhattan.png", bbox_inches='tight', dpi=200) | ||
# plt.show() | ||
|
||
|
||
#%%Options for example Time Series | ||
vio_prec_eg = 13 | ||
fore_obs_df_eg = fore_obs_df[fore_obs_df.ViolationPrecinct==vio_prec_eg] | ||
|
||
|
||
#%% Time series test: observed | ||
fig = plt.figure(facecolor='white', figsize=[6,4]) # uses matplotlib.pyplot imported as plt | ||
ax = fig.add_subplot(111) | ||
ax.xaxis.set_major_formatter(mdates.DateFormatter('%a\n%b %d')) | ||
plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts, c='black', linewidth=0.5) | ||
plt.title('End of FY 2017, Precinct ' + str(vio_prec_eg)) | ||
plt.ylabel('Tickets per 15 min') | ||
plt.savefig("./figures/forecast_obs.png", bbox_inches='tight', dpi=200) | ||
# plt.show() | ||
|
||
|
||
#%% Time series test: observed, baseline model | ||
fig = plt.figure(facecolor='white', figsize=[6,4]) # uses matplotlib.pyplot imported as plt | ||
ax = fig.add_subplot(111) | ||
ax.xaxis.set_major_formatter(mdates.DateFormatter('%a\n%b %d')) | ||
plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts, c='black', linewidth=0.5) | ||
plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts_hat_base, c='blue', linewidth=1.5) | ||
plt.title('End of FY 2017, Precinct ' + str(vio_prec_eg)) | ||
plt.ylabel('Tickets per 15 min') | ||
# plt.legend(labels=['Observed','Baseline']) | ||
plt.savefig("./figures/forecast_obs_baseline.png", bbox_inches='tight', dpi=200) | ||
# plt.show() | ||
|
||
|
||
#%% Time series test: observed, glm | ||
fig = plt.figure(facecolor='white', figsize=[6,4]) # uses matplotlib.pyplot imported as plt | ||
ax = fig.add_subplot(111) | ||
ax.xaxis.set_major_formatter(mdates.DateFormatter('%a\n%b %d')) | ||
plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts, c='black', linewidth=0.5) | ||
plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts_hat_glm, c='red', linewidth=1.5) | ||
plt.title('End of FY 2017, Precinct ' + str(vio_prec_eg)) | ||
plt.ylabel('Tickets per 15 min') | ||
# plt.legend(labels=['Observed','GLM']) | ||
plt.savefig("./figures/forecast_obs_glm.png", bbox_inches='tight', dpi=200) | ||
# plt.show() | ||
|
||
|
||
#%% Time series test: observed, baseline, glm | ||
fig = plt.figure(facecolor='white', figsize=[6,4]) # uses matplotlib.pyplot imported as plt | ||
ax = fig.add_subplot(111) | ||
ax.xaxis.set_major_formatter(mdates.DateFormatter('%a\n%b %d')) | ||
plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts, c='black', linewidth=0.5) | ||
plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts_hat_base, c='blue', linewidth=1.5) | ||
plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts_hat_glm, c='red', linewidth=1.5) | ||
plt.title('End of FY 2017, Precinct ' + str(vio_prec_eg)) | ||
plt.ylabel('Tickets per 15 min') | ||
# plt.legend(labels=['Observed','Baseline','GLM']) | ||
plt.savefig("./figures/forecast_obs_baseline_glm.png", bbox_inches='tight', dpi=200) | ||
# plt.show() | ||
|
||
|
||
#%% Average DoW-HoD combinations, in table for plotting | ||
# baseline averages, but in table for plotting convenience | ||
baseline_table = train.drop(columns='ViolationPrecinct').pivot_table(index='hour', columns='dow', values='counts', aggfunc='mean') | ||
|
||
# plot heat map day-time averages | ||
fig = plt.figure(facecolor='white', figsize=[3,5]) # uses matplotlib.pyplot imported as plt | ||
ax = fig.add_subplot(111) | ||
xticklab = ('Mon',"Tue",'Wed','Thu','Fri','Sat','Sun') | ||
sns.heatmap(baseline_table, annot=False, cmap='RdYlGn_r', linewidths=0.5, yticklabels=2, xticklabels=xticklab, fmt=".1f") | ||
plt.ylabel("Hour of Day") | ||
plt.xlabel("Day of Week") | ||
plt.title("Tickets per 15 min") | ||
plt.savefig("./figures/ticketHeatmap.png", bbox_inches='tight', dpi=250) | ||
# plt.show() | ||
|
||
|
||
#%% How forecast error as a function of horizon | ||
u_prec = fore_obs_df.ViolationPrecinct.unique() | ||
eg_datetime = fore_obs_df[fore_obs_df.ViolationPrecinct == i].datetime_rnd | ||
horizon = (eg_datetime - pd.to_datetime("2017-06-20 00:00:00")).dt.total_seconds()/60/60/24 | ||
fore_errors_glm = np.zeros((len(horizon), len(u_prec))) | ||
for i in range(len(u_prec)): | ||
td = fore_obs_df[fore_obs_df.ViolationPrecinct == u_prec[i]] | ||
fore_errors_glm[:, i] = np.array(td.counts - td.counts_hat_glm) | ||
|
||
|
||
def mae(x): | ||
return np.mean(np.abs(x)) | ||
|
||
|
||
blah = np.apply_along_axis(mae, 1, fore_errors_glm) | ||
plt.plot(horizon, blah) | ||
# plt.scatter(horizon, blah) | ||
plt.show() | ||
|
||
|
||
# import seaborn as sns | ||
# sns.heatmap(fore_errors_glm) | ||
# plt.show() |