script for plotting time series for demo slides, assessing model

not for app
rBatt · Oct 5, 2018 · 34a6e9a · 34a6e9a
1 parent 01e652e
commit 34a6e9a
Showing 1 changed file with 206 additions and 0 deletions.
diff --git a/figures.py b/figures.py
@@ -0,0 +1,206 @@
+
+import matplotlib.dates as mdates
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+# import os; os.chdir("/Users/Battrd/Documents/School&Work/Insight/parking")
+
+# show more columns in printout
+desired_width=320
+pd.set_option('display.width', desired_width)
+# np.set_printoption(linewidth=desired_width)
+pd.set_option('display.max_columns',65)
+
+
+#%% data options
+test_size_per = 24*7*4  # number of observations per precinct; (hours/day)*(days)*(obs/hour)
+date_end = pd.to_datetime("2017-06-27")  # used to truncate data set
+
+
+#%% load data
+proj_dir = "~/Documents/School&Work/Insight/parking"
+data = pd.read_csv(proj_dir + "/data_int/pv.csv")
+n_ts = len(data.ViolationPrecinct.unique())
+data["datetime_rnd"] = pd.to_datetime(data.datetime_rnd)
+data = data[data.datetime_rnd <= date_end].reset_index(drop=True)
+
+
+#%% split into train and test
+n_precincts = len(data.ViolationPrecinct.unique())
+test_size = test_size_per*n_precincts
+nrow = data.shape[0]
+nrow_per = nrow/n_precincts
+train_size_per = nrow_per - test_size_per  # number of observations per precinct in training set
+train_size = nrow - test_size
+
+train = data.groupby('ViolationPrecinct').apply(lambda x: x.iloc[range(int(train_size_per)+1)])
+train.reset_index(inplace=True, drop=True)
+train = train[['ViolationPrecinct','counts','dow','hour']]
+test = data.groupby('ViolationPrecinct').apply(lambda x: x.iloc[-test_size_per:])
+
+
+#%% Create data frame of time series: observed, baseline, glm
+# baseline forecasts
+base_test_df = pd.read_csv("./data_int/model_00_baseline.csv")
+base_test_df.rename(columns={'counts_hat':'counts_hat_base'}, inplace=True)
+base_test_df.datetime_rnd = pd.to_datetime(base_test_df.datetime_rnd)
+
+# glm forecasts
+glm_test_df = pd.read_csv("./data_int/mvp_f_out2.csv")
+glm_test_df.rename(columns={'counts_hat':'counts_hat_glm'}, inplace=True)
+glm_test_df.datetime_rnd = pd.to_datetime(glm_test_df.datetime_rnd)
+
+# both forecasts and obs
+fore_obs_df = pd.merge(base_test_df, glm_test_df, on=['datetime_rnd','ViolationPrecinct','counts'], how='outer')
+
+
+#%% Plot Fits and Residuals
+# set up plots
+fig, ax = plt.subplots(2,2) # 2x2 grid
+sval = 1 # point size
+alpha_val = 0.5 # transparency of points
+
+# plot baseline vs observed
+plt.subplot(2,2,1)
+plt.scatter(fore_obs_df.counts, fore_obs_df.counts_hat_base, s=sval, alpha=alpha_val)
+
+# plot glm vs observed
+plt.subplot(2,2,2)
+plt.scatter(fore_obs_df.counts, fore_obs_df.counts_hat_glm, s=sval, alpha=alpha_val)
+
+# plot glm vs baseline
+plt.subplot(2,2,3)
+plt.scatter(fore_obs_df.counts_hat_base, fore_obs_df.counts_hat_glm, s=sval, alpha=alpha_val)
+
+# plot residual time series on top of each other
+ax = plt.subplot(2, 2, 4)
+for i in fore_obs_df['ViolationPrecinct'].unique():
+    td = fore_obs_df[fore_obs_df.ViolationPrecinct == i]
+    t_glm_error = td.counts_hat_glm - td.counts
+    t_dtrnd = td['datetime_rnd'].values
+    ax.xaxis.set_major_formatter(mdates.DateFormatter('%a\n%b %d'))
+    plt.plot(td.datetime_rnd, t_glm_error)
+# plt.plot(fore_obs_df.datetime_rnd, glm_error)
+
+plt.show()
+
+
+#%% Time Series full: Manhattan daily
+daily_tickets = data.groupby(data.datetime_rnd.dt.date)['counts'].sum()
+# daily_tickets.index = pd.DatetimeIndex(daily_tickets.index)
+# day_dict = {'weekday':[0,1,2,3,4,5],'sat':[6], 'sun':[7]}
+day_dict = {0:'weekday', 1:'weekday', 2:'weekday', 3:'weekday', 4:'weekday', 5:'sat', 6:'sun'}
+daily_tickets_dayType = pd.Series(pd.to_datetime(daily_tickets.index).dayofweek).replace(day_dict)
+
+fig = plt.figure(facecolor='white', figsize=[6,4]) # uses matplotlib.pyplot imported as plt
+ax = fig.add_subplot(111)
+ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d\n%Y'))
+plt.plot(daily_tickets.index, daily_tickets, c='black', linewidth=0.5)
+for i in daily_tickets_dayType.unique():
+    t_index = list(daily_tickets_dayType == i)
+    t_date = daily_tickets.index[t_index]
+    t_tick = daily_tickets[list(t_index)]
+    plt.scatter(t_date, t_tick)
+plt.title("Manhattan FY 2017")
+plt.ylabel('Tickets per Day')
+plt.savefig("./figures/dailyTimeSeries_Manhattan.png", bbox_inches='tight', dpi=200)
+# plt.show()
+
+
+#%%Options for example Time Series
+vio_prec_eg = 13
+fore_obs_df_eg = fore_obs_df[fore_obs_df.ViolationPrecinct==vio_prec_eg]
+
+
+#%% Time series test: observed
+fig = plt.figure(facecolor='white', figsize=[6,4]) # uses matplotlib.pyplot imported as plt
+ax = fig.add_subplot(111)
+ax.xaxis.set_major_formatter(mdates.DateFormatter('%a\n%b %d'))
+plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts, c='black', linewidth=0.5)
+plt.title('End of FY 2017, Precinct ' + str(vio_prec_eg))
+plt.ylabel('Tickets per 15 min')
+plt.savefig("./figures/forecast_obs.png", bbox_inches='tight', dpi=200)
+# plt.show()
+
+
+#%% Time series test: observed, baseline model
+fig = plt.figure(facecolor='white', figsize=[6,4]) # uses matplotlib.pyplot imported as plt
+ax = fig.add_subplot(111)
+ax.xaxis.set_major_formatter(mdates.DateFormatter('%a\n%b %d'))
+plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts, c='black', linewidth=0.5)
+plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts_hat_base, c='blue', linewidth=1.5)
+plt.title('End of FY 2017, Precinct ' + str(vio_prec_eg))
+plt.ylabel('Tickets per 15 min')
+# plt.legend(labels=['Observed','Baseline'])
+plt.savefig("./figures/forecast_obs_baseline.png", bbox_inches='tight', dpi=200)
+# plt.show()
+
+
+#%% Time series test: observed, glm
+fig = plt.figure(facecolor='white', figsize=[6,4]) # uses matplotlib.pyplot imported as plt
+ax = fig.add_subplot(111)
+ax.xaxis.set_major_formatter(mdates.DateFormatter('%a\n%b %d'))
+plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts, c='black', linewidth=0.5)
+plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts_hat_glm, c='red', linewidth=1.5)
+plt.title('End of FY 2017, Precinct ' + str(vio_prec_eg))
+plt.ylabel('Tickets per 15 min')
+# plt.legend(labels=['Observed','GLM'])
+plt.savefig("./figures/forecast_obs_glm.png", bbox_inches='tight', dpi=200)
+# plt.show()
+
+
+#%% Time series test: observed, baseline, glm
+fig = plt.figure(facecolor='white', figsize=[6,4]) # uses matplotlib.pyplot imported as plt
+ax = fig.add_subplot(111)
+ax.xaxis.set_major_formatter(mdates.DateFormatter('%a\n%b %d'))
+plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts, c='black', linewidth=0.5)
+plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts_hat_base, c='blue', linewidth=1.5)
+plt.plot(fore_obs_df_eg.datetime_rnd, fore_obs_df_eg.counts_hat_glm, c='red', linewidth=1.5)
+plt.title('End of FY 2017, Precinct ' + str(vio_prec_eg))
+plt.ylabel('Tickets per 15 min')
+# plt.legend(labels=['Observed','Baseline','GLM'])
+plt.savefig("./figures/forecast_obs_baseline_glm.png", bbox_inches='tight', dpi=200)
+# plt.show()
+
+
+#%% Average DoW-HoD combinations, in table for plotting
+# baseline averages, but in table for plotting convenience
+baseline_table = train.drop(columns='ViolationPrecinct').pivot_table(index='hour', columns='dow', values='counts', aggfunc='mean')
+
+# plot heat map day-time averages
+fig = plt.figure(facecolor='white', figsize=[3,5]) # uses matplotlib.pyplot imported as plt
+ax = fig.add_subplot(111)
+xticklab = ('Mon',"Tue",'Wed','Thu','Fri','Sat','Sun')
+sns.heatmap(baseline_table, annot=False, cmap='RdYlGn_r', linewidths=0.5, yticklabels=2, xticklabels=xticklab, fmt=".1f")
+plt.ylabel("Hour of Day")
+plt.xlabel("Day of Week")
+plt.title("Tickets per 15 min")
+plt.savefig("./figures/ticketHeatmap.png", bbox_inches='tight', dpi=250)
+# plt.show()
+
+
+#%% How forecast error as a function of horizon
+u_prec = fore_obs_df.ViolationPrecinct.unique()
+eg_datetime = fore_obs_df[fore_obs_df.ViolationPrecinct == i].datetime_rnd
+horizon = (eg_datetime - pd.to_datetime("2017-06-20 00:00:00")).dt.total_seconds()/60/60/24
+fore_errors_glm = np.zeros((len(horizon), len(u_prec)))
+for i in range(len(u_prec)):
+    td = fore_obs_df[fore_obs_df.ViolationPrecinct == u_prec[i]]
+    fore_errors_glm[:, i] = np.array(td.counts - td.counts_hat_glm)
+
+
+def mae(x):
+    return np.mean(np.abs(x))
+
+
+blah = np.apply_along_axis(mae, 1, fore_errors_glm)
+plt.plot(horizon, blah)
+# plt.scatter(horizon, blah)
+plt.show()
+
+
+# import seaborn as sns
+# sns.heatmap(fore_errors_glm)
+# plt.show()