-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathBACK_data_analysis.py
More file actions
222 lines (176 loc) · 7.14 KB
/
BACK_data_analysis.py
File metadata and controls
222 lines (176 loc) · 7.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#!/usr/bin/env python
"""Data analysis main file.
"""
__license__ = "GPL"
__version__ = "0.0"
__status__ = "Development"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
# Fixing random state for reproducibility
np.random.seed(19680801)
# A function to use a Gaussian Process Regressor to extrapolate travel time
# data from one data set to others. Travel time comes from Google Maps which
# costs money. We want to avoid paying for more API calls when we already
# have plenty of travel time data.
def GP_Winter_2018_Travel_Time_data():
    """Subsample the Winter 2018 travel-time data for a GP fit.

    Loads 'Winter_2018_Travel_Time.hdf5' and draws N = 2**10 random rows
    of coordinates (x) and morning drive duration in traffic (y).

    Returns
    -------
    tuple
        (x, y) — the subsampled training features and targets.
    """
    Winter_2018 = pd.read_hdf('Winter_2018_Travel_Time.hdf5')
    # Reduce the dimensionality to fit so it isn't so slow!
    N = 2 ** 10
    # BUG FIX: the original sampled positions with `len(X)` where `X` was
    # never defined (NameError); the intent is to sample rows of Winter_2018.
    points = np.sort(np.random.randint(0, len(Winter_2018) - 1, N))
    # NOTE(review): `[1, points]` keeps only the latitude row, so x is 1-D.
    # A GP over locations presumably wants both columns ([:, points].T) —
    # preserved as-is pending confirmation of the original intent.
    x = np.array([Winter_2018.zillow_longitude,
                  Winter_2018.zillow_latitude])[1, points]
    y = Winter_2018.morning_drive_duration_with_traffic[points]
    # TODO: the GaussianProcessRegressor fit itself was never written here
    # (the kernels are imported at the top of the file but unused).  Return
    # the prepared training data so the work is not discarded.
    return x, y
# Listing data for all zip codes, scraped/dumped earlier.  This module-level
# table is read by process_data() below and by all the plotting helpers.
t = pd.read_hdf('./dumped_data/saved_data.hdf5',key='all_zips')
def calculate_monthly_payment(principal, interest, months):
    """Return the fixed monthly payment of an amortized loan.

    Parameters
    ----------
    principal : float
        Initial loan amount.
    interest : float
        MONTHLY interest rate, e.g. 0.04 / 12 for a 4% annual rate.
    months : int
        Length of the loan in months.

    Returns
    -------
    float
        The constant monthly payment.
    """
    # Standard annuity formula: P * i * (1+i)^n / ((1+i)^n - 1)
    growth = (1 + interest) ** months
    return principal * interest * growth / (growth - 1)
def process_data(t):
    """
    Add derived columns to the listings table and filter it.

    Adds price in millions of dollars, average commute durations, and an
    east-bay flag, then keeps only single-family homes under $5M whose
    average drive-in-traffic commute is under 180 minutes.
    """
    # price in mega-bucks
    t['price (M$)'] = t['zillow_price'] / 1000000.0
    # Each morning/evening pair is summed and divided by 120 — presumably
    # the raw durations are seconds per leg, so sum/2/60 gives average
    # minutes per trip (TODO confirm units against the scraper).
    t['average_drive_duration'] = (
        t['morning_drive_duration'] + t['evening_drive_duration']) / 120.0
    t['average_drive_duration_with_traffic'] = (
        t['morning_drive_duration_with_traffic']
        + t['evening_drive_duration_with_traffic']) / 120.0
    t['average_transit_duration'] = (
        t['morning_transit_duration'] + t['evening_transit_duration']) / 120.0
    # Tag listings on the east side of the bay from their "lat,lng" string.
    t['east_bay'] = t['location'].apply(find_east_bay)
    # Keep single-family homes under $5M with a sane commute.
    keep = (
        (t['average_drive_duration_with_traffic'] < 180)
        & (t['price (M$)'] < 5)
        & (t['zillow_homeType'] == 'SINGLE_FAMILY')
    )
    return t[keep]
def find_east_bay(lat=0.0, lng=0.0):
    """
    accepts lat and lng and tells you if the property is in the east bay

    Parameters
    ----------
    lat: float or string
        if float, latitude you want to check (requires a longitude)
        if string, a string of the form 1.234,5.678 for use with pandas apply
        latitude comes first in this string
    lng: float
        longitude you want to check

    Returns
    -------
    boolean
        True if the location is in the east bay
    """
    # Accept the "lat,lng" string form produced by the scraper.
    # BUG FIX: the original tested isinstance(lat, (str, unicode)); `unicode`
    # does not exist in Python 3, so every call raised NameError.
    if isinstance(lat, str):
        parts = lat.split(',')
        lat = float(parts[0])
        lng = float(parts[1])
    # a GPS point near Treasure Island
    x1 = -122.389807
    y1 = 37.813489
    # a GPS point just north of San Jose
    x2 = -121.986697
    y2 = 37.418685
    if lat < y2:  # anything south of San Jose is not in the east bay
        return False
    # 2-D cross product (P2 - P1) x (P - P1): positive means the point lies
    # counter-clockwise (north-east) of the Treasure-Island -> San-Jose line,
    # i.e. in the east bay; negative means the peninsula side.
    # BUG FIX: the original computed the NEGATED cross product yet returned
    # val > 0, classifying east-bay points (Oakland, Fremont) as peninsula
    # and vice versa.
    val = (x2 - x1) * (lat - y1) - (y2 - y1) * (lng - x1)
    return val > 0
# Derive features and filter the listings table once, at import time, so the
# plotting helpers below can read the processed module-level `t` directly.
t = process_data(t)
################################################################################
# you can make pretty plots now
################################################################################
def price_dist_plot():
    """
    Scatter plot of price versus commute time: east-bay listings in blue,
    everything else (peninsula etc.) in red.  Reads the module-level `t`.
    """
    plt.figure(22)
    # plot the east bay points
    eb_mask = t['east_bay']
    plt.scatter(x=t['price (M$)'][eb_mask],
                y=t['average_drive_duration_with_traffic'][eb_mask],
                s=5,
                c='b',
                marker='.')
    # plot the not east bay points
    neb_mask = ~t['east_bay']
    plt.scatter(x=t['price (M$)'][neb_mask],
                y=t['average_drive_duration_with_traffic'][neb_mask],
                s=5,
                c='r',
                marker='.')
    # BUG FIX: the axis labels were swapped relative to the plotted data —
    # x is price (M$) and y is the commute time.
    plt.xlabel('price (M$)')
    plt.ylabel('commute time to SLAC (min)')
    plt.title('Single family homes, <$5M and <180 mins')
    plt.show()
def plot_by_zipcode():
    """
    plots the results for single family homes by zipcode,
    one random color per zipcode.  Reads the module-level `t`.
    """
    # find all the unique zipcodes
    zipcodes = t['zillow_zipcode'].unique()
    # one random RGB triple per zipcode
    colors = np.random.rand(len(zipcodes), 3)
    plt.figure(23)
    for i, zipcode in enumerate(zipcodes):
        temp = t[t['zillow_zipcode'] == zipcode]
        x = temp['price (M$)']
        y = temp['average_drive_duration_with_traffic']
        # Wrap the color in a list so matplotlib reads one RGB row rather
        # than three ambiguous scalars (avoids a scatter() warning).
        plt.scatter(x=x, y=y, s=5, marker='.', c=[colors[i]],
                    label=str(zipcode))
    # BUG FIX: the axis labels were swapped relative to the plotted data —
    # x is price (M$) and y is the commute time.
    plt.xlabel('price (M$)')
    plt.ylabel('commute time to SLAC (min)')
    plt.title('Single family homes, <$5M and <180 mins')
    plt.legend(loc='upper right')
    plt.show()
def dbscan_price_drive(epsilon=0.1, min_samples=50):
    """
    DBSCAN cluster analysis of the data in terms of price and
    average_drive_duration_with_traffic

    Reads the module-level `t`; core samples are drawn large, border
    samples small, and noise points in black.
    """
    features = t[['price (M$)', 'average_drive_duration_with_traffic']]
    # Standardize both columns so `epsilon` is in units of std deviations.
    scaled = StandardScaler().fit_transform(features)
    db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(scaled)
    labels = db.labels_
    is_core = np.zeros_like(labels, dtype=bool)
    is_core[db.core_sample_indices_] = True
    # Label -1 marks noise; it is not counted as a cluster.
    unique_labels = set(labels)
    n_clusters_ = len(unique_labels - {-1})
    n_noise_ = int(np.sum(labels == -1))
    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    # One Spectral-colormap color per label; noise gets black instead.
    palette = [plt.cm.Spectral(v)
               for v in np.linspace(0, 1, len(unique_labels))]
    for label, color in zip(unique_labels, palette):
        if label == -1:
            color = [0, 0, 0, 1]
        in_cluster = (labels == label)
        # Core members of the cluster: large markers.
        sel = in_cluster & is_core
        plt.plot(features.loc[:, 'price (M$)'][sel],
                 features.loc[:, 'average_drive_duration_with_traffic'][sel],
                 'o', markerfacecolor=tuple(color),
                 markeredgecolor='k', markersize=10)
        # Border (non-core) members: small markers.
        sel = in_cluster & ~is_core
        plt.plot(features.loc[:, 'price (M$)'][sel],
                 features.loc[:, 'average_drive_duration_with_traffic'][sel],
                 'o', markerfacecolor=tuple(color),
                 markeredgecolor='k', markersize=2)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
# Test out the GP import
# Script entry point: only exercises the GP travel-time routine; the
# plotting helpers above are intended to be called interactively.
if __name__ == '__main__':
    GP_Winter_2018_Travel_Time_data()
# END ##########################################################################