-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcombine_collision_route_and_bikepath_data.rb
178 lines (147 loc) · 5.84 KB
/
combine_collision_route_and_bikepath_data.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
require_relative './helpers'
# Load the four input CSVs as header-indexed tables, in this order:
# route summaries, geocoded route points, collisions, bike-path vertices.
route_summary, routes, collisions, bikepaths = %w[
  ./route_summary.csv
  ./geocoded_cycling_data.csv
  ./bike_collision_geo.csv
  ./joined-bikepaths.csv
].map { |csv_path| CSV.parse(File.read(csv_path), headers: true) }

# Column order of the combined output file; every emitted row is projected
# onto these names with `values_at(*headers)`.
headers = [
  'Data Type', 'Path Id', 'Step', 'Time', 'Street Name',
  'Latitude', 'Longitude', 'Step Distance', 'Total Distance',
  'Narrative', 'On Bike Lane'
]

# Per-route list of step distances, keyed by route id (filled in while the
# routes are processed).
distance_lists = {}
CSV.open('./boston-bike-trips-crashes-and-bike-paths-may2010-dec2012.csv', 'w', headers: headers, write_headers: true) do |output|
# Index of bike-path segments keyed by street name. The default block lets
# writers append with `@bikepaths_by_street[name] << segment` without having
# to initialize the key first.
@bikepaths_by_street = Hash.new { |index, street| index[street] = [] }

# Classifies a point against the bike-path segments indexed for its street.
#
# row - a Hash with 'Street Name' and 'Time' entries, plus whatever
#       coordinate entries `distance_between` reads.
#
# Returns
#   true            - the nearest segment on the same street is closer than
#                     that segment's own 'Step Distance' and the point's
#                     time is later than the segment's 'Time';
#   "Not yet built" - the segment is that close but the point's time is not
#                     later than the segment's 'Time';
#   false           - no segment is indexed for the street, or the nearest
#                     one is not close enough.
#
# Relies on `min_and_value` and `distance_between`, which are not defined in
# this file (presumably they come from ./helpers); `min_and_value` is used
# here as returning an [element, block_value] pair for the minimum.
def on_bike_lane?(row)
  # `fetch` with a fallback is used instead of `[]` so that a lookup never
  # triggers the hash's default block: a plain `[]` would insert an empty
  # array for every unknown (or nil) street name that is merely queried.
  candidates = @bikepaths_by_street.fetch(row['Street Name'], [])
  return false if candidates.empty?

  nearest, min_distance = min_and_value(candidates) { |seg| distance_between(row, seg) }
  return false unless min_distance < nearest['Step Distance']

  Time.parse(row['Time']) > Time.parse(nearest['Time']) ? true : "Not yet built"
end
# Writes one 'Bike Path' output row per bike-path vertex (for vertices whose
# 'year' is 2012 or earlier) and records each vertex in
# @bikepaths_by_street under its abbreviated street name.

# Ordered [pattern, replacement] pairs applied to each path name; every pair
# replaces the first occurrence only. Presumably these abbreviations mirror
# the street-name format of the other data sets - verify against the inputs.
street_abbreviations = [
  [' Avenue', ' Ave'],
  [' Street', ' St'],
  [' Road', ' Rd'],
  # The word boundary restricts this rule to the standalone leading word
  # "North"; without it, names that merely begin with those letters were
  # mangled (e.g. "Northampton St" became "Nampton St").
  [/^North\b/, 'N'],
  [' Boulevard', ' Blvd'],
  [' Square', ' Sq'],
  [' Shared-Use', ''],
  [' Bridge', ' Brg'],
  [' Highway', ' Hwy']
]

bikepaths.group_by { |row| row['path_id'] }.each do |path_id, rows|
  segment_data = []

  rows.each_with_index do |row, i|
    next unless row['year'].to_i <= 2012

    # The first vertex of a path has no predecessor, so its step is 0.
    # Note that `i` indexes all rows of the path, including any skipped by
    # the year filter above.
    distance = i == 0 ? 0 : distance_between(rows[i - 1], row)

    street_name = street_abbreviations.inject(row['name'].to_s) do |name, (pattern, replacement)|
      name.sub(pattern, replacement)
    end

    segment = {
      'Data Type' => 'Bike Path',
      'Path Id' => row['path_id'],
      'Step' => row['step'],
      # Only the year is known, so the path is dated to January 1st of it.
      'Time' => "#{row['year']}-01-01T00:00",
      'Street Name' => street_name,
      'Latitude' => row['latitude'],
      'Longitude' => row['longitude'],
      'Step Distance' => distance
    }
    segment_data << segment
    @bikepaths_by_street[street_name] << segment
  end

  # Total length of the path as the sum of the kept step distances.
  calc_total_distance = segment_data.inject(0) { |acc, rd| acc + rd['Step Distance'] }
  #puts "calculated total distance: #{calc_total_distance}"
  #puts "orig total distance: #{rows.map{|row| row['bike_path_length'].to_f}.uniq.sum}"

  segment_data.each do |row_data|
    output << row_data.merge('Total Distance' => calc_total_distance).values_at(*headers)
  end
end
# Writes one 'Collision' output row per dated collision on or after
# 2010-05-01.

# Collision ids start just above the largest bike-path id.
highest_path = bikepaths.max_by { |path| path['path_id'].to_i }
base_collision_id = highest_path['path_id'].to_i + 1
earliest_collision = Time.parse('2010-05-01')

collisions.each_with_index do |collision, index|
  date = collision['DATE'].to_s
  next if date.empty?

  # 'DATE' is split on '/' as month/day/year and rebuilt as a zero-padded
  # year-month-day string followed by the 'TIME' column.
  month, day, year = date.split('/')
  occurred_at = Time.parse("#{year}-#{month.rjust(2, '0')}-#{day.rjust(2, '0')} #{collision['TIME']}")
  next if occurred_at < earliest_collision

  record = {
    'Data Type' => 'Collision',
    # The id is offset by the row's position in the input, so skipped rows
    # leave gaps in the id sequence.
    'Path Id' => base_collision_id + index,
    'Step' => '1',
    'Time' => occurred_at.strftime("%FT%R"),
    # Street name without an optional leading house number.
    'Street Name' => collision['Address'].to_s[/^(?:\d+\s)?([a-zA-Z0-9 ]+)/, 1],
    'Latitude' => collision["LAT"],
    'Longitude' => collision["LON"],
    'Narrative' => collision["Narrative"]
  }

  output << record.merge('On Bike Lane' => on_bike_lane?(record)).values_at(*headers)
end
# Writes 'Runkeeper Route' output rows: each route (points dated 2012 or
# earlier) is cut into runs of consecutive whitelisted-city points, and every
# run is written as its own path.

# Only points whose address resolves to one of these cities are written.
city_whitelist = %w(Boston)

# Route ids start above the id range used for collisions.
base_route_id = base_collision_id + collisions.size + 1

# Route id => route length; 'distance_m' is scaled by 0.000621371
# (the metres-to-miles factor).
total_miles_by_route_id = route_summary.each_with_object({}) do |summary, miles|
  miles[summary['routeid']] = summary['distance_m'].to_f*0.000621371
end

# Captures: 1 = street name (optional leading house number dropped), 2 = city.
address_pattern = /^(?:\d+\s)?([a-zA-Z0-9 ]+), ([a-zA-Z]+)/

route_no = 0
routes_through_2012 = routes.select { |row| Time.parse(row['Datetime']).year <= 2012 }

routes_through_2012.group_by { |row| row['Route ID'] }.each do |route_id, rows|
  total_distance = total_miles_by_route_id[route_id]
  distances = []
  last_index = rows.length - 1

  route_data = rows.each_with_index.map do |row, i|
    utc_time = Time.parse(row['Datetime'])
    # Shifts the timestamp back by four hours.
    # NOTE(review): looks like a fixed UTC -> US Eastern conversion; a constant
    # 4h offset would not follow the winter (UTC-5) offset - confirm.
    est_time = Time.at(utc_time.to_f - 4*3600)
    address = row['Address'].to_s.match(address_pattern)

    distance =
      if i == last_index
        # The final point has no successor to measure against, so its leg is
        # estimated: whatever the route summary's total leaves over, capped at
        # the mean of the non-zero legs in case that remainder is far too big
        # (`distance_between` loses a little length on every leg).
        [distances.reject { |d| d == 0 }.mean, total_distance - distances.sum].min
      else
        distance_between(row, rows[i + 1])
      end

    point = {
      'Data Type' => 'Runkeeper Route',
      'Time' => est_time.strftime("%FT%R"),
      'Street Name' => address && address[1],
      'City Name' => address && address[2],
      'Latitude' => row['Latitude'],
      'Longitude' => row['Longitude'],
      'Step Distance' => distance
    }
    point['On Bike Lane'] = on_bike_lane?(point)
    distances << distance
    point
  end

  distance_lists[route_id] = distances

  # Keep only the runs of consecutive points inside a whitelisted city; any
  # point outside ends the current run and is itself dropped.
  route_segments = route_data.
    chunk { |point| city_whitelist.include?(point['City Name']) }.
    select { |in_city, _points| in_city }.
    map { |_in_city, points| points }

  route_segments.each do |segment|
    segment_total = segment.reduce(0) { |sum, point| sum + point['Step Distance'] }
    path_id = base_route_id + route_no

    segment.each.with_index(1) do |point, step|
      output << point.merge(
        'Total Distance' => segment_total,
        'Path Id' => path_id,
        'Step' => step
      ).values_at(*headers)
    end

    route_no += 1
  end
end
end