trail-scraper.py
# NOTE: this script targets Python 2 (urllib2 was removed in Python 3).
from bs4 import BeautifulSoup
import urllib2
import tablib

# Global dataset: tablib gives us a small in-memory table that can be
# exported straight to CSV.
data = tablib.Dataset()
data.headers = ('Name', 'Location', 'State(s)', 'Counties', 'Type', 'Length', 'Uses')
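# A minimal sketch of how rows accumulate in the dataset (values are
# illustrative, not taken from the live site):
#
#   data.append(("Example Trail", "Springfield", "IL", "Sangamon",
#                "Hiking", "12 miles", "Hiking, Biking"))
#   print data.csv   # renders the headers plus every appended row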
# Main trail database
def scrape_national_trail_database():
    # Construct the URL for the request.
    # Smaller test run of 15 trails:
    # url = "http://www.americantrails.org/NRTDatabase/trailList.php?usrSortOrder=TrailName&maxRows_rsTrails=15&usrTrailName=&usrTrailState=&usrTrailUse=&usrTrailCounty=&usrTrailUse2=&usrAgency=&usrYearDesignated=&usrTrailLength=0&usrTrailQuery=&usrTrailType="
    url = "http://www.americantrails.org/NRTDatabase/trailList.php?usrSortOrder=TrailName&maxRows_rsTrails=15000&usrTrailName=&usrTrailState=&usrTrailUse=&usrTrailCounty=&usrTrailUse2=&usrAgency=&usrYearDesignated=&usrTrailLength=0&usrTrailQuery=&usrTrailType="
    user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    headers = {"User-Agent": user_agent}
    try:
        # Make the request.
        req = urllib2.Request(url, headers=headers)
        res = urllib2.urlopen(req)
    except urllib2.URLError:
        raise
    soup = BeautifulSoup(res.read(), "html.parser")
    res.close()
    # The results page alternates between two row styles (shaded and
    # unshaded); scan both to cover every trail in the US.
    row_styles = (
        "width:740px;float:left; background:#f0f8e2; padding: 10px 30px;",
        "width:740px;float:left; padding: 10px 30px;",
    )
    for style in row_styles:
        for trail in soup.findAll("div", {"style": style}):
            # The <p> with a boldBlue anchor holds the trail-detail URL ending.
            for body in trail.findAll("p"):
                anchor = body.find("a", {"class": "boldBlue"})
                if anchor is not None:
                    scrape_trail_info_americantrails(anchor["href"])
    # Adjust this output path for your machine.
    with open('/Users/tbhall/Documents/MyProjects/Social/scraper/Database_Trails_US.csv', 'wb') as f:
        f.write(data.csv)
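# With the tablib versions contemporary with this script, data.csv is a
# property that re-renders the whole dataset on each access, so the file
# written above contains every row appended during the scrape.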
# Information on each trail
def scrape_trail_info_americantrails(path):
    user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    headers = {"User-Agent": user_agent}
    try:
        # Make the request for the trail-detail page.
        url = "http://www.americantrails.org/NRTDatabase/" + path
        req = urllib2.Request(url, headers=headers)
        res = urllib2.urlopen(req)
    except urllib2.URLError:
        raise
    soup = BeautifulSoup(res.read(), "html.parser")
    res.close()
    # Extract the trail name and the info table.
    trail_head = soup.find("div", {"id": "innerContent"}).find('h1')
    trail_rows = soup.find("div", {"id": "innerContent"}).table.findAll('tr')
    trail_head = trail_head.renderContents().strip()
    # Add the new trail to the payload.
    create_payload(trail_rows, trail_head)
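# The detail page's table is assumed to list one <tr> per field, in the
# fixed order Location, State(s), Counties, Type, Length, Uses; the
# positional indexing below depends on that layout.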
def create_payload(rows, trail_head):
    # Each <tr> holds a label cell and a value cell; index rows by position.
    location = rows[0].findAll('td')
    states = rows[1].findAll('td')
    counties = rows[2].findAll('td')
    trail_type = rows[3].findAll('td')
    length = rows[4].findAll('td')
    uses = rows[5].findAll('td')
    # Pull the data out of the second <td> (the value cell).
    location = location[1].renderContents().strip()
    states = states[1].renderContents().strip()
    counties = counties[1].renderContents().strip()
    trail_type = trail_type[1].renderContents().strip()
    # <br> tags separate the type words; split() also discards blanks.
    trail_type = trail_type.replace("<br>", ' ').replace("<br/>", ' ').replace("</br>", ' ').split()
    length = length[1].renderContents().strip()
    uses = uses[1].renderContents().strip()
    # <br> tags separate the individual uses; split on them and drop blanks.
    uses = uses.replace("<br>", '$').replace("<br/>", '$').replace("</br>", '$').split('$')
    uses = [u.strip() for u in uses if u.strip()]
    # Append the new trail, joining the lists so the CSV holds plain strings.
    trail_append(trail_head, location, states, counties,
                 ' '.join(trail_type), length, ', '.join(uses))
# Append a trail to the global dataset.
def trail_append(name, location, state, counties, trail_type, length, uses):
    data.append((name, location, state, counties, trail_type, length, uses))
# Run the scrape when executed as a script.
if __name__ == "__main__":
    scrape_national_trail_database()
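# Note: urllib2 only exists on Python 2. If this ever needs to run on
# Python 3, the request step could be swapped for the requests library
# (the original file imported it but never used it), e.g.:
#
#   import requests
#   res = requests.get(url, headers=headers)
#   soup = BeautifulSoup(res.text, "html.parser")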