theCrag-scraper.py
from bs4 import BeautifulSoup
import urllib2
import requests
import re
import tablib

# Global variables
# tablib Dataset that collects one row per scraped route
data = tablib.Dataset()
data.headers = ("Region of World", "Country", "State", "Name", "Type", "Grade")

# Fetch a page from thecrag.com and return it as a BeautifulSoup object
def request_url(url):
    url = "http://www.thecrag.com" + url
    user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    headers = {"User-Agent": user_agent}
    try:
        # make request
        req = urllib2.Request(url, headers=headers)
        res = urllib2.urlopen(req)
    except:
        raise
    else:
        soup = BeautifulSoup(res.read(), "html.parser")
        res.close()
        return soup
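
# Sketch (not used by the script): the already-imported `requests` library could
# fetch the same page. `request_url_requests` is a hypothetical helper name and an
# assumed equivalent of request_url above, not part of the original code.
def request_url_requests(url):
    url = "http://www.thecrag.com" + url
    headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
    res = requests.get(url, headers=headers)
    res.raise_for_status()  # surface HTTP errors instead of parsing an error page
    return BeautifulSoup(res.text, "html.parser")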

def scrape_thecrag(world):
    # request the top-level listing page for this world region
    url_ending = "/climbing/" + world
    soup = request_url(url_ending)
    # recurse through its areas down to the route listings
    scrape_areas(soup)


# Recursive function to get the areas and routes
def scrape_areas(soup):
    if soup is not None:
        if soup.find("div", {"class": "node-listview"}).find("h2", {"class": "inline"}).renderContents().strip() is not None:
            area_route = soup.find("div", {"class": "node-listview"}).find("h2", {"class": "inline"}).renderContents().strip()
            # Areas: recurse into every sub-area that lists at least one route
            if area_route == "Areas":
                areas = soup.find("div", {"class": "node-listview"}).findAll(lambda tag: tag.name == 'div' and tag.get('class') == ['area'])
                for area in areas:
                    climbs = area.find("div", {"class": "stats"}).find("div", {"class": "routes"})
                    for climb in climbs:
                        climb = climb.replace(',', '')
                        if 'k' not in climb:
                            if float(climb) > 0:
                                class_names = area.find("div", {"class": "name"})
                                anchor = class_names.find('a')
                                if anchor is not None:
                                    new_soup = request_url(anchor['href'])
                                    scrape_areas(new_soup)
                        else:
                            # counts such as "1.2k" carry a trailing 'k'; drop it before comparing
                            climb = climb[:-1]
                            if float(climb) > 0:
                                class_names = area.find("div", {"class": "name"})
                                anchor = class_names.find('a')
                                if anchor is not None:
                                    new_soup = request_url(anchor['href'])
                                    scrape_areas(new_soup)
            # Routes: record every route listed on this page
            if area_route == "Routes":
                # breadcrumbs hold the world region, country and (optionally) state
                region = soup.find("div", {"id": "breadCrumbs"}).find('ul').findAll('li')
                world = region[1].find('a').find("span").renderContents().strip()
                country = region[2].find('a').find("span").renderContents().strip()
                state = ""
                if len(region) > 3 and region[3].find('a') is not None:
                    state = region[3].find('a').find("span").renderContents().strip()
                # get routes
                routes = soup.find("div", {"class": "node-listview"}).findAll(lambda tag: tag.name == 'div' and tag.get('class') == ['route'])
                for route in routes:
                    [s.extract() for s in route('img')]  # strip inline images before reading the text
                    extract_info(route, world, country, state)
                # checkpoint the collected data after every processed page
                with open('/Users/tbhall/Documents/MyProjects/Social/scraper/Database_Routes_World.csv', 'wb') as f:
                    f.write(data.csv)


# extracts the name of route, type of route, and the grade
def extract_info(route, world, country, state):
    name = route.find("div", {"class": "title"}).find("span", {"class": "name"}).find('a').renderContents().strip()
    type_climb = route.find("div", {"class": "title"}).find("span", {"class": "flags"}).find('span').renderContents().strip()
    grade = route.find("div", {"class": "title"}).find("span", {"class": "r-grade"}).find('span').renderContents().strip()
    route_append(world, country, state, name, type_climb, grade)


# appends one row to the dataset
def route_append(world, country, state, name, type_climb, grade):
    data.append((world, country, state, name, type_climb, grade))


# Execution
World_Places = ["north-america"]  # "australia", "new-zealand", "pacific-islands", "europe", "north-america", "central-america", "south-america", "africa", "asia", "middle-east", "artic-region", "antarctica"
for places in World_Places:
    scrape_thecrag(places)

with open('/Users/tbhall/Documents/MyProjects/Social/scraper/Database_Routes_World.csv', 'wb') as f:
    f.write(data.csv)
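
# Assumed usage (not stated in the source): run under Python 2 (urllib2 is
# Python-2 only) with bs4 and tablib installed, e.g.
#   python theCrag-scraper.py
# The CSV output path above is hard-coded, so it must exist on the machine running the script.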