Skip to content

Commit 1e10244

Browse files
Added solar website scraper
Contains file for scraping from solar website
1 parent d843cd4 commit 1e10244

File tree

2 files changed

+103
-0
lines changed

2 files changed

+103
-0
lines changed

chromedriver

14.7 MB
Binary file not shown.

solar.py

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# import requests
2+
# import selenium
3+
from bs4 import BeautifulSoup
4+
from selenium import webdriver
5+
from selenium.webdriver.common.keys import Keys
6+
from selenium.webdriver.common.by import By
7+
from bs4 import BeautifulSoup
8+
import time
9+
from geopy.geocoders import Nominatim
10+
# from googlemaps import GoogleMaps
11+
12+
13+
def coordinates(address):
    """Geocode a street address into a ``(latitude, longitude)`` pair.

    Args:
        address: Free-form address string handed to the Nominatim geocoder.

    Returns:
        Tuple ``(latitude, longitude)`` read from the geocoder's result.

    Raises:
        ValueError: If the geocoder finds no match for ``address``.
    """
    geolocator = Nominatim(user_agent="Solar Calc")
    location = geolocator.geocode(address)
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on ``None.latitude``.
    if location is None:
        raise ValueError("Could not geocode address: {!r}".format(address))
    return location.latitude, location.longitude
17+
18+
19+
def get_full_url(address1):
    """Build the Sunroof "building" page URL for an address.

    The address is geocoded via ``coordinates`` and the resulting latitude and
    longitude become consecutive path segments, followed by the ``#?f=buy``
    fragment.

    Args:
        address1: Address string to geocode.

    Returns:
        The assembled URL as a string.
    """
    lat, lng = coordinates(address1)
    root = "https://sunroof.withgoogle.com/building/"
    fragment = '#?f=buy'
    return "{}{}/{}/{}".format(root, lat, lng, fragment)
23+
24+
25+
def _element_text(driver, xpath):
    """Return the ``.text`` of the single element located by ``xpath``."""
    return driver.find_element(By.XPATH, xpath).text


def get_data(address="23 Harvard St Somerville, MA 02143",
             driver_path='/Users/michaelfedotov/Documents/HackHarvard/chromedriver'):
    """Scrape solar figures for an address from the Sunroof building page.

    Opens the URL produced by ``get_full_url(address)`` in headless Chrome,
    reads a fixed set of elements by absolute XPath, prints the collected
    data and returns it.

    Args:
        address: Address to look up. Defaults to the address that was
            previously hard-coded in this function.
        driver_path: Filesystem path to the chromedriver executable. Defaults
            to the path that was previously hard-coded in this function.

    Returns:
        A dict with three sub-dicts of element text:
        ``"main_date"`` (keys ``num_hours_solar``, ``sqft_avail``,
        ``savings``), ``"envirionmental_impact"`` (keys ``carbon-dioxide``,
        ``passenger-cars``, ``tree-seedlings``) and ``"cost"`` (keys
        ``upfront-with-incentives``, ``20-year-benefits-``,
        ``years-til-payback``, ``up-front-cost-of-installation``).

    Raises:
        selenium.common.exceptions.NoSuchElementException: If any of the
            expected elements is not present on the page.
    """
    # Shared prefixes of the absolute XPaths used below.
    page = "/html/body/div[1]/address-view/div[1]/div/div"
    summary = page + "/section[1]/div[2]"
    impact_cells = page + "/section[2]/div/md-content[2]/md-card/md-card-content/div"
    cost_card = page + "/section[2]/div/md-content[3]/md-card/md-card-content"

    op = webdriver.ChromeOptions()
    op.add_argument('headless')
    # NOTE(review): passing the driver path positionally matches the original
    # call; newer Selenium releases may require a Service object instead --
    # confirm against the installed Selenium version.
    driver = webdriver.Chrome(driver_path, options=op)
    try:
        driver.get(get_full_url(address))

        main_data = {
            'num_hours_solar': _element_text(driver, summary + "/md-card[1]/ul/li[1]/div[2]"),
            'sqft_avail': _element_text(driver, summary + "/md-card[1]/ul/li[2]/div[2]"),
            'savings': _element_text(driver, summary + "/md-card[2]/div[1]/div[1]"),
        }

        envirionmental_impact = {
            'carbon-dioxide': _element_text(driver, impact_cells + "/place-metrics-cell[1]/div[2]"),
            'passenger-cars': _element_text(driver, impact_cells + "/place-metrics-cell[2]/div[2]"),
            'tree-seedlings': _element_text(driver, impact_cells + "/place-metrics-cell[3]/div[2]"),
        }

        # cost-cell[3] is not collected.
        cost = {
            'upfront-with-incentives': _element_text(driver, cost_card + "/div/cost-cell[1]/div[1]"),
            '20-year-benefits-': _element_text(driver, cost_card + "/div/cost-cell[2]/div[1]"),
            'years-til-payback': _element_text(driver, cost_card + "/div/cost-cell[4]/div[1]"),
            'up-front-cost-of-installation': _element_text(
                driver, cost_card + "/show-more/div/table/tbody/tr[1]/td[2]/span"),
        }
    finally:
        # Always end the browser session, even when a lookup above raises,
        # so no headless Chrome / chromedriver process is left behind.
        driver.quit()

    # NOTE: the key spellings "main_date" and "envirionmental_impact" are kept
    # unchanged so existing consumers of this dict keep working.
    data_dict = {
        "main_date": main_data,
        "envirionmental_impact": envirionmental_impact,
        "cost": cost,
    }

    print(data_dict)
    return data_dict
79+
80+
def send_data():
    """Return the headline solar figures scraped by ``get_data``.

    ``get_data()`` returns a dict of sub-dicts; unpacking it directly into
    three names would yield its *keys*, not the scraped values. The three
    figures are therefore read from the ``"main_date"`` sub-dict.

    Returns:
        A dict with keys ``elemNumHours``, ``sqft_avail`` and ``savings``
        holding the corresponding element text from the page.
    """
    main_data = get_data()["main_date"]

    json_out = {
        'elemNumHours': main_data['num_hours_solar'],
        'sqft_avail': main_data['sqft_avail'],
        'savings': main_data['savings'],
    }

    # TODO: the note left here in the original was truncated ("todo ad").
    return json_out
88+
89+
# function for monthly electricity
90+
91+
92+
93+
94+
95+
96+
97+
98+
99+
100+
101+
102+
103+

0 commit comments

Comments
 (0)