main.py
import time

import psycopg2
import requests
from bs4 import BeautifulSoup
from prefect import flow, task
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

conn = psycopg2.connect(
    database="my_database", user='user', password='password', host='localhost', port='5433'
)
cursor = conn.cursor()
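
# The connection above is opened once at import time and shared by every task.
# A minimal sketch of reading the credentials from the environment instead
# (the PG* variable names here are an assumption for illustration, not part of
# the original script):
#
#     import os
#     conn = psycopg2.connect(
#         database=os.environ.get("PGDATABASE", "my_database"),
#         user=os.environ.get("PGUSER", "user"),
#         password=os.environ.get("PGPASSWORD", "password"),
#         host=os.environ.get("PGHOST", "localhost"),
#         port=os.environ.get("PGPORT", "5433"),
#     )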

@task
def get_urls():
    """Scrape the Ask HN front page and return each post's URL and title."""
    url = "https://news.ycombinator.com/ask"
    page = requests.get(url)
    html = BeautifulSoup(page.content, "html.parser")
    parent_elements = html.find_all('span', class_='titleline')
    urls = []
    for el in parent_elements:
        url_append = el.find('a').get('href')
        title = el.find('a').text
        # Hrefs on the Ask page are relative (item?id=...), so prefix the site root
        new_url = 'https://news.ycombinator.com/' + url_append
        urls.append({'url': new_url, 'title': title})
    return urls
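
# requests.get can fail transiently while scraping; if retries are wanted,
# Prefect supports them directly on the decorator -- a sketch (both numbers
# are arbitrary choices, not part of the original script):
#
#     @task(retries=2, retry_delay_seconds=30)
#     def get_urls():
#         ...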

@task
def get_comments(urls):
    """Scrape the body of each question and all of its comments."""
    final_obj = []
    for obj in urls:
        time.sleep(3)  # pause between requests to avoid hammering the site
        page = requests.get(obj.get('url'))
        html = BeautifulSoup(page.content, "html.parser")
        body = html.find('div', class_='toptext')
        body = body.text if body is not None else ''
        comments = html.find_all('div', class_='comment')
        comments_txt = [comment.text for comment in comments]
        final_obj.append({
            'url': obj.get('url'),
            'title': obj.get('title'),
            'body': body,
            'comments': comments_txt,
        })
    return final_obj
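
# Each element returned by get_comments has the shape
# {'url': str, 'title': str, 'body': str, 'comments': list[str]},
# which is exactly what write_to_db below unpacks per row.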

@task
def write_to_db(final_obj):
    """Insert the scraped Hacker News rows into the web_data table."""
    query = """
        INSERT INTO web_data (url, title, body, comments)
        VALUES (%s, %s, %s, %s);
    """
    print(f"writing {len(final_obj)} rows to postgresql")
    for row in final_obj:
        data = (row['url'], row['title'], row['body'], row['comments'])
        cursor.execute(query, data)
    conn.commit()
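
# A sketch of the table the INSERT above assumes (column names come from the
# query; the types are assumptions -- psycopg2 adapts the Python list bound to
# 'comments' into a PostgreSQL array, hence text[]):
#
#     CREATE TABLE web_data (
#         url      text,
#         title    text,
#         body     text,
#         comments text[]
#     );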

@task
def write_to_db_rightmove(final_obj):
    """Insert the scraped Rightmove rows into the rightmove table."""
    query = """
        INSERT INTO rightmove (price, property_type, bedrooms)
        VALUES (%s, %s, %s);
    """
    print(f"writing {len(final_obj)} rows to postgresql")
    for row in final_obj:
        data = (row['price'], row['property_type'], row['bedrooms'])
        cursor.execute(query, data)
    conn.commit()
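
# Likewise, a sketch of the rightmove table this task assumes (types inferred
# from the int/str values built in get_properties below):
#
#     CREATE TABLE rightmove (
#         price         integer,
#         property_type text,
#         bedrooms      integer
#     );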

@flow(log_prints=True, name='hackernews')
def scraping():
    """Scrape https://news.ycombinator.com/ask every 24 hours."""
    print('starting to scrape')
    urls_list = get_urls()
    final_obj = get_comments(urls_list)
    write_to_db(final_obj)

@task
def get_properties():
    """
    Scrape https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=OUTCODE%5E1865&index=24&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords=
    and save each listing's price, property type and bedroom count to PostgreSQL.
    """
    options = Options()
    options.add_argument("--headless")  # run Chrome without a visible window
    options.add_argument("--no-sandbox")  # needed when running as root, e.g. in containers
    options.add_argument("--disable-dev-shm-usage")  # avoid crashes when /dev/shm is small
    service = Service(ChromeDriverManager().install())  # download ChromeDriver if not already cached
    url = 'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=OUTCODE%5E1865&sortType=6&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords='
    data_obj = []
    with webdriver.Chrome(service=service, options=options) as driver:
        driver.get(url)
        html = driver.page_source
    html_parsed = BeautifulSoup(html, "html.parser")
    parent_property_cards = html_parsed.find_all('div', class_='propertyCard')
    for rightmove_property in parent_property_cards:
        price = rightmove_property.find(class_='propertyCard-priceValue').text
        property_info = rightmove_property.find("div", class_="property-information")
        house_type = property_info.find("span", class_="text").text
        bedrooms = property_info.find("span", class_="bed-icon").find("title").text
        bedrooms_stripped = int(bedrooms.replace(' bedrooms', '').replace(' bedroom', ''))
        stripped_price = int(price.replace("£", "").replace(",", ""))
        data_obj.append({'price': stripped_price, 'property_type': house_type, 'bedrooms': bedrooms_stripped})
    return data_obj
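
# Note: the class names above ('propertyCard', 'propertyCard-priceValue',
# 'property-information', 'text', 'bed-icon') reflect Rightmove's markup at the
# time of writing; if the site changes its HTML, these selectors will break.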

@flow(log_prints=True, name='rightmove')
def scraping_rightmove():
    """Scrape Rightmove listings and store them in PostgreSQL."""
    properties = get_properties()
    write_to_db_rightmove(properties)

if __name__ == "__main__":
    scraping.serve(name="hn-rightmove-deployment", cron="15 0 * * *")
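
# Only the hackernews flow is served above; scraping_rightmove is defined but
# never scheduled. A sketch of serving both from one process (assumes a Prefect
# version with Flow.to_deployment and prefect.serve, i.e. 2.13+; the second
# deployment's name and cron are arbitrary choices):
#
#     from prefect import serve
#
#     serve(
#         scraping.to_deployment(name="hn-deployment", cron="15 0 * * *"),
#         scraping_rightmove.to_deployment(name="rightmove-deployment", cron="15 1 * * *"),
#     )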