forked from TonyNguyenVn17/Food_Hunting
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_food_event.py
150 lines (120 loc) · 4.74 KB
/
get_food_event.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import time # Default lib in Python
from typing import List, Dict, Union, Set # Default lib in Python
from selenium.webdriver.common.by import By # From pip install
from selenium.webdriver.support.ui import WebDriverWait # From pip install
from selenium.webdriver.support import expected_conditions as EC # From pip install
from bs4 import BeautifulSoup # From pip install
from config import driver # Self defined
from Event import Event
# Shared explicit wait bound to the project driver, built with a timeout
# argument of 20; find_events() uses it to wait for the event list container.
WAIT = WebDriverWait(driver, 20)
def format_events(event_html: str) -> Dict[str, str]:
    """
    Parse the HTML of one event listing and return its information.

    event_html: HTML source containing a single event ``<li>`` element.
    Returns the result of ``Event.get_info()`` after the event's name, id,
    date, time, location and tags have been filled in from the markup.
    """
    event = Event()
    soup = BeautifulSoup(event_html, "html.parser")

    # descend through the wrapper containers down to the text-bearing body
    list_item = soup.find("li")
    body = (
        list_item.find("div", class_="listing-element")
        .find("div", class_="row")
        .find("div", class_="listing-element__title-block col-md-8")
        .find("div", class_="media")
        .find("div", class_="media-body")
    )

    # containers that hold each individual piece of information
    title = body.find("h3")
    details_row = body.find("div", class_="row")
    tag_group = body.find("div", role="group")
    headings = details_row.find_all("div", class_="media-heading")

    # the second heading holds two paragraphs: the date, then the time
    paragraphs = headings[1].find_all("p")
    date_text = paragraphs[0].text.strip()
    time_text = paragraphs[1].text.strip()
    raw_location = details_row.find("div", class_="col-md-4 col-lg-4").text

    # populate the Event object
    for span in tag_group.find_all("span"):
        event.tags.add(span.text)
    event.name = title.find("a").text
    event.id = list_item["id"]
    event.date = date_text
    event.time = time_text
    event.location = raw_location.strip()

    # hand the event back in the form produced by Event.get_info()
    return event.get_info()
def check_exists_by_ID(ID: str) -> bool:
    """
    Report whether the current page contains an element with the given ID.

    ID: value of the HTML ``id`` attribute to look for.
    Returns True when at least one matching element is found, False otherwise.

    Uses ``find_elements`` (plural), which yields an empty list when nothing
    matches, so no exception handling is needed. The previous bare ``except:``
    also swallowed KeyboardInterrupt and unrelated errors, which made the
    login polling loop hard to interrupt and hid real failures.
    """
    return bool(driver.find_elements(By.ID, ID))
def check_login() -> None:
    """
    Open the login page and block until the user has logged in manually.
    """
    login_url = 'https://www.campusgroups.com/shibboleth/login?idp=usf'
    # IDs of elements that are checked for on each poll; the loop keeps
    # waiting for as long as either one is still found on the page
    login_element_ids = ("loginHeader", "displayName")

    driver.get(login_url)
    time.sleep(5)

    # poll every 10 seconds until neither login element is present any more
    while any(check_exists_by_ID(element_id) for element_id in login_element_ids):
        print('Error: Need hooman authentication')
        time.sleep(10)
def open_food_page() -> None:
    """
    Navigate the driver to the food events page (call after logging in).
    """
    food_events_url = 'https://bullsconnect.usf.edu/events?topic_tags=7276307'

    driver.get(food_events_url)
    # short pause before resizing the window
    time.sleep(1)
    driver.maximize_window()
    # longer pause after maximizing, before the caller continues
    time.sleep(5)
def find_events() -> List[Dict[str, str]]:
    """
    Scrape the events page and return the information of every listed event.

    Waits for the ``divAllItems`` container to become visible, collects its
    ``<li>`` children, keeps those whose class contains ``list-group-item``
    and whose inline style does not contain ``display: none;``, and converts
    each one with ``format_events``.

    Returns a list of event dictionaries (the values produced by
    ``format_events``), or an empty list when the events could not be loaded.
    """
    # Local import keeps this function self-contained; selenium is already a
    # dependency of this module.
    from selenium.common.exceptions import WebDriverException

    # wait until the event container is visible, then grab the raw <li> nodes
    try:
        container = WAIT.until(
            EC.visibility_of_element_located((By.ID, 'divAllItems')))
        candidates = container.find_elements(By.TAG_NAME, "li")
    except WebDriverException:
        # Covers the wait timing out as well as other browser-side failures,
        # while letting KeyboardInterrupt and programming errors propagate.
        print("Events not loaded")
        return []

    # snapshot the HTML of every real, visible event entry
    events_source_list = []
    for item in candidates:
        # get_attribute() may return None; treat that as an empty string
        if "list-group-item" not in (item.get_attribute("class") or ""):
            continue
        if "display: none;" in (item.get_attribute("style") or ""):
            continue
        events_source_list.append(item.get_attribute("outerHTML"))

    # parse each HTML snapshot into an event dictionary
    return [format_events(source) for source in events_source_list]
if __name__ == "__main__":
    # Script entry point: log in, open the food events page, scrape the
    # events, store each one in the database and print everything stored.
    from db import FoodDatabase

    db = FoodDatabase()
    check_login()
    open_food_page()

    # The previous version also assembled a `data` dictionary per event that
    # was never read afterwards; that dead code has been removed.
    events = find_events()
    for event in events:
        db.add_event(event)

    print(db.get_all_event())