-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathvv_exporter.py
executable file
·204 lines (164 loc) · 6.22 KB
/
vv_exporter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python3
import re
import json
import helper
import warnings
from bs4 import BeautifulSoup
# Ignore every warning of category UserWarning for the rest of the run.
# NOTE(review): presumably this silences parser warnings emitted while
# scraping - confirm which library raises them.
warnings.simplefilter('ignore', UserWarning)
# Entry texts that must not be followed while crawling.
# These entries lead to the complete course catalogue (VV) of every department
# of TU Darmstadt; crawling all of those catalogues would take far too long.
# Some of them are also recursive: they link back to catalogues that were
# already visited, so following them would never terminate.
# Each text is listed exactly once.
BLACKLIST = (
    'Weitere Veranstaltungen',
    'Leistungen für den Masterstudiengang',  # PO 2009
    'Vorgezogene Masterleistungen',  # PO 2015
    'Zusätzliche Leistungen',
    'Anmelden',
    'Gesamtkatalog aller Module an der TU Darmstadt',
    'Informatik fachübergreifend',
    'Module des Sprachenzentrums mit Fachprüfungen',
    'Fachübergreifende Veranstaltungen',  # PO 2009
    'Fachübergreifende Lehrveranstaltungen',  # PO 2015
    'Studium Generale',  # PO 2023
    'Veranstaltung',
    'Zusätzliche Leistungen nach §20(2) APB',
    'Vorgezogene Masterleistungen nach §20(3) APB',
)
# Base URL, as returned by helper.get_tucan_baseurl(), that is prepended to
# every href scraped from a page to build a complete link.
BASE_URL = helper.get_tucan_baseurl()
def main():
    """Crawl the module catalogue and write it to 'modules.json'.

    The browser/page pair comes from helper.log_into_tucan_(); the crawled
    tree is serialised as indented JSON with sorted keys.
    """
    session, landing_page = helper.log_into_tucan_()
    catalogue = get_vv(session, landing_page, helper.get_tucan_baseurl())
    with open('modules.json', 'w+') as out_file:
        json.dump(catalogue, out_file, sort_keys=True, indent=4)
def details_from_element(element):
    """Describe the first anchor inside *element* as a crawl record.

    The record holds the stripped anchor text, the anchor's href prefixed
    with BASE_URL, two flags derived from the PRGNAME marker contained in
    that link, and an empty list that will receive the children.
    """
    anchor = element.find('a')
    caption = anchor.text.strip()
    target = BASE_URL + anchor.attrs['href']

    # The PRGNAME marker in the URL tells which kind of page the link opens.
    opens_registration = "PRGNAME=REGISTRATION" in target
    opens_module = "PRGNAME=MODULEDETAILS" in target

    return {
        'title': caption,
        'link': target,
        'isParent': opens_registration,
        'isModule': opens_module,
        'children': [],
    }
def get_table_with_caption(tables, caption):
    """Return the first table whose caption text contains *caption*.

    Tables that have no <caption> element are skipped, so a caption-less
    table in front of the wanted one no longer aborts the search with an
    IndexError.

    :param tables: iterable of elements offering select(css_selector).
    :param caption: text that must occur in the table's first caption.
    :return: the matching table, or None if there is none.
    """
    for table in tables:
        captions = table.select('caption')
        # Only the first caption is inspected, as before.
        if captions and caption in captions[0].text:
            return table
    return None
def get_links_of_table_with_caption(page, caption):
    """Collect the links found in the rows of the table captioned *caption*.

    Only 'table.tb' tables of *page* are considered. Every href is prefixed
    with BASE_URL. Returns a set of URLs, or None when no table matches.
    """
    match = get_table_with_caption(page.select('table.tb'), caption)
    if match:
        return {BASE_URL + anchor.attrs['href'] for anchor in match.select('tr a')}
    return None
def extract_rooms_and_times_of_module(course_page):
    """Return the 'Termine' table of a course page with its links emptied.

    The table is looked up among the 'table.tb' tables of *course_page*.
    Every anchor inside it gets an empty href (the table is modified in
    place). Returns None when the page has no such table.
    """
    appointments = get_table_with_caption(course_page.select('table.tb'), 'Termine')
    if appointments:
        for anchor in appointments.select('a'):
            anchor.attrs['href'] = ''
        return appointments
    return None
def get_all_links(page):
    """Return a crawl record for every followable entry of *page*.

    Entries are the list items and table rows below '#pageContent'. An entry
    is skipped when its stripped text is in BLACKLIST or when it contains no
    anchor at all.
    """
    SELECTOR = '#pageContent ul li, #pageContent table tr'
    records = []
    for entry in page.soup.select(SELECTOR):
        if entry.text.strip() in BLACKLIST:
            continue
        if len(entry.select('a')) == 0:
            continue
        records.append(details_from_element(entry))
    return records
# ....
def sanitize_detail(detail):
    """Clean the 'title' and 'details' strings of *detail* in place.

    The title loses its colons and surrounding whitespace. The details are
    reduced to plain text by BeautifulSoup; '<br/>' is swapped for '////'
    beforehand and restored as '<br/>' afterwards so that it is not removed
    together with the other markup. The same dict is returned.
    """
    # Both tables are applied top to bottom; each step works on the output
    # of the previous one, so the order must not be changed.
    literal_steps = (
        ('\t', ''),
        ('<br/>', '\n'),
        ('\n', '<br/>'),
        (':', '\b'),
        ('\b', ':'),
        ('\r', ''),
        ('////', '<br/>'),
    )
    pattern_steps = (
        (r'^:', ''),
        (r']$', ''),
        (r'(<br\/>)*$', ''),
        (r'^(<br\/>)*', ''),
        (r'\s{2,}', ''),
        (r'(<br\/>)*$', ''),
    )

    protected = detail['details'].replace('<br/>', '////')
    text = BeautifulSoup(protected, "html.parser").text

    detail['title'] = detail['title'].replace(':', '').strip()

    for old, new in literal_steps:
        text = text.replace(old, new).strip()
    for pattern, substitute in pattern_steps:
        text = re.sub(pattern, substitute, text).strip()

    detail['details'] = text
    return detail
def extract_module_details(html, browser):
    """Collect the detail entries of a module page.

    The cells matched by '#pageContent table:nth-of-type(1) .tbdata td' are
    stringified and cut at every '<b>'. Within each piece the text before
    '</b>' becomes the title and the text after it the body; both are cleaned
    by sanitize_detail().

    Afterwards the course pages linked from the table captioned 'Kurse' are
    fetched with browser.get(). If the 'Termine' table of the last fetched
    page has more than two rows, its string form is appended under the title
    'Kurstermine'. This second part is best effort: when it cannot be done a
    note is printed and the entries collected so far are still returned.

    :param html: parsed module page offering select().
    :param browser: object whose get(url) result exposes a .soup attribute.
    :return: list of {'title': ..., 'details': ...} dicts.
    """
    details_raw = html.select('#pageContent table:nth-of-type(1) .tbdata td')
    details = []
    for chunk in str(details_raw).split('<b>')[1:]:
        parts = chunk.split('</b>')
        details.append(sanitize_detail({"title": parts[0].strip(), "details": parts[1].strip()}))

    # Extract the appointments and rooms of the module.
    problem = None
    try:
        links = get_links_of_table_with_caption(html, 'Kurse')
        if not links:
            # None (no such table) or an empty set (table without links).
            problem = 'no course links found in a "Kurse" table'
        else:
            kurse_pages = [browser.get(link).soup for link in links]
            # NOTE(review): links is a set, so which page ends up last is
            # arbitrary; only that page's appointments are used.
            kurs_appointments = [extract_rooms_and_times_of_module(x) for x in kurse_pages][-1]
            if kurs_appointments is None:
                problem = 'no "Termine" table on the course page'
            elif len(kurs_appointments.select('tr')) > 2:
                details.append({'title': 'Kurstermine', 'details': str(kurs_appointments)})
    except Exception as e:
        # Deliberately broad: a failing fetch or unexpected markup must not
        # abort the whole crawl.
        problem = e
    if problem is not None:
        print('(Could not extract Kurstermine) - {}'.format(problem))
    return details
def extract_cp(link):
    """Return the credit points recorded in the details of a module.

    The first entry of link['details'] whose title contains 'Credits' is
    used; only the text before its first comma is considered.

    :param link: dict with a 'details' list of {'title', 'details'} dicts.
    :return: the credits as int; the raw text if it is not an integer;
             0 if there is no 'Credits' entry at all.
    """
    for detail in link['details']:
        if 'Credits' not in detail['title']:
            continue
        raw = detail['details'].split(',')[0]
        try:
            return int(raw)
        except ValueError:
            # Not a plain integer: hand the text back unchanged instead of
            # hiding every possible error behind a bare except.
            return raw
    return 0
def print_link(link):
    """Print the title of *link*, indented by one tab per depth level."""
    indent = '\t' * link['depth']
    label = '> {}'.format(link['title'])
    print(indent + label)
def crawl(browser, link):
    """Fill *link* with its subtree or its module data and return it.

    A parent entry is opened and every parent or module link found on its
    page is crawled recursively, one depth level deeper, and appended to
    link['children']. A module entry gets 'details' and 'credits' taken from
    its page. Entries that are neither are returned untouched.
    """
    if link['isParent']:
        # Parent entry: descend into everything it lists.
        print_link(link)
        listing = browser.get(link['link'])
        for entry in get_all_links(listing):
            # The page contains other links too; only follow the relevant ones.
            if not (entry['isParent'] or entry['isModule']):
                continue
            entry['depth'] = link['depth'] + 1
            link['children'].append(crawl(browser, entry))
    elif link['isModule']:
        # Regular module (= Kurs): read the data from the module page.
        module_page = browser.get(link['link'])
        link['details'] = extract_module_details(module_page.soup, browser)
        link['credits'] = extract_cp(link)
        print_link(link)
    return link
def walk_modules(vv, fn, only_children=True):
    """Apply *fn* to the nodes of a module tree and return the new root.

    :param vv: node dict with a 'children' list of further node dicts.
    :param fn: callable taking a node and returning the node to keep; the
               returned node must have a 'children' list as well.
    :param only_children: if True, *fn* is applied only to nodes that have
                          no children; if False, to every node.
    :return: the (possibly replaced) root node.

    The result of each recursive call is stored back into the children list.
    Previously it was assigned to the loop variable and lost, so an *fn* that
    returns a new node instead of mutating its argument had no effect below
    the root.
    """
    if not vv['children'] or not only_children:
        vv = fn(vv)
    children = vv['children']
    # Replace the elements in place so the list object itself is kept.
    for index, child in enumerate(children):
        children[index] = walk_modules(child, fn, only_children)
    return vv
def get_vv(browser, start_page, base_url):
    """Crawl the catalogue reachable from the 'Anmeldung' entry.

    The link of the 'li[title="Anmeldung"]' entry on *start_page* is used as
    the root of the crawl. After crawling, the bookkeeping fields 'link',
    'isParent', 'depth' and 'isModule' are deleted from every node.

    :return: the children of the synthetic 'Start' root node.
    """
    def strip_crawl_fields(node):
        # These keys are only needed while crawling.
        for key in ('link', 'isParent', 'depth', 'isModule'):
            del node[key]
        return node

    # anmeldung page
    entry_href = start_page.soup.select('li[title="Anmeldung"] a')[0].attrs['href']
    root = {
        'title': 'Start',
        'link': base_url + entry_href,
        'isParent': True,
        'isModule': False,
        'children': [],
        "depth": 0,
    }
    tree = crawl(browser, root)
    tree = walk_modules(tree, strip_crawl_fields, only_children=False)
    return tree['children']
# Start the export only when the file is executed as a script.
if __name__ == '__main__':
    main()