-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspider.py
93 lines (61 loc) · 2.56 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/python3
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_paper_ids(section):
    """Scrape the paper ids listed in one ICLR 2020 OpenReview section.

    Opens the conference group page for ``section`` in Firefox, waits up to
    5 seconds for the element with class ``submissions-list`` to be present,
    and takes one id from the ``href`` of every ``pdf-link`` element inside
    it (the text after the last ``id=``). The ids are appended, one per line,
    to ``open_review/iclr_accept/raw/{section}.txt``.

    Args:
        section: URL fragment selecting the list to open, e.g.
            ``'accept-spotlight'``; also used as the output file stem.

    Raises:
        selenium.common.exceptions.TimeoutException: if the submissions list
            does not appear within 5 seconds. Nothing is written in that
            case because the error propagates before the file is opened.
    """
    url = f'https://openreview.net/group?id=ICLR.cc/2020/Conference#{section}'
    driver = webdriver.Firefox(executable_path='assets/geckodriver')
    paper_ids = []
    try:
        # Navigation sits inside the try so the browser is closed even when
        # loading the page itself fails.
        driver.get(url)
        submissions = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "submissions-list"))
        )
        # find_elements(By..., ...) is used instead of the older
        # find_elements_by_* helpers, which newer Selenium releases dropped.
        for link in submissions.find_elements(By.CLASS_NAME, 'pdf-link'):
            href = link.get_attribute('href')
            if not href:
                # A link without an href carries no id to extract.
                continue
            paper_ids.append(href.split('id=')[-1].strip())
    finally:
        driver.quit()

    write_path = f'open_review/iclr_accept/raw/{section}.txt'
    # Make sure the target directory exists; open(..., 'a') will not create it.
    os.makedirs(os.path.dirname(write_path), exist_ok=True)
    # Append mode: ids from repeated runs accumulate in the same file.
    with open(write_path, 'a') as f:
        f.writelines(f"{paper_id}\n" for paper_id in paper_ids)
def get_revisions_from_paper_ids(section, limit=20):
    """Collect the revision PDF urls for the papers of one section.

    Reads paper ids (one per line) from
    ``open_review/iclr_accept/raw/{section}.txt``, opens the OpenReview
    revisions page of each one in Firefox, and gathers the ``href`` of every
    ``attachment-download-link`` element inside the ``submissions-list``
    element. Each href is rewritten by dropping ``&name=original_pdf`` and
    replacing ``attachment`` with ``pdf``. The results are appended to
    ``open_review/iclr_accept/raw/{section}/revisions.txt``.

    Args:
        section: Name of the section; selects both the input id file and the
            output folder.
        limit: Maximum number of paper ids to process, taken from the top of
            the id file. Defaults to 20; pass ``None`` to process every id.

    Raises:
        FileNotFoundError: if the id file for ``section`` does not exist.
        selenium.common.exceptions.TimeoutException: if a revisions page does
            not show its submissions list within 5 seconds. The browser is
            still closed, and nothing is written to the output file.
    """
    ids_path = f'open_review/iclr_accept/raw/{section}.txt'
    with open(ids_path, 'r') as f:
        # Blank lines would otherwise become requests for an empty id.
        paper_ids = [line.strip() for line in f if line.strip()]
    if limit is not None:
        paper_ids = paper_ids[:limit]

    paper_revisions = []
    driver = webdriver.Firefox(executable_path='assets/geckodriver')
    try:
        for paper_id in paper_ids:
            driver.get(f'https://openreview.net/revisions?id={paper_id}')
            listing = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "submissions-list"))
            )
            # find_elements(By..., ...) is used instead of the older
            # find_elements_by_* helpers, which newer Selenium releases dropped.
            links = listing.find_elements(By.CLASS_NAME, 'attachment-download-link')
            pdf_urls = []
            for link in links:
                href = link.get_attribute('href')
                if not href:
                    # Nothing to rewrite for a link without an href.
                    continue
                pdf_urls.append(
                    href.replace('&name=original_pdf', '').replace('attachment', 'pdf')
                )
            paper_revisions.append((paper_id, tuple(pdf_urls)))
    finally:
        # Always close the browser, including when a page times out.
        driver.quit()

    folder = f'open_review/iclr_accept/raw/{section}'
    os.makedirs(folder, exist_ok=True)
    write_path = f'{folder}/revisions.txt'
    # Append mode: results from repeated runs accumulate in the same file.
    with open(write_path, 'a') as f:
        for paper_id, revisions in paper_revisions:
            f.write(f"\nid: {paper_id}\n")
            # Urls are numbered in reverse page order, so "rev 0" is the last
            # link found on the page (presumably the oldest revision - confirm
            # against the page's ordering).
            for idx, rev in enumerate(reversed(revisions)):
                f.write(f"rev {idx}: {rev}\n")
if __name__ == "__main__":
    # Disabled step: get_paper_ids writes the id file for a section, which
    # get_revisions_from_paper_ids then reads. Re-enable it to rebuild that file.
    # get_paper_ids(section='accept-spotlight')
    get_revisions_from_paper_ids(section='accept-spotlight')