|
| 1 | +import os |
| 2 | +import time |
1 | 3 | import pickle
|
2 |
| -from urllib.request import urlopen |
3 | 4 | import shutil
|
4 |
| -import time |
5 |
| -import os |
6 | 5 | import random
|
| 6 | +from urllib.request import urlopen |
7 | 7 |
|
8 |
| -timeout_secs = 10 # after this many seconds we give up on a paper |
| 8 | +from utils import Config |
9 | 9 |
|
10 |
| -pdf_dir = os.path.join('data', 'pdf') |
11 |
| -db_path = 'db.p' |
12 |
| -if not os.path.exists(pdf_dir): os.makedirs(pdf_dir) |
| 10 | +timeout_secs = 10 # after this many seconds we give up on a paper |
| 11 | +if not os.path.exists(Config.pdf_dir): os.makedirs(Config.pdf_dir) |
| 12 | +have = set(os.listdir(Config.pdf_dir)) # get list of all pdfs we already have |
13 | 13 |
|
14 | 14 | numok = 0
|
15 | 15 | numtot = 0
|
16 |
| -db = pickle.load(open(db_path, 'rb')) |
17 |
| -have = set(os.listdir(pdf_dir)) # get list of all pdfs we already have |
| 16 | +db = pickle.load(open(Config.db_path, 'rb')) |
18 | 17 | for pid,j in db.items():
|
19 | 18 |
|
20 | 19 | pdfs = [x['href'] for x in j['links'] if x['type'] == 'application/pdf']
|
21 | 20 | assert len(pdfs) == 1
|
22 | 21 | pdf_url = pdfs[0] + '.pdf'
|
23 | 22 | basename = pdf_url.split('/')[-1]
|
24 |
| - fname = os.path.join(pdf_dir, basename) |
| 23 | + fname = os.path.join(Config.pdf_dir, basename) |
25 | 24 |
|
26 | 25 | # try to retrieve the pdf
|
27 | 26 | numtot += 1
|
|
31 | 30 | req = urlopen(pdf_url, None, timeout_secs)
|
32 | 31 | with open(fname, 'wb') as fp:
|
33 | 32 | shutil.copyfileobj(req, fp)
|
34 |
| - time.sleep(0.1 + random.uniform(0,0.2)) |
| 33 | + time.sleep(0.05 + random.uniform(0,0.1)) |
35 | 34 | else:
|
36 | 35 | print('%s exists, skipping' % (fname, ))
|
37 | 36 | numok+=1
|
|
0 commit comments