Skip to content

Commit bcd5b2f

Browse files
committed Feb 7, 2017
small tweaks to pdf download script
1 parent 2104222 commit bcd5b2f

File tree

2 files changed

+11
-11
lines changed

2 files changed

+11
-11
lines changed
 

‎download_pdfs.py

+10 -11
Original file line number | Diff line number | Diff line change
@@ -1,27 +1,26 @@
1+
import os
2+
import time
13
import pickle
2-
from urllib.request import urlopen
34
import shutil
4-
import time
5-
import os
65
import random
6+
from urllib.request import urlopen
77

8-
timeout_secs = 10 # after this many seconds we give up on a paper
8+
from utils import Config
99

10-
pdf_dir = os.path.join('data', 'pdf')
11-
db_path = 'db.p'
12-
if not os.path.exists(pdf_dir): os.makedirs(pdf_dir)
10+
timeout_secs = 10 # after this many seconds we give up on a paper
11+
if not os.path.exists(Config.pdf_dir): os.makedirs(Config.pdf_dir)
12+
have = set(os.listdir(Config.pdf_dir)) # get list of all pdfs we already have
1313

1414
numok = 0
1515
numtot = 0
16-
db = pickle.load(open(db_path, 'rb'))
17-
have = set(os.listdir(pdf_dir)) # get list of all pdfs we already have
16+
db = pickle.load(open(Config.db_path, 'rb'))
1817
for pid,j in db.items():
1918

2019
pdfs = [x['href'] for x in j['links'] if x['type'] == 'application/pdf']
2120
assert len(pdfs) == 1
2221
pdf_url = pdfs[0] + '.pdf'
2322
basename = pdf_url.split('/')[-1]
24-
fname = os.path.join(pdf_dir, basename)
23+
fname = os.path.join(Config.pdf_dir, basename)
2524

2625
# try retrieve the pdf
2726
numtot += 1
@@ -31,7 +30,7 @@
3130
req = urlopen(pdf_url, None, timeout_secs)
3231
with open(fname, 'wb') as fp:
3332
shutil.copyfileobj(req, fp)
34-
time.sleep(0.1 + random.uniform(0,0.2))
33+
time.sleep(0.05 + random.uniform(0,0.1))
3534
else:
3635
print('%s exists, skipping' % (fname, ))
3736
numok+=1

‎utils.py

+1
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
# -----------------------------------------------------------------------------
1010
class Config(object):
1111
db_path = 'db.p'
12+
pdf_dir = os.path.join('data', 'pdf')
1213

1314
# Context managers for atomic writes courtesy of
1415
# http://stackoverflow.com/questions/2333872/atomic-writing-to-file-with-python

0 commit comments

Comments
 (0)
Please sign in to comment.