small tweaks to pdf download script

karpathy · karpathy · commit bcd5b2fd532c · 2017-02-06T17:01:04.000-08:00
diff --git a/download_pdfs.py b/download_pdfs.py
@@ -1,27 +1,26 @@
+import os
+import time
 import pickle
-from  urllib.request import urlopen
 import shutil
-import time
-import os
 import random
+from  urllib.request import urlopen
 
-timeout_secs = 10 # after this many seconds we give up on a paper
+from utils import Config
 
-pdf_dir = os.path.join('data', 'pdf')
-db_path = 'db.p'
-if not os.path.exists(pdf_dir): os.makedirs(pdf_dir)
+timeout_secs = 10 # after this many seconds we give up on a paper
+if not os.path.exists(Config.pdf_dir): os.makedirs(Config.pdf_dir)
+have = set(os.listdir(Config.pdf_dir)) # set of filenames already present in the pdf directory
 
 numok = 0
 numtot = 0
-db = pickle.load(open(db_path, 'rb'))
-have = set(os.listdir(pdf_dir)) # get list of all pdfs we already have
+db = pickle.load(open(Config.db_path, 'rb'))
 for pid,j in db.items():
   
   pdfs = [x['href'] for x in j['links'] if x['type'] == 'application/pdf']
   assert len(pdfs) == 1
   pdf_url = pdfs[0] + '.pdf'
   basename = pdf_url.split('/')[-1]
-  fname = os.path.join(pdf_dir, basename)
+  fname = os.path.join(Config.pdf_dir, basename)
 
   # try retrieve the pdf
   numtot += 1
@@ -31,7 +30,7 @@
       req = urlopen(pdf_url, None, timeout_secs)
       with open(fname, 'wb') as fp:
           shutil.copyfileobj(req, fp)
-      time.sleep(0.1 + random.uniform(0,0.2))
+      time.sleep(0.05 + random.uniform(0,0.1))
     else:
       print('%s exists, skipping' % (fname, ))
     numok+=1
diff --git a/utils.py b/utils.py
@@ -9,6 +9,7 @@
 # -----------------------------------------------------------------------------
 class Config(object):
     db_path = 'db.p'
+    pdf_dir = os.path.join('data', 'pdf')
 
 # Context managers for atomic writes courtesy of
 # http://stackoverflow.com/questions/2333872/atomic-writing-to-file-with-python