Merge pull request #5 from hopcity/streamline_config_and_setup
cleaned up output with standard logging and swapped out wget for requ…
dylanraithel committed Aug 6, 2015
2 parents 30fef86 + ab9ba37 commit b646aa0
Showing 9 changed files with 96 additions and 146 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -65,3 +65,7 @@ cityscrape/*
 
 # Vagrant
 .vagrant/*
+
+# Tempfiles
+*.zip
+*.tmp
56 changes: 0 additions & 56 deletions access_dump.py

This file was deleted.

8 changes: 6 additions & 2 deletions cityscrape-setup.sh
File mode changed 100644 → 100755 (now executable)
@@ -6,8 +6,12 @@ CONFIGFILE="config/cityscrape-config.sh"
 . $CONFIGFILE
 
 # Create working directories for ingest
-mkdir zip_files
-mkdir stl_city
+if [ -d "$STL_CITY_DOWNLOAD_DIR" ]
+then
+    echo "$STL_CITY_DOWNLOAD_DIR already exists"
+else
+    mkdir $STL_CITY_DOWNLOAD_DIR
+fi
 
 # Need to install virtualenv first
 pip install virtualenv==1.10.1
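For comparison, the same existence-guarded directory creation sketched in Python (a minimal sketch; it assumes STL_CITY_DOWNLOAD_DIR is exported by the sourced config, and it is not part of this commit):

    import os

    # Create the download directory only if it is missing,
    # mirroring the guarded mkdir in cityscrape-setup.sh above.
    download_dir = os.environ['STL_CITY_DOWNLOAD_DIR']
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)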
8 changes: 4 additions & 4 deletions config/cityscrape-config.sh
@@ -13,9 +13,9 @@ export CITYSCRAPE_VIRTUALENV_DIR=$BASEDIR/.py-env
 
 # URL housing zip files of open city data
 # Is this the actual scrape URL? I don't think it is
-export SCRAPE_URL='https://github.com/hopcity/cityscrape/'
+export GITHUB_URL='https://github.com/hopcity/cityscrape/'
 export SOURCEFILE_URL='http://dynamic.stlouis-mo.gov/citydata/downloads/'
 
-# Directory in which scripts residek and a subfolder, 'zipFiles' exists
-# for temporary file download and extraction before loading into database
-export STL_CITY_DIR=$BASEDIR/stl_city
+# temporary file download and extraction before loading into database
+export OUTPUT_DIR=$BASEDIR/stl_city_files/

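The new Python entry point (see src/grab_all_files.py below) reads these exports straight from os.environ, so the config must be sourced before it runs. In outline:

    import os

    # Fails fast with a KeyError if config/cityscrape-config.sh
    # was not sourced before launching the script.
    OUTPUT_DIR = os.environ['OUTPUT_DIR']
    SOURCEFILE_URL = os.environ['SOURCEFILE_URL']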
55 changes: 0 additions & 55 deletions grab_all_files.py

This file was deleted.

8 changes: 4 additions & 4 deletions run-cityscrape.sh
@@ -1,6 +1,6 @@
 #!/bin/bash -e
 
-if [[ -z "$1" ]] : then
+if [[ -z "$1" ]] ; then
     echo "Cityscape requires at least 1 argument - the database to be used for the ingest (currently \"postgresql\")"
     exit 3
 fi
@@ -15,10 +15,10 @@ CONFIGFILE="config/cityscrape-config.sh"
 
 echo "Running Cityscrape"
 
-python $BASEDIR/grab_all_files.py
+python $BASEDIR/src/grab_all_files.py
 
-# Push zip_files to the bash path and unzip files
-pushd zip_files
+# Push path to stack (as opposed to cd) and unzip files
+pushd $STL_CITY_DOWNLOAD_DIR
 
 unzip -f "*.zip"
89 changes: 64 additions & 25 deletions src/grab_all_files.py
@@ -1,54 +1,93 @@
 '''
-@author: clayton.young
-deps: blah
+@author: clayton.young, dylan.raithel
+deps: run cityscrape-setup.sh
 '''
 
-from bs4 import BeautifulSoup
-import requests
+# Base Imports
 import re
-import wget
 import os
+import sys
+import logging
+
+# Third Party Imports
+from bs4 import BeautifulSoup
+import requests
+
+# Modules
+from util.log import configure_log
+
+# Globals
+OUTPUT_DIR = os.environ['OUTPUT_DIR']
+SOURCEFILE_URL = os.environ['SOURCEFILE_URL']
+
+# TODO: remove any and all references to hardcoded filepaths that aren't bootstrapped in the
+# config or the setup script
 
-def send_request():
+def get_soup():
     '''
-    input: None
-    output: None
+    Input: None
+    Output: html-like BeautifulSoup object
     '''
-    os.chdir('/home/clay/Downloads/stl_city')
-    source_files = "http://dynamic.stlouis-mo.gov/citydata/downloads/"
+    source_files = SOURCEFILE_URL
+
+    logger = logging.getLogger(__name__)
+    logger.info('Scraping ' + source_files + ' for .zip files')
 
-    print '\nScraping ' + source_files + ' for .zip files'
     resp = requests.get(source_files)
     encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
     soup = BeautifulSoup(resp.content, from_encoding=encoding)
-    os.chdir('zipFiles')
-    for link in soup.find_all('a', href=re.compile("\.zip")):
-        download_link = source_files + link['href']
-        if os.path.isfile(link['href']) == False:
-            print '\nDownloading ' + link['href']
-            wget.download(download_link)
 
-    print '\nAll *.zip files downloaded from ' + source_files
-    os.chdir('..')
+    return soup
+
+
+def get_files(soup):
+    '''
+    Input: html-like BeautifulSoup object
+    Output: Download a bunch o' files
+    '''
+    source_files = SOURCEFILE_URL
+
+    logger = logging.getLogger(__name__)
+
+    for endpoint in soup.find_all('a', href=re.compile("\.zip")):
+        link = endpoint['href']
+
+        if os.path.isfile(link) == False:
+            logger.info('Downloading ' + link)
+
+            filename = OUTPUT_DIR + link
+
+            download_link = source_files + link
+            logger.info('Http endpoint: {}'.format(download_link))
+
+            request = requests.get(download_link, stream=True)
+
+            with open(filename, 'wb') as f:
+                for chunk in request.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+                        f.flush
+
+    logger.info('All zip files downloaded from ' + source_files)
 
 
 def main():
     '''
-    Execute Main
+    Main module gathers new raw files for ingest into CityScrapeDB
     '''
+    configure_log()
+    logger = logging.getLogger(__name__)
+
+    logger.info('Executing initial page scrape to look for new files...')
+    soup = get_soup()
 
-    print 'executing request'
-    send_request()
+    logger.info('Fetching files now!')
+    get_files(soup)
 
-    print 'request complete!'
+    logger.info('CityScrape complete!')
 
 
 if __name__ == '__main__':
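Two details in the new download loop are worth flagging: f.flush lacks parentheses, so it is a bare attribute access that never flushes, and os.path.isfile(link) tests the current working directory even though files are written under OUTPUT_DIR, so re-runs can re-download existing files. A corrected sketch (illustrative only; download_zip is a hypothetical helper, not part of the commit):

    import os
    import logging
    import requests

    logger = logging.getLogger(__name__)

    def download_zip(source_files, link, output_dir):
        # Hypothetical helper; checks the real destination path before downloading.
        filename = os.path.join(output_dir, link)
        if os.path.isfile(filename):
            return

        download_link = source_files + link
        logger.info('Downloading %s', download_link)

        response = requests.get(download_link, stream=True)
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()  # the parentheses matter: a bare f.flush is a no-op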
Empty file added src/util/__init__.py
14 changes: 14 additions & 0 deletions src/util/log.py
@@ -0,0 +1,14 @@
+import logging
+import sys
+from datetime import datetime
+
+
+def configure_log():
+    """Sets up the log system to have a consistent format and output"""
+    format = "%(asctime)s|%(name)13s:%(levelname)-7s| %(message)s"
+
+    cur_date = datetime.now()
+    today_string = cur_date.strftime('%Y-%m-%d')
+
+    # Configure our logger so it has a useful format
+    logging.basicConfig(level=logging.DEBUG, format=format)
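
As used in src/grab_all_files.py above, the intended pattern is to call configure_log() once at startup, then fetch named loggers per module. Note that sys and today_string are unused as committed, presumably groundwork for a dated log-file handler. A minimal usage sketch:

    import logging
    from util.log import configure_log

    configure_log()
    logger = logging.getLogger(__name__)
    logger.info('CityScrape logging online')
    # prints e.g.: 2015-08-06 12:00:00,000|     __main__:INFO   | CityScrape logging online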
