Merge pull request #5 from hopcity/streamline_config_and_setup
cleaned up output with standard logging and swapped out wget for requ…
dylanraithel committed Aug 6, 2015
2 parents 30fef86 + ab9ba37 commit b646aa0
Showing 9 changed files with 96 additions and 146 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -65,3 +65,7 @@ cityscrape/*
 
 # Vagrant
 .vagrant/*
+
+# Tempfiles
+*.zip
+*.tmp
56 changes: 0 additions & 56 deletions access_dump.py

This file was deleted.

8 changes: 6 additions & 2 deletions cityscrape-setup.sh
File mode changed 100644 → 100755 (now executable)
@@ -6,8 +6,12 @@ CONFIGFILE="config/cityscrape-config.sh"
 . $CONFIGFILE
 
 # Create working directories for ingest
-mkdir zip_files
-mkdir stl_city
+if [ -d "$STL_CITY_DOWNLOAD_DIR" ]
+then
+    echo "$STL_CITY_DOWNLOAD_DIR already exists"
+else
+    mkdir $STL_CITY_DOWNLOAD_DIR
+fi
 
 # Need to install virtualenv first
 pip install virtualenv==1.10.1
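For comparison, the same existence-guarded directory creation sketched in Python (a minimal sketch; it assumes STL_CITY_DOWNLOAD_DIR is exported by the sourced config, and it is not part of this commit):

    import os

    # Create the download directory only if it is missing,
    # mirroring the guarded mkdir in cityscrape-setup.sh above.
    download_dir = os.environ['STL_CITY_DOWNLOAD_DIR']
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)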
8 changes: 4 additions & 4 deletions config/cityscrape-config.sh
@@ -13,9 +13,9 @@ export CITYSCRAPE_VIRTUALENV_DIR=$BASEDIR/.py-env
 
 # URL housing zip files of open city data
 # Is this the actual scrape URL? I don't think it is
-export SCRAPE_URL='https://github.com/hopcity/cityscrape/'
+export GITHUB_URL='https://github.com/hopcity/cityscrape/'
 export SOURCEFILE_URL='http://dynamic.stlouis-mo.gov/citydata/downloads/'
 
-# Directory in which scripts residek and a subfolder, 'zipFiles' exists
-# for temporary file download and extraction before loading into database
-export STL_CITY_DIR=$BASEDIR/stl_city
+# temporary file download and extraction before loading into database
+export OUTPUT_DIR=$BASEDIR/stl_city_files/

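The new Python entry point (see src/grab_all_files.py below) reads these exports straight from os.environ, so the config must be sourced before it runs. In outline:

    import os

    # Fails fast with a KeyError if config/cityscrape-config.sh
    # was not sourced before launching the script.
    OUTPUT_DIR = os.environ['OUTPUT_DIR']
    SOURCEFILE_URL = os.environ['SOURCEFILE_URL']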
55 changes: 0 additions & 55 deletions grab_all_files.py

This file was deleted.

8 changes: 4 additions & 4 deletions run-cityscrape.sh
@@ -1,6 +1,6 @@
 #!/bin/bash -e
 
-if [[ -z "$1" ]] : then
+if [[ -z "$1" ]] ; then
     echo "Cityscape requires at least 1 argument - the database to be used for the ingest (currently \"postgresql\")"
     exit 3
 fi
@@ -15,10 +15,10 @@ CONFIGFILE="config/cityscrape-config.sh"
 
 echo "Running Cityscrape"
 
-python $BASEDIR/grab_all_files.py
+python $BASEDIR/src/grab_all_files.py
 
-# Push zip_files to the bash path and unzip files
-pushd zip_files
+# Push path to stack (as opposed to cd) and unzip files
+pushd $STL_CITY_DOWNLOAD_DIR
 
 unzip -f "*.zip"
89 changes: 64 additions & 25 deletions src/grab_all_files.py
@@ -1,54 +1,93 @@
 '''
-@author: clayton.young
-deps: blah
+@author: clayton.young, dylan.raithel
+deps: run cityscrape-setup.sh
 '''
 
-from bs4 import BeautifulSoup
-import requests
+# Base Imports
 import re
-import wget
 import os
+import sys
+import logging
+
+# Third Party Imports
+from bs4 import BeautifulSoup
+import requests
+
+# Modules
+from util.log import configure_log
+
+# Globals
+OUTPUT_DIR = os.environ['OUTPUT_DIR']
+SOURCEFILE_URL = os.environ['SOURCEFILE_URL']
+
+# TODO: remove any and all references to hardcoded filepaths that aren't bootstrapped in the
+# config or the setup script
 
-def send_request():
+def get_soup():
     '''
-    input: None
-    output: None
+    Input: None
+    Output: html-like BeautifulSoup object
     '''
-    os.chdir('/home/clay/Downloads/stl_city')
-    source_files = "http://dynamic.stlouis-mo.gov/citydata/downloads/"
+    source_files = SOURCEFILE_URL
+
+    logger = logging.getLogger(__name__)
+    logger.info('Scraping ' + source_files + ' for .zip files')
 
-    print '\nScraping ' + source_files + ' for .zip files'
     resp = requests.get(source_files)
     encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
     soup = BeautifulSoup(resp.content, from_encoding=encoding)
-    os.chdir('zipFiles')
-    for link in soup.find_all('a', href=re.compile("\.zip")):
-        download_link = source_files + link['href']
-        if os.path.isfile(link['href']) == False:
-            print '\nDownloading ' + link['href']
-            wget.download(download_link)
 
-    print '\nAll *.zip files downloaded from ' + source_files
-    os.chdir('..')
+    return soup
+
+
+def get_files(soup):
+    '''
+    Input: html-like BeautifulSoup object
+    Output: Download a bunch o' files
+    '''
+    source_files = SOURCEFILE_URL
+
+    logger = logging.getLogger(__name__)
+
+    for endpoint in soup.find_all('a', href=re.compile("\.zip")):
+        link = endpoint['href']
+
+        if os.path.isfile(link) == False:
+            logger.info('Downloading ' + link)
+
+            filename = OUTPUT_DIR + link
+
+            download_link = source_files + link
+            logger.info('Http endpoint: {}'.format(download_link))
+
+            request = requests.get(download_link, stream=True)
+
+            with open(filename, 'wb') as f:
+                for chunk in request.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+                        f.flush
+
+    logger.info('All zip files downloaded from ' + source_files)
 
 
 def main():
     '''
-    Execute Main
+    Main module gathers new raw files for ingest into CityScrapeDB
     '''
+    configure_log()
+    logger = logging.getLogger(__name__)
+
+    logger.info('Executing initial page scrape to look for new files...')
+    soup = get_soup()
 
-    print 'executing request'
-    send_request()
+    logger.info('Fetching files now!')
+    get_files(soup)
 
-    print 'request complete!'
+    logger.info('CityScrape complete!')
 
 
 if __name__ == '__main__':
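Two details in the new download loop are worth flagging: f.flush lacks parentheses, so it is a bare attribute access that never flushes, and os.path.isfile(link) tests the current working directory even though files are written under OUTPUT_DIR, so re-runs can re-download existing files. A corrected sketch (illustrative only; download_zip is a hypothetical helper, not part of the commit):

    import os
    import logging
    import requests

    logger = logging.getLogger(__name__)

    def download_zip(source_files, link, output_dir):
        # Hypothetical helper; checks the real destination path before downloading.
        filename = os.path.join(output_dir, link)
        if os.path.isfile(filename):
            return

        download_link = source_files + link
        logger.info('Downloading %s', download_link)

        response = requests.get(download_link, stream=True)
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()  # the parentheses matter: a bare f.flush is a no-op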
Empty file added src/util/__init__.py
14 changes: 14 additions & 0 deletions src/util/log.py
@@ -0,0 +1,14 @@
+import logging
+import sys
+from datetime import datetime
+
+
+def configure_log():
+    """Sets up the log system to have a consistent format and output"""
+    format = "%(asctime)s|%(name)13s:%(levelname)-7s| %(message)s"
+
+    cur_date = datetime.now()
+    today_string = cur_date.strftime('%Y-%m-%d')
+
+    # Configure our logger so it has a useful format
+    logging.basicConfig(level=logging.DEBUG, format=format)
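
As used in src/grab_all_files.py above, the intended pattern is to call configure_log() once at startup, then fetch named loggers per module. Note that sys and today_string are unused as committed, presumably groundwork for a dated log-file handler. A minimal usage sketch:

    import logging
    from util.log import configure_log

    configure_log()
    logger = logging.getLogger(__name__)
    logger.info('CityScrape logging online')
    # prints e.g.: 2015-08-06 12:00:00,000|     __main__:INFO   | CityScrape logging online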
