Skip to content

Commit

Permalink
Merge pull request #6 from hopcity/split_download_and_ingest
Browse files Browse the repository at this point in the history
split download and ingest logic into two scripts
  • Loading branch information
hopcity committed Aug 16, 2015
2 parents b646aa0 + 7d38756 commit 0513905
Show file tree
Hide file tree
Showing 9 changed files with 162 additions and 74 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,6 @@ cityscrape/*

# Tempfiles
*.zip
*.tmp
*.tmp

workdir/*
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Scrape open data sources provided by the city of Saint Louis. Load data into a P

- clone the project
- `vagrant up`
- Magic!
- `./cityscrape-setup.sh`

# Deployment

Expand Down
10 changes: 5 additions & 5 deletions cityscrape-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ CONFIGFILE="config/cityscrape-config.sh"
. $CONFIGFILE

# Create working directories for ingest
if [ -d "$STL_CITY_DOWNLOAD_DIR" ]
if [ -d "$DDL_FILES" ]
then
echo "$STL_CITY_DOWNLOAD_DIR already exists"
echo "$DDL_FILES already exists"
else
mkdir $STL_CITY_DOWNLOAD_DIR
mkdir $WORKDIR
mkdir $DDL_FILES
fi

# Need to install virtualenv first
Expand All @@ -30,6 +31,5 @@ fi
# Install Python libraries
pip install beautifulsoup4==4.4.0
pip install requests==2.7.0
pip install wget==2.2

echo "Cityscrape setup complete!"
# Add the create database stuff here
6 changes: 4 additions & 2 deletions config/cityscrape-config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ export DATABASE_USER='postgres'
export DATABASE_PASSWORD=[redacted]

# Set up paths
export BASEDIR=`dirname "$0"`
export BASEDIR=`dirname "$_"`
export CITYSCRAPE_VIRTUALENV_DIR=$BASEDIR/.py-env

# URL housing zip files of open city data
Expand All @@ -17,5 +17,7 @@ export GITHUB_URL='https://github.com/hopcity/cityscrape/'
export SOURCEFILE_URL='http://dynamic.stlouis-mo.gov/citydata/downloads/'

# temporary file download and extraction before loading into database
export OUTPUT_DIR=$BASEDIR/stl_city_files/
export WORKDIR=$BASEDIR/workdir
export DDL_FILES=$WORKDIR/ddl
# Suffix for generated DDL files. Note: assignments use the bare variable
# name — the previous `export $DDL_FILE_SUFFIX=".sql"` expanded the (unset)
# variable and the export silently did nothing.
export DDL_FILE_SUFFIX=".sql"

13 changes: 13 additions & 0 deletions run-cityscrape-get.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Download step of Cityscrape: activate the project virtualenv and run the
# Python scraper that fetches the city's zip files.
#
# Reads config/cityscrape-config.sh for BASEDIR and CITYSCRAPE_VIRTUALENV_DIR.

# Set options here instead of the shebang: `#!/bin/bash -e` is ignored when
# the script is invoked as `bash run-cityscrape-get.sh`.
# (-u is deliberately omitted: virtualenv's activate script references
# variables like PS1 that may be unset in a non-interactive shell.)
set -eo pipefail

CONFIGFILE="config/cityscrape-config.sh"

# Bootstrap the config into our bash env
. "$CONFIGFILE"

# Activate virtualenv
. "$CITYSCRAPE_VIRTUALENV_DIR/bin/activate"

echo "Running Cityscrape Download"

python "$BASEDIR/src/grab_all_files.py"
49 changes: 49 additions & 0 deletions run-cityscrape-postgresql-ingest.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/bash
# Ingest downloaded city data into the "city" PostgreSQL database:
# unzip the archives in $OUTPUT_DIR, load shapefiles with ogr2ogr, then
# convert and load any Access (.mdb) databases via mdb-tools.
#
# Expects config/cityscrape-config.sh to define OUTPUT_DIR.

# Set options here instead of the shebang: `#!/bin/bash -e` is ignored when
# the script is invoked as `bash <script>`.
set -eo pipefail

CONFIGFILE="config/cityscrape-config.sh"

. "$CONFIGFILE"

echo "Running Cityscrape PostgreSQL Ingest"
pushd "$OUTPUT_DIR"

# Unmatched globs expand to nothing instead of a literal "*.ext" pattern,
# so emptiness can be tested without parsing `ls`.
shopt -s nullglob

echo "Unzipping files..."
zip_files=(*.zip)
if [ "${#zip_files[@]}" -eq 0 ]
then
    echo "No *.zip files found, skipping unzip..."
else
    for archive in "${zip_files[@]}"
    do
        # -o (overwrite) instead of the old -f (freshen): -f only re-extracts
        # files that already exist on disk, so a clean directory stayed empty.
        unzip -o "$archive"
    done
fi

shp_files=(*.shp)
if [ "${#shp_files[@]}" -eq 0 ]
then
    # The old code used `break` here, which is only meaningful inside a loop;
    # bash printed a warning and carried on to the .mdb section, so "skipping"
    # (not exiting) is the actual behavior and matches the sibling scripts.
    echo "No *.shp files found, skipping shapefile load..."
else
    for f in "${shp_files[@]}"
    do
        echo "Loading $f"
        # Trailing "#" typo removed: it was appended to the filename argument.
        ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" "$f"
    done
fi

for f in *.mdb
do
    echo "Extracting tables from $f"

    # Translate the Access schema into PostgreSQL-friendly DDL and apply it.
    mdb-schema "$f" postgres | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' | psql -h localhost -U postgres -d city

    # Table names from the generated DDL, with double quotes stripped.
    tables=$(mdb-schema "$f" postgres | grep "CREATE TABLE" | awk '{ print $3 }' | sed -e 's/"//g')

    for i in $tables
    do
        echo "[File: $f ] [Table - $i]"

        # Fixed from the original: plain ASCII quotes around the date format
        # (it had Unicode curly quotes), single-% strftime codes (%%Y passed a
        # literal "%Y" through), and backend "postgres" ("postgress" is not a
        # valid mdb-export -I backend).
        mdb-export -D '%Y-%m-%d %H:%M:%S' -I postgres -q \' -R \; "$f" "$i" | psql -d city -U postgres -w -h localhost
    done
done

# return to project root $BASEDIR
popd
50 changes: 0 additions & 50 deletions run-cityscrape.sh

This file was deleted.

63 changes: 63 additions & 0 deletions run-ddl-generation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash
# Generate PostgreSQL DDL (*.sql) files from the downloaded city data:
# unzip the archives in $WORKDIR, then dump each Access (.mdb) database's
# schema into $DDL_FILES, one file per database plus one per table.
#
# Expects config/cityscrape-config.sh to define WORKDIR, DDL_FILES and
# DDL_FILE_SUFFIX.

CONFIGFILE="config/cityscrape-config.sh"

. "$CONFIGFILE"

# Message fixed: this script generates DDL; "PostgreSQL Ingest" was a
# copy-paste from run-cityscrape-postgresql-ingest.sh.
echo "Running Cityscrape DDL Generation"

pushd "$WORKDIR"
echo "Unzipping files..."

# Unmatched globs expand to nothing instead of a literal "*.ext" pattern,
# so emptiness can be tested without parsing `ls`.
shopt -s nullglob

zip_files=(*.zip)
if [ "${#zip_files[@]}" -eq 0 ]; then
    echo "No *.zip files found, skipping unzip..."
else
    for zip_file in "${zip_files[@]}"; do
        unzip -o "$zip_file"
    done
    echo "Unzip complete"
fi

shp_files=(*.shp)
if [ "${#shp_files[@]}" -eq 0 ]; then
    echo "No *.shp files found, skipping ogr2ogr..."
else
    for shp_file in "${shp_files[@]}"; do
        echo "$shp_file"
        # ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" "$shp_file"
    done
fi

echo "Building ddl sql files now..."

# Stay in $WORKDIR: the .mdb files are unzipped here. The old code did
# `pushd $DDL_FILES` first, so its `ls *.mdb` looked in the (empty) output
# directory and never found anything. Output paths are prefixed with
# $DDL_FILES instead.
mdb_files=(*.mdb)
if [ "${#mdb_files[@]}" -eq 0 ]; then
    echo "No *.mdb files found, exiting..."
else
    for mdb_file in "${mdb_files[@]}"; do
        echo "Extracting tables from $mdb_file"
        ddl_file=$DDL_FILES/$mdb_file$DDL_FILE_SUFFIX

        mdb-schema "$mdb_file" | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' > "$ddl_file"

        # On lines matching "CREATE TABLE IF NOT EXISTS", the table name is
        # field 6 — the old `$3` extracted the literal word "IF".
        tables=$(mdb-schema "$mdb_file" postgres | grep "CREATE TABLE IF NOT EXISTS" | awk '{ print $6 }' | sed -e 's/"//g')

        if [ -z "$tables" ]
        then
            echo "No tables found, skipping table ddl generation."
        else
            for table in $tables; do
                echo "$table" > "$DDL_FILES/$table$DDL_FILE_SUFFIX"
            done
        fi
    done
fi

popd

39 changes: 24 additions & 15 deletions src/grab_all_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

# Base Imports
import re
import wget
import os
import sys
import logging
Expand All @@ -19,7 +18,7 @@
from util.log import configure_log

# Globals
OUTPUT_DIR = os.environ['OUTPUT_DIR']
WORKDIR = os.environ['WORKDIR']
SOURCEFILE_URL = os.environ['SOURCEFILE_URL']


Expand Down Expand Up @@ -53,24 +52,34 @@ def get_files(soup):

for endpoint in soup.find_all('a', href=re.compile("\.zip")):
link = endpoint['href']
logger.info("Link: {}".format(link))

if os.path.isfile(link) == False:
logger.info('Downloading ' + link)
filename = '/'.join([WORKDIR, link])

filename = OUTPUT_DIR + link
download_link = source_files + link
logger.info('url: {}'.format(download_link))

download_link = source_files + link
logger.info('Http endpoint: {}'.format(download_link))
if os.path.isfile(filename) == True:
logger.info('File: {} already exists'.format(link))

request = requests.get(download_link, stream=True)
else:
logger.info('Downloading ' + link)
try:
request = requests.get(download_link, stream=True)
except requests.exceptions.ConnectionError as err:
logger.warn("Manually Raised Error {}: {}".format(err.errno, err.strerror))
break

with open(filename, 'wb') as f:
for chunk in request.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush

logger.info('All zip files downloaded from ' + source_files)
logger.info("Writing out to file: {}".format(filename))
try:
for chunk in request.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush
except requests.exceptions.ConnectionError as err:
logger.warn("Error: {} | {}".format(err.errno, err.strerror))
break


def main():
Expand All @@ -87,7 +96,7 @@ def main():
logger.info('Fetching files now!')
get_files(soup)

logger.info('CityScrape complete!')
logger.info('CityScrape download complete!')


if __name__ == '__main__':
Expand Down

0 comments on commit 0513905

Please sign in to comment.