Split the cityscrape run script into separate download, PostgreSQL ingest
and DDL generation scripts, move downloads from OUTPUT_DIR to WORKDIR, and
drop the wget dependency.

Review note: line breaks and indentation in this patch were rebuilt from a
whitespace-collapsed copy. If context does not match the tree, apply with
"git apply --ignore-whitespace". The post-image hashes on the index lines
predate the review fixes and are informational only.

diff --git a/.gitignore b/.gitignore
index 0b1a0f1..af58d46 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,4 +68,6 @@ cityscrape/*
 
 # Tempfiles
 *.zip
-*.tmp
\ No newline at end of file
+*.tmp
+
+workdir/*
diff --git a/README.md b/README.md
index 859c670..996a032 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ Scrape open data sources provided by the city of Saint Louis. Load data into a P
 
  - clone the project
  - `vagrant up`
- - Magic!
+ - `./cityscrape-setup.sh`
 
 # Deployment
 
diff --git a/cityscrape-setup.sh b/cityscrape-setup.sh
index 57930ea..4f97c7b 100755
--- a/cityscrape-setup.sh
+++ b/cityscrape-setup.sh
@@ -6,11 +6,12 @@ CONFIGFILE="config/cityscrape-config.sh"
 . $CONFIGFILE
 
 # Create working directories for ingest
-if [ -d "$STL_CITY_DOWNLOAD_DIR" ]
+if [ -d "$DDL_FILES" ]
 then
-    echo "$STL_CITY_DOWNLOAD_DIR already exists"
+    echo "$DDL_FILES already exists"
 else
-    mkdir $STL_CITY_DOWNLOAD_DIR
+    mkdir -p "$WORKDIR"
+    mkdir -p "$DDL_FILES"
 fi
 
 # Need to install virtualenv first
@@ -30,6 +31,5 @@ fi
 # Install Python libraries
 pip install beautifulsoup4==4.4.0
 pip install requests==2.7.0
-pip install wget==2.2
 
-echo "Cityscrape setup complete!"
\ No newline at end of file
+# Add the create database stuff here
\ No newline at end of file
diff --git a/config/cityscrape-config.sh b/config/cityscrape-config.sh
index 7d9774e..43264dc 100644
--- a/config/cityscrape-config.sh
+++ b/config/cityscrape-config.sh
@@ -8,7 +8,7 @@ export DATABASE_USER='postgres'
 export DATABASE_PASSWORD=[redacted]
 
 # Set up paths
-export BASEDIR=`dirname "$0"`
+export BASEDIR=`dirname "$_"`
 export CITYSCRAPE_VIRTUALENV_DIR=$BASEDIR/.py-env
 
 # URL housing zip files of open city data
@@ -17,5 +17,7 @@ export GITHUB_URL='https://github.com/hopcity/cityscrape/'
 export SOURCEFILE_URL='http://dynamic.stlouis-mo.gov/citydata/downloads/'
 
 # temporary file download and extraction before loading into database
-export OUTPUT_DIR=$BASEDIR/stl_city_files/
+export WORKDIR=$BASEDIR/workdir
+export DDL_FILES=$WORKDIR/ddl
+export DDL_FILE_SUFFIX=".sql"
 
diff --git a/run-cityscrape-get.sh b/run-cityscrape-get.sh
new file mode 100755
index 0000000..4d03a1d
--- /dev/null
+++ b/run-cityscrape-get.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -e
+
+CONFIGFILE="config/cityscrape-config.sh"
+
+# Bootstrap the config into our bash env
+. $CONFIGFILE
+
+# Activate virtualenv
+. $CITYSCRAPE_VIRTUALENV_DIR/bin/activate
+
+echo "Running Cityscrape Download"
+
+python $BASEDIR/src/grab_all_files.py
\ No newline at end of file
diff --git a/run-cityscrape-postgresql-ingest.sh b/run-cityscrape-postgresql-ingest.sh
new file mode 100755
index 0000000..b775787
--- /dev/null
+++ b/run-cityscrape-postgresql-ingest.sh
@@ -0,0 +1,51 @@
+#!/bin/bash -e
+
+CONFIGFILE="config/cityscrape-config.sh"
+
+. $CONFIGFILE
+
+echo "Running Cityscrape PostgreSQL Ingest"
+pushd "$WORKDIR"
+
+echo "Unzipping files..."
+unzip -o "*.zip"
+
+if [ -z "$(ls *.shp 2>/dev/null)" ]
+    then
+        echo "No *.shp files found, skipping ogr2ogr..."
+    else
+        for f in *.shp
+            do
+                echo "Loading $f"
+
+                ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" "$f"
+
+            done
+fi
+
+for f in *.mdb
+
+    do
+        # An unmatched glob stays literal; skip it so "bash -e" does not abort.
+        [ -e "$f" ] || continue
+
+        echo "Extracting tables from $f"
+
+        mdb-schema "$f" postgres | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' | psql -h localhost -U postgres -d city
+
+        tables=$(echo -en $(mdb-schema "$f" postgres | grep "CREATE TABLE" | awk '{ print $3 }' | sed -e 's/"//g');)
+
+        for i in $tables
+
+            do
+                echo "[File: $f ] [Table - $i]"
+
+                mdb-export -D '%Y-%m-%d %H:%M:%S' -I postgres -q "'" -R ';' "$f" "$i" | psql -d city -U postgres -w -h localhost
+
+            done
+
+done
+
+# return to project root $BASEDIR
+popd
+
diff --git a/run-cityscrape.sh b/run-cityscrape.sh
deleted file mode 100755
index c41e3f2..0000000
--- a/run-cityscrape.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash -e
-
-if [[ -z "$1" ]] ; then
-    echo "Cityscape requires at least 1 argument - the database to be used for the ingest (currently \"postgresql\")"
-    exit 3
-fi
-
-CONFIGFILE="config/cityscrape-config.sh"
-
-# Bootstrap the config into our bash env
-. $CONFIGFILE
-
-# Activate virtualenv
-. $CITYSCRAPE_VIRTUALENV_DIR/bin/activate
-
-echo "Running Cityscrape"
-
-python $BASEDIR/src/grab_all_files.py
-
-# Push path to stack (as opposed to cd) and unzip files
-pushd $STL_CITY_DOWNLOAD_DIR
-
-unzip -f "*.zip"
-
-for f in *.shp
-do
-    echo "Loading "$f
-    ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $f#
-done
-
-
-for f in *.mdb
-do
-    echo "Extracting tables from $f"
-
-    mdb-schema $f postgres | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' | psql -h localhost -U postgres -d city
-
-    tables=$(echo -en $(mdb-schema $f postgres | grep "CREATE TABLE" | awk '{ print $3 }' | sed -e 's/"//g');)
-    for i in $tables
-    do
-        echo "[File: "$f" ] [Table - "$i"]"
-        mdb-export -D ‘%%Y-%%m-%%d %%H:%%M:%%S’ -I postgress -q \’ -R \; $f $i | psql -d city -U postgres -w -h localhost
-
-    done
-
-done
-
-# return to project root $BASEDIR
-popd
-
diff --git a/run-ddl-generation.sh b/run-ddl-generation.sh
new file mode 100755
index 0000000..1aa86f8
--- /dev/null
+++ b/run-ddl-generation.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+CONFIGFILE="config/cityscrape-config.sh"
+
+. $CONFIGFILE
+
+echo "Running Cityscrape DDL Generation"
+
+# Resolve the ddl output directory to an absolute path before changing
+# directories, because the configured path may be relative.
+mkdir -p "$DDL_FILES"
+ddl_dir=$(cd "$DDL_FILES" && pwd)
+
+pushd "$WORKDIR"
+echo "Unzipping files..."
+
+zip_files=$(echo `ls *.zip 2>/dev/null`)
+if [ -z "$zip_files" ]; then
+    echo "No *.zip files found, skipping unzip..."
+else
+    for zip_file in $zip_files;
+    do
+        unzip -o $zip_file
+    done
+    echo "Unzip complete"
+fi
+
+shp_files=$(echo `ls *.shp 2>/dev/null`)
+if [ -z "$shp_files" ]; then
+    echo "No *.shp files found, skipping ogr2ogr..."
+else
+    for shp_file in $shp_files;
+    do
+        echo `ls $shp_file`
+        # ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $shp_file
+    done
+fi
+
+echo "Building ddl sql files now..."
+
+mdb_files=$(echo `ls *.mdb 2>/dev/null`)
+if [ -z "$mdb_files" ]; then
+    echo "No *.mdb files found, exiting..."
+else
+    for mdb_file in $mdb_files
+    do
+        echo "Extracting tables from $mdb_file"
+        ddl_file="$ddl_dir/$mdb_file$DDL_FILE_SUFFIX"
+
+        mdb-schema "$mdb_file" postgres | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' > "$ddl_file"
+
+        # Take the last field so that both "CREATE TABLE x" and
+        # "CREATE TABLE IF NOT EXISTS x" lines yield the table name.
+        tables=$(echo -en $(mdb-schema "$mdb_file" postgres | grep "CREATE TABLE" | awk '{ print $NF }' | sed -e 's/"//g');)
+
+        if [ -z "$tables" ]
+        then
+            echo "No tables found, skipping table ddl generation."
+        else
+            for table in $tables
+            do
+                mdb-schema -T "$table" "$mdb_file" postgres | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' > "$ddl_dir/$table$DDL_FILE_SUFFIX"
+            done
+        fi
+    done
+fi
+
+popd
+
diff --git a/src/grab_all_files.py b/src/grab_all_files.py
index 0bc489f..a612e8c 100755
--- a/src/grab_all_files.py
+++ b/src/grab_all_files.py
@@ -6,7 +6,6 @@
 
 # Base Imports
 import re
-import wget
 import os
 import sys
 import logging
@@ -19,7 +18,7 @@ from util.log import configure_log
 
 
 # Globals
-OUTPUT_DIR = os.environ['OUTPUT_DIR']
+WORKDIR = os.environ['WORKDIR']
 SOURCEFILE_URL = os.environ['SOURCEFILE_URL']
 
 
@@ -53,24 +52,38 @@ def get_files(soup):
 
     for endpoint in soup.find_all('a', href=re.compile("\.zip")):
         link = endpoint['href']
+        logger.info("Link: {}".format(link))
 
-        if os.path.isfile(link) == False:
-            logger.info('Downloading ' + link)
+        filename = os.path.join(WORKDIR, link)
 
-            filename = OUTPUT_DIR + link
+        download_link = source_files + link
+        logger.info('url: {}'.format(download_link))
 
-            download_link = source_files + link
-            logger.info('Http endpoint: {}'.format(download_link))
+        if os.path.isfile(filename):
+            logger.info('File: {} already exists'.format(link))
 
-            request = requests.get(download_link, stream=True)
+        else:
+            logger.info('Downloading ' + link)
+            try:
+                request = requests.get(download_link, stream=True)
+                request.raise_for_status()
+            except requests.exceptions.RequestException as err:
+                logger.warning("Request for {} failed: {}".format(download_link, err))
+                break
 
             with open(filename, 'wb') as f:
-                for chunk in request.iter_content(chunk_size=1024):
-                    if chunk:
-                        f.write(chunk)
-                        f.flush
-
-    logger.info('All zip files downloaded from ' + source_files)
+                logger.info("Writing out to file: {}".format(filename))
+                try:
+                    for chunk in request.iter_content(chunk_size=1024):
+                        if chunk:
+                            f.write(chunk)
+                except requests.exceptions.RequestException as err:
+                    logger.warning("Download of {} interrupted: {}".format(download_link, err))
+                    # Remove the partial file so the next run retries it
+                    # instead of skipping it as already downloaded.
+                    f.close()
+                    os.remove(filename)
+                    break
 
 
 def main():
@@ -87,7 +100,7 @@ def main():
     logger.info('Fetching files now!')
     get_files(soup)
 
-    logger.info('CityScrape complete!')
+    logger.info('CityScrape download complete!')
 
 
 if __name__ == '__main__':