From b375b28deda412c8d7f11b78c0dab09b897c5f57 Mon Sep 17 00:00:00 2001 From: Dylan Raithel Date: Wed, 5 Aug 2015 22:57:57 -0700 Subject: [PATCH 1/4] split download and ingest logic into two scripts --- cityscrape-setup.sh | 1 - run-cityscrape-get.sh | 13 ++++++++ run-cityscrape-postgresql-ingest.sh | 42 ++++++++++++++++++++++++ run-cityscrape.sh | 50 ----------------------------- src/grab_all_files.py | 3 +- 5 files changed, 56 insertions(+), 53 deletions(-) create mode 100755 run-cityscrape-get.sh create mode 100755 run-cityscrape-postgresql-ingest.sh delete mode 100755 run-cityscrape.sh diff --git a/cityscrape-setup.sh b/cityscrape-setup.sh index 57930ea..128e26f 100755 --- a/cityscrape-setup.sh +++ b/cityscrape-setup.sh @@ -30,6 +30,5 @@ fi # Install Python libraries pip install beautifulsoup4==4.4.0 pip install requests==2.7.0 -pip install wget==2.2 echo "Cityscrape setup complete!" \ No newline at end of file diff --git a/run-cityscrape-get.sh b/run-cityscrape-get.sh new file mode 100755 index 0000000..4d03a1d --- /dev/null +++ b/run-cityscrape-get.sh @@ -0,0 +1,13 @@ +#!/bin/bash -e + +CONFIGFILE="config/cityscrape-config.sh" + +# Bootstrap the config into our bash env +. $CONFIGFILE + +# Activate virtualenv +. $CITYSCRAPE_VIRTUALENV_DIR/bin/activate + +echo "Running Cityscrape Download" + +python $BASEDIR/src/grab_all_files.py \ No newline at end of file diff --git a/run-cityscrape-postgresql-ingest.sh b/run-cityscrape-postgresql-ingest.sh new file mode 100755 index 0000000..cb60654 --- /dev/null +++ b/run-cityscrape-postgresql-ingest.sh @@ -0,0 +1,42 @@ +#!/bin/bash -e + +CONFIGFILE="config/cityscrape-config.sh" + +. $CONFIGFILE + +echo "Running Cityscrape PostgreSQL Ingest" +pushd $OUTPUT_DIR + +echo "Unzipping files..." 
+unzip -f "*.zip" + +for f in *.shp + do + echo "Loading "$f + + ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $f# + +done + + +for f in *.mdb +do + echo "Extracting tables from $f" + + mdb-schema $f postgres | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' | psql -h localhost -U postgres -d city + + tables=$(echo -en $(mdb-schema $f postgres | grep "CREATE TABLE" | awk '{ print $3 }' | sed -e 's/"//g');) + + for i in $tables + do + echo "[File: "$f" ] [Table - "$i"]" + + mdb-export -D ‘%%Y-%%m-%%d %%H:%%M:%%S’ -I postgress -q \’ -R \; $f $i | psql -d city -U postgres -w -h localhost + + done + +done + +# return to project root $BASEDIR +popd + diff --git a/run-cityscrape.sh b/run-cityscrape.sh deleted file mode 100755 index c41e3f2..0000000 --- a/run-cityscrape.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -e - -if [[ -z "$1" ]] ; then - echo "Cityscape requires at least 1 argument - the database to be used for the ingest (currently \"postgresql\")" - exit 3 -fi - -CONFIGFILE="config/cityscrape-config.sh" - -# Bootstrap the config into our bash env -. $CONFIGFILE - -# Activate virtualenv -. 
$CITYSCRAPE_VIRTUALENV_DIR/bin/activate - -echo "Running Cityscrape" - -python $BASEDIR/src/grab_all_files.py - -# Push path to stack (as opposed to cd) and unzip files -pushd $STL_CITY_DOWNLOAD_DIR - -unzip -f "*.zip" - -for f in *.shp -do - echo "Loading "$f - ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $f# -done - - -for f in *.mdb -do - echo "Extracting tables from $f" - - mdb-schema $f postgres | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' | psql -h localhost -U postgres -d city - - tables=$(echo -en $(mdb-schema $f postgres | grep "CREATE TABLE" | awk '{ print $3 }' | sed -e 's/"//g');) - for i in $tables - do - echo "[File: "$f" ] [Table - "$i"]" - mdb-export -D ‘%%Y-%%m-%%d %%H:%%M:%%S’ -I postgress -q \’ -R \; $f $i | psql -d city -U postgres -w -h localhost - - done - -done - -# return to project root $BASEDIR -popd - diff --git a/src/grab_all_files.py b/src/grab_all_files.py index 0bc489f..19eef1b 100755 --- a/src/grab_all_files.py +++ b/src/grab_all_files.py @@ -6,7 +6,6 @@ # Base Imports import re -import wget import os import sys import logging @@ -87,7 +86,7 @@ def main(): logger.info('Fetching files now!') get_files(soup) - logger.info('CityScrape complete!') + logger.info('CityScrape download complete!') if __name__ == '__main__': From 19bdf0b1329f1a95d5e521668ff7547d1cebd57f Mon Sep 17 00:00:00 2001 From: Dylan Raithel Date: Sat, 8 Aug 2015 12:10:54 -0700 Subject: [PATCH 2/4] modified config file --- config/cityscrape-config.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/cityscrape-config.sh b/config/cityscrape-config.sh index 7d9774e..4426fcc 100644 --- a/config/cityscrape-config.sh +++ b/config/cityscrape-config.sh @@ -17,5 +17,5 @@ export GITHUB_URL='https://github.com/hopcity/cityscrape/' export SOURCEFILE_URL='http://dynamic.stlouis-mo.gov/citydata/downloads/' # temporary file download and extraction before loading into database -export 
OUTPUT_DIR=$BASEDIR/stl_city_files/ +export OUTPUT_DIR=stl_city_files From fabc5ad9cc6083440a667cd99f726a026f299a6a Mon Sep 17 00:00:00 2001 From: Dylan Raithel Date: Sat, 8 Aug 2015 12:13:27 -0700 Subject: [PATCH 3/4] handle no .shp file scenario --- cityscrape-setup.sh | 4 +++- run-cityscrape-postgresql-ingest.sh | 29 ++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/cityscrape-setup.sh b/cityscrape-setup.sh index 128e26f..9a0b949 100755 --- a/cityscrape-setup.sh +++ b/cityscrape-setup.sh @@ -31,4 +31,6 @@ fi pip install beautifulsoup4==4.4.0 pip install requests==2.7.0 -echo "Cityscrape setup complete!" \ No newline at end of file +# echo "Cityscrape setup complete!" + +sudo -u postgres psql \ No newline at end of file diff --git a/run-cityscrape-postgresql-ingest.sh b/run-cityscrape-postgresql-ingest.sh index cb60654..b775787 100755 --- a/run-cityscrape-postgresql-ingest.sh +++ b/run-cityscrape-postgresql-ingest.sh @@ -10,24 +10,31 @@ pushd $OUTPUT_DIR echo "Unzipping files..." unzip -f "*.zip" -for f in *.shp - do - echo "Loading "$f - - ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $f# +if [ -z "$(ls *.shp)" ] + then + echo "No *.shp files found, exiting..." 
+ break + else + for f in *.shp + do + echo "Loading "$f - ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $f# +done + ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $f# + done +fi for f in *.mdb -do - echo "Extracting tables from $f" - mdb-schema $f postgres | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' | psql -h localhost -U postgres -d city + do + echo "Extracting tables from $f" + + mdb-schema $f postgres | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' | psql -h localhost -U postgres -d city - tables=$(echo -en $(mdb-schema $f postgres | grep "CREATE TABLE" | awk '{ print $3 }' | sed -e 's/"//g');) + tables=$(echo -en $(mdb-schema $f postgres | grep "CREATE TABLE" | awk '{ print $3 }' | sed -e 's/"//g');) for i in $tables + do echo "[File: "$f" ] [Table - "$i"]" @@ -37,6 +44,6 @@ do done -# return to project root $BASEDIR +# # return to project root $BASEDIR popd From 7d38756ba9637eba8150764a2a1f57363e9f23c4 Mon Sep 17 00:00:00 2001 From: Dylan Raithel Date: Sun, 9 Aug 2015 20:57:08 -0700 Subject: [PATCH 4/4] split ddl generation out from database calls --- .gitignore | 4 ++- README.md | 2 +- cityscrape-setup.sh | 11 +++---- config/cityscrape-config.sh | 6 ++-- run-ddl-generation.sh | 63 +++++++++++++++++++++++++++++++++++++ src/grab_all_files.py | 36 +++++++++++++-------- 6 files changed, 99 insertions(+), 23 deletions(-) create mode 100755 run-ddl-generation.sh diff --git a/.gitignore b/.gitignore index 0b1a0f1..af58d46 100644 --- a/.gitignore +++ b/.gitignore @@ -68,4 +68,6 @@ cityscrape/* # Tempfiles *.zip -*.tmp \ No newline at end of file +*.tmp + +workdir/* diff --git a/README.md b/README.md index 859c670..996a032 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Scrape open data sources provided by the city of Saint Louis. Load data into a P - clone the project - `vagrant up` - - Magic!
+ - `./cityscrape-setup.sh` # Deployment diff --git a/cityscrape-setup.sh b/cityscrape-setup.sh index 9a0b949..4f97c7b 100755 --- a/cityscrape-setup.sh +++ b/cityscrape-setup.sh @@ -6,11 +6,12 @@ CONFIGFILE="config/cityscrape-config.sh" . $CONFIGFILE # Create working directories for ingest -if [ -d "$STL_CITY_DOWNLOAD_DIR" ] +if [ -d "$DDL_FILES" ] then - echo "$STL_CITY_DOWNLOAD_DIR already exists" + echo "$DDL_FILES already exists" else - mkdir $STL_CITY_DOWNLOAD_DIR + mkdir $WORKDIR + mkdir $DDL_FILES fi # Need to install virtualenv first @@ -31,6 +32,4 @@ fi pip install beautifulsoup4==4.4.0 pip install requests==2.7.0 -# echo "Cityscrape setup complete!" - -sudo -u postgres psql \ No newline at end of file +# Add the create database stuff here \ No newline at end of file diff --git a/config/cityscrape-config.sh b/config/cityscrape-config.sh index 4426fcc..43264dc 100644 --- a/config/cityscrape-config.sh +++ b/config/cityscrape-config.sh @@ -8,7 +8,7 @@ export DATABASE_USER='postgres' export DATABASE_PASSWORD=[redacted] # Set up paths -export BASEDIR=`dirname "$0"` +export BASEDIR=`dirname "$_"` export CITYSCRAPE_VIRTUALENV_DIR=$BASEDIR/.py-env # URL housing zip files of open city data @@ -17,5 +17,7 @@ export GITHUB_URL='https://github.com/hopcity/cityscrape/' export SOURCEFILE_URL='http://dynamic.stlouis-mo.gov/citydata/downloads/' # temporary file download and extraction before loading into database -export OUTPUT_DIR=stl_city_files +export WORKDIR=$BASEDIR/workdir +export DDL_FILES=$WORKDIR/ddl +export $DDL_FILE_SUFFIX=".sql" diff --git a/run-ddl-generation.sh b/run-ddl-generation.sh new file mode 100755 index 0000000..1aa86f8 --- /dev/null +++ b/run-ddl-generation.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +CONFIGFILE="config/cityscrape-config.sh" + +. $CONFIGFILE + +echo "Running Cityscrape PostgreSQL Ingest" + +pushd $WORKDIR +echo "Unzipping files..." 
+ +zip_files=$(echo `ls *.zip 2>/dev/null`) +if [ -z "$zip_files" ]; then + echo "No *.zip files found, skipping unzip..." +else + for zip_file in $zip_files; + do + unzip -o $zip_file + done + echo "Unzip complete" +fi + +shp_files=$(echo `ls *.shp 2>/dev/null`) +if [ -z "$shp_files" ]; then + echo "No *.shp files found, skipping ogr2ogr..." +else + for shp_file in $shp_files; + do + echo `ls $shp_file` + # ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $shp_file + done +fi + +pushd $DDL_FILES +echo "Building ddl sql files now..." + +mdb_files=$(echo `ls *.mdb 2>/dev/null`) +if [ -z "$mdb_files" ]; then + echo "No *.mdb files found, exiting..." +else + for mdb_file in $mdb_files + do + echo "Extracting tables from $mdb_file" + ddl_file=$mdb_file$DDL_FILE_SUFFIX + + mdb-schema $mdb_file | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' > "$ddl_file" + + tables=$(echo -en $(mdb-schema $mdb_file postgres | grep "CREATE TABLE IF NOT EXISTS" | awk '{ print $3 }' | sed -e 's/"//g');) + + if [ -z "$tables" ] + then + echo "No tables found, skipping table ddl generation." 
+ else + for table in $tables + do + echo $table > "$table$DDL_FILE_SUFFIX" + done + fi + done +fi + +popd + diff --git a/src/grab_all_files.py b/src/grab_all_files.py index 19eef1b..a612e8c 100755 --- a/src/grab_all_files.py +++ b/src/grab_all_files.py @@ -18,7 +18,7 @@ from util.log import configure_log # Globals -OUTPUT_DIR = os.environ['OUTPUT_DIR'] +WORKDIR = os.environ['WORKDIR'] SOURCEFILE_URL = os.environ['SOURCEFILE_URL'] @@ -52,24 +52,34 @@ def get_files(soup): for endpoint in soup.find_all('a', href=re.compile("\.zip")): link = endpoint['href'] + logger.info("Link: {}".format(link)) - if os.path.isfile(link) == False: - logger.info('Downloading ' + link) + filename = '/'.join([WORKDIR, link]) - filename = OUTPUT_DIR + link + download_link = source_files + link + logger.info('url: {}'.format(download_link)) - download_link = source_files + link - logger.info('Http endpoint: {}'.format(download_link)) + if os.path.isfile(filename) == True: + logger.info('File: {} already exists'.format(link)) - request = requests.get(download_link, stream=True) + else: + logger.info('Downloading ' + link) + try: + request = requests.get(download_link, stream=True) + except requests.exceptions.ConnectionError as err: + logger.warn("Manually Raised Error {}: {}".format(err.errno, err.strerror)) + break with open(filename, 'wb') as f: - for chunk in request.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - f.flush - - logger.info('All zip files downloaded from ' + source_files) + logger.info("Writing out to file: {}".format(filename)) + try: + for chunk in request.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + f.flush + except requests.exceptions.ConnectionError as err: + logger.warn("Error: {} | {}".format(err.errno, err.strerror)) + break def main():