split ddl generation out from database calls
dylanraithel committed Aug 10, 2015
1 parent fabc5ad commit 7d38756
Showing 6 changed files with 99 additions and 23 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -68,4 +68,6 @@ cityscrape/*

# Tempfiles
*.zip
*.tmp
*.tmp

workdir/*
2 changes: 1 addition & 1 deletion README.md
@@ -6,7 +6,7 @@ Scrape open data sources provided by the city of Saint Louis. Load data into a P

- clone the project
- `vagrant up`
- Magic!
- `./cityscrape-setup.sh`

# Deployment

11 changes: 5 additions & 6 deletions cityscrape-setup.sh
@@ -6,11 +6,12 @@ CONFIGFILE="config/cityscrape-config.sh"
. $CONFIGFILE

# Create working directories for ingest
if [ -d "$STL_CITY_DOWNLOAD_DIR" ]
if [ -d "$DDL_FILES" ]
then
echo "$STL_CITY_DOWNLOAD_DIR already exists"
echo "$DDL_FILES already exists"
else
mkdir $STL_CITY_DOWNLOAD_DIR
# -p tolerates a partially created tree (the guard above only checks $DDL_FILES)
mkdir -p $WORKDIR
mkdir -p $DDL_FILES
fi

# Need to install virtualenv first
@@ -31,6 +32,4 @@ fi
pip install beautifulsoup4==4.4.0
pip install requests==2.7.0

# echo "Cityscrape setup complete!"

sudo -u postgres psql
# Add the create database stuff here
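
A minimal sketch of that TODO, assuming the target is the "city" database named in the commented-out ogr2ogr call in run-ddl-generation.sh:

# Hypothetical: create the target database non-interactively instead of
# dropping into an interactive psql session as above.
sudo -u postgres psql -c "CREATE DATABASE city;"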
6 changes: 4 additions & 2 deletions config/cityscrape-config.sh
@@ -8,7 +8,7 @@ export DATABASE_USER='postgres'
export DATABASE_PASSWORD=[redacted]

# Set up paths
export BASEDIR=`dirname "$0"`
export BASEDIR=`dirname "$_"`
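# Note: when this file is sourced with ".", "$_" expands to the path of this
# file, while "$0" names the calling script; presumably the reason for the change.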
export CITYSCRAPE_VIRTUALENV_DIR=$BASEDIR/.py-env

# URL housing zip files of open city data
@@ -17,5 +17,7 @@ export GITHUB_URL='https://github.com/hopcity/cityscrape/'
export SOURCEFILE_URL='http://dynamic.stlouis-mo.gov/citydata/downloads/'

# temporary file download and extraction before loading into database
export OUTPUT_DIR=stl_city_files
export WORKDIR=$BASEDIR/workdir
export DDL_FILES=$WORKDIR/ddl
export DDL_FILE_SUFFIX=".sql"
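# Sketch of the resulting layout, relative to the repo root:
#   workdir/        downloaded *.zip archives and their extracted contents
#   workdir/ddl/    generated DDL files, named with DDL_FILE_SUFFIX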

63 changes: 63 additions & 0 deletions run-ddl-generation.sh
@@ -0,0 +1,63 @@
#!/bin/bash

CONFIGFILE="config/cityscrape-config.sh"

. $CONFIGFILE

echo "Running Cityscrape PostgreSQL Ingest"

pushd $WORKDIR
echo "Unzipping files..."

zip_files=$(ls *.zip 2>/dev/null)
if [ -z "$zip_files" ]; then
echo "No *.zip files found, skipping unzip..."
else
for zip_file in $zip_files;
do
unzip -o $zip_file
done
echo "Unzip complete"
fi

shp_files=$(ls *.shp 2>/dev/null)
if [ -z "$shp_files" ]; then
echo "No *.shp files found, skipping ogr2ogr..."
else
for shp_file in $shp_files;
do
echo "$shp_file"
# NOTE: the database load below is disabled; this commit only splits DDL
# generation out from the database calls (per the commit message).
# ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $shp_file
done
fi

pushd $DDL_FILES
echo "Building ddl sql files now..."

mdb_files=$(ls *.mdb 2>/dev/null)
if [ -z "$mdb_files" ]; then
echo "No *.mdb files found, exiting..."
else
for mdb_file in $mdb_files
do
echo "Extracting tables from $mdb_file"
ddl_file=$mdb_file$DDL_FILE_SUFFIX

# Dump the Access schema and coerce types for Postgres: Char -> Varchar,
# and mdb-tools' "Postgres_Unknown 0x0c" placeholder type -> text.
mdb-schema $mdb_file | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' > "$ddl_file"

# Extract table names (third whitespace-separated field) from matching CREATE TABLE lines.
tables=$(mdb-schema $mdb_file postgres | grep "CREATE TABLE IF NOT EXISTS" | awk '{ print $3 }' | sed -e 's/"//g')

if [ -z "$tables" ]
then
echo "No tables found, skipping table ddl generation."
else
for table in $tables
do
# Stub: each per-table file currently holds only the table name.
echo $table > "$table$DDL_FILE_SUFFIX"
done
fi
done
fi

popd
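
Taken together, a plausible end-to-end run looks like this (a sketch; the exact invocation of the Python downloader is an assumption):

./cityscrape-setup.sh         # create workdir/ and workdir/ddl/, install dependencies
python src/grab_all_files.py  # download the city's *.zip archives into workdir/
./run-ddl-generation.sh       # unzip the archives and generate DDL under workdir/ddl/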

36 changes: 23 additions & 13 deletions src/grab_all_files.py
@@ -18,7 +18,7 @@
from util.log import configure_log

# Globals
OUTPUT_DIR = os.environ['OUTPUT_DIR']
WORKDIR = os.environ['WORKDIR']
SOURCEFILE_URL = os.environ['SOURCEFILE_URL']


@@ -52,24 +52,34 @@ def get_files(soup):

for endpoint in soup.find_all('a', href=re.compile("\.zip")):
link = endpoint['href']
logger.info("Link: {}".format(link))

if os.path.isfile(link) == False:
logger.info('Downloading ' + link)
filename = '/'.join([WORKDIR, link])

filename = OUTPUT_DIR + link
download_link = source_files + link
logger.info('url: {}'.format(download_link))

download_link = source_files + link
logger.info('Http endpoint: {}'.format(download_link))
if os.path.isfile(filename):
logger.info('File: {} already exists'.format(link))

request = requests.get(download_link, stream=True)
else:
logger.info('Downloading ' + link)
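# A failed connection is logged and aborts the remaining downloads (note the break).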
try:
request = requests.get(download_link, stream=True)
except requests.exceptions.ConnectionError as err:
logger.warn("Manually Raised Error {}: {}".format(err.errno, err.strerror))
break

with open(filename, 'wb') as f:
for chunk in request.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush

logger.info('All zip files downloaded from ' + source_files)
logger.info("Writing out to file: {}".format(filename))
try:
for chunk in request.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush()
except requests.exceptions.ConnectionError as err:
logger.warn("Error: {} | {}".format(err.errno, err.strerror))
break


def main():
