From f8ab55ffc003963b69b73706dccc2e658cc18d02 Mon Sep 17 00:00:00 2001 From: Dylan Raithel Date: Tue, 1 Sep 2015 21:10:09 -0700 Subject: [PATCH 1/3] ddl generation and Access to Postgresql marshalling with bash utils --- Vagrantfile | 10 ++++-- config/cityscrape-config.sh | 3 +- db-ingest.sh | 37 ++++++++++++++++++++ generate-ddl.sh | 32 +++++++++++++++++ generate-schema-from-ddl.sh | 19 ++++++++++ generate-shapefile-manifest.sh | 27 ++++++++++++++ run-cityscrape-get.sh => get.sh | 0 run-cityscrape-postgresql-ingest.sh | 4 +-- run-cityscrape.sh | 23 ++++++++++++ run-ddl-generation.sh | 32 +---------------- unzip.sh | 13 +++++++ upload-shapefiles-from-shapefile-manifest.sh | 10 ++++++ 12 files changed, 174 insertions(+), 36 deletions(-) create mode 100755 db-ingest.sh create mode 100755 generate-ddl.sh create mode 100755 generate-schema-from-ddl.sh create mode 100755 generate-shapefile-manifest.sh rename run-cityscrape-get.sh => get.sh (100%) mode change 100755 => 100644 run-cityscrape-postgresql-ingest.sh create mode 100755 run-cityscrape.sh mode change 100755 => 100644 run-ddl-generation.sh create mode 100755 unzip.sh create mode 100755 upload-shapefiles-from-shapefile-manifest.sh diff --git a/Vagrantfile b/Vagrantfile index 1a61212..f288414 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -1,12 +1,18 @@ # Vagrantfile API/syntax version. Don't touch unless you know what you're doing! 
+# +# Note: +# edit /etc/apt/sources.list to uncomment the following +# deb http://security.ubuntu.com/ubuntu trusty-security multiverse +# deb-src http://security.ubuntu.com/ubuntu trusty-security multiverse + VAGRANTFILE_API_VERSION = "2" Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| - config.vm.box = "ubuntu/vivid64" + config.vm.box = "ubuntu/trusty64" # Change this to be something relevant to your project - config.vm.hostname = "cityscrape" + config.vm.hostname = "city-scrape" config.vm.provision :shell, inline: "apt-get -y install --fix-missing" config.vm.provision :shell, inline: "apt-get -y install python" diff --git a/config/cityscrape-config.sh b/config/cityscrape-config.sh index 43264dc..56d9774 100644 --- a/config/cityscrape-config.sh +++ b/config/cityscrape-config.sh @@ -19,5 +19,6 @@ export SOURCEFILE_URL='http://dynamic.stlouis-mo.gov/citydata/downloads/' # temporary file download and extraction before loading into database export WORKDIR=$BASEDIR/workdir export DDL_FILES=$WORKDIR/ddl -export $DDL_FILE_SUFFIX=".sql" +export DDL_FILE_SUFFIX="" +export SHAPEFILE_MANIFEST=shp_file.manifest \ No newline at end of file diff --git a/db-ingest.sh b/db-ingest.sh new file mode 100755 index 0000000..704e7b2 --- /dev/null +++ b/db-ingest.sh @@ -0,0 +1,37 @@ +#!/bin/bash -e + +# Something fucky with the postgres configuration +# vagrant@city-scrape:/vagrant$ sudo su +# root@city-scrape:/vagrant# su postgres +# postgres@city-scrape:/vagrant$ + +CONFIGFILE="config/cityscrape-config.sh" + +. 
$CONFIGFILE + +echo "Running Cityscrape PostgreSQL Ingest" +pushd $WORKDIR + +for f in *.mdb + + do + echo "Extracting tables from $f" + + mdb-schema $f postgres | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' | psql -U vagrant city -a -f + + tables=$(echo -en $(mdb-schema $f postgres | grep "CREATE TABLE" | awk '{ print $3 }' | sed -e 's/"//g');) + + for i in $tables + + do + echo "[File: "$f" ] [Table - "$i"]" + + mdb-export -D ‘%%Y-%%m-%%d %%H:%%M:%%S’ -I postgress -q \’ -R \; $f $i | psql -U vagrant city -w + + done + + done + +# # return to project root $BASEDIR +popd + diff --git a/generate-ddl.sh b/generate-ddl.sh new file mode 100755 index 0000000..f3e8bd8 --- /dev/null +++ b/generate-ddl.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +CONFIGFILE="config/cityscrape-config.sh" + +. $CONFIGFILE + +pushd $WORKDIR +mdb_files=$(echo `ls *.mdb 2>/dev/null`) +if [ -z "$mdb_files" ]; then + echo "No *.mdb files found, exiting..." +else + for mdb_file in $mdb_files + do + echo "Extracting tables from $mdb_file" + ddl_file=$mdb_file$DDL_FILE_SUFFIX + + mdb-schema $mdb_file | sed 's/Char/Varchar/g' | sed 's/Postgres_Unknown 0x0c/text/g' > ddl/$ddl_file + + tables=$(echo -en $(mdb-schema $mdb_file postgres | grep "CREATE TABLE IF NOT EXISTS" | awk '{ print $3 }' | sed -e 's/"//g');) + + if [ -z "$tables" ] + then + echo "No tables found, skipping table ddl generation." + else + for table in $tables + do + echo $table > "$table$DDL_FILE_SUFFIX" + done + fi + done +fi +popd \ No newline at end of file diff --git a/generate-schema-from-ddl.sh b/generate-schema-from-ddl.sh new file mode 100755 index 0000000..bf83c60 --- /dev/null +++ b/generate-schema-from-ddl.sh @@ -0,0 +1,19 @@ +CONFIGFILE="config/cityscrape-config.sh" + +. $CONFIGFILE + +pushd $DDL_FILES + +files=$(echo `ls *.mdb 2>/dev/null`) + +if [[ -z "$files" ]]; then + echo "No Schema Definitions Found, Exiting..." 
+ exit 3 +else + for file in $files + do + cat $file | tr -d "[]" > $file.sql + done +fi + +popd \ No newline at end of file diff --git a/generate-shapefile-manifest.sh b/generate-shapefile-manifest.sh new file mode 100755 index 0000000..f7bba98 --- /dev/null +++ b/generate-shapefile-manifest.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +CONFIGFILE="config/cityscrape-config.sh" + +. $CONFIGFILE + +pushd $WORKDIR + +if ! [[ -z $SHAPEFILE_MANIFEST ]]; then + rm $SHAPEFILE_MANIFEST +else + echo "Removeing old shapefile manifest" + rm $SHAPEFILE_MANIFEST +fi + +shp_files=$(echo `ls *.shp 2>/dev/null`) + +if [ -z "$shp_files" ]; then + echo "No *.shp files found, skipping ogr2ogr..." +else + for shp_file in $shp_files; + do + echo ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $shp_file >> $SHAPEFILE_MANIFEST + done +fi + +popd \ No newline at end of file diff --git a/run-cityscrape-get.sh b/get.sh similarity index 100% rename from run-cityscrape-get.sh rename to get.sh diff --git a/run-cityscrape-postgresql-ingest.sh b/run-cityscrape-postgresql-ingest.sh old mode 100755 new mode 100644 index b775787..7d4db2d --- a/run-cityscrape-postgresql-ingest.sh +++ b/run-cityscrape-postgresql-ingest.sh @@ -40,9 +40,9 @@ for f in *.mdb mdb-export -D ‘%%Y-%%m-%%d %%H:%%M:%%S’ -I postgress -q \’ -R \; $f $i | psql -d city -U postgres -w -h localhost - done + done -done + done # # return to project root $BASEDIR popd diff --git a/run-cityscrape.sh b/run-cityscrape.sh new file mode 100755 index 0000000..7103ac8 --- /dev/null +++ b/run-cityscrape.sh @@ -0,0 +1,23 @@ +#!/bin/bash -e + +CONFIGFILE="config/cityscrape-config.sh" + +. 
$CONFIGFILE + +# echo "Step 1: Fetching Cityscrape data" +# ./get.sh + +# echo "Step 2: Unzipping archives" +# ./unzip.sh + +# echo "Step 3: Generating DDL files" +# ./generate-ddl.sh + +# echo "Step 4: Generatign Shapefile load commands" +# ./generate-shapefile-manifest.sh + +echo "Step 5: Generating Schema from ddl definitions" +./generate-schema-from-ddl.sh + +# echo "Step 6: Upload Shapefiles to database" +# ./upload-shapefiles-from-manifest.sh \ No newline at end of file diff --git a/run-ddl-generation.sh b/run-ddl-generation.sh old mode 100755 new mode 100644 index 1aa86f8..b19abee --- a/run-ddl-generation.sh +++ b/run-ddl-generation.sh @@ -4,33 +4,6 @@ CONFIGFILE="config/cityscrape-config.sh" . $CONFIGFILE -echo "Running Cityscrape PostgreSQL Ingest" - -pushd $WORKDIR -echo "Unzipping files..." - -zip_files=$(echo `ls *.zip 2>/dev/null`) -if [ -z "$zip_files" ]; then - echo "No *.zip files found, skipping unzip..." -else - for zip_file in $zip_files; - do - unzip -o $zip_file - done - echo "Unzip complete" -fi - -shp_files=$(echo `ls *.shp 2>/dev/null`) -if [ -z "$shp_files" ]; then - echo "No *.shp files found, skipping ogr2ogr..." -else - for shp_file in $shp_files; - do - echo `ls $shp_file` - # ogr2ogr -overwrite -progress -skipfailures -f "PostgreSQL" PG:"host=localhost user=postgres dbname=city" $shp_file - done -fi - pushd $DDL_FILES echo "Building ddl sql files now..." @@ -57,7 +30,4 @@ else done fi done -fi - -popd - +fi \ No newline at end of file diff --git a/unzip.sh b/unzip.sh new file mode 100755 index 0000000..63c9b40 --- /dev/null +++ b/unzip.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +CONFIGFILE="config/cityscrape-config.sh" + +. $CONFIGFILE + +echo "Running Cityscrape PostgreSQL Ingest" + +pushd $WORKDIR +echo "Unzipping files..." 
+ +echo `ls *.zip` | xargs -n 1 unzip -o +popd \ No newline at end of file diff --git a/upload-shapefiles-from-shapefile-manifest.sh b/upload-shapefiles-from-shapefile-manifest.sh new file mode 100755 index 0000000..fbaab37 --- /dev/null +++ b/upload-shapefiles-from-shapefile-manifest.sh @@ -0,0 +1,10 @@ +#!/bin/bash +CONFIGFILE="config/cityscrape-config.sh" + +. $CONFIGFILE + +pushd $WORKDIR +while read line; do + $line +done < "$SHAPEFILE_MANIFEST" +popd \ No newline at end of file From 204ad8aef377f2f738d2906b2af4a219de5b61e8 Mon Sep 17 00:00:00 2001 From: Dylan Raithel Date: Tue, 1 Sep 2015 21:31:38 -0700 Subject: [PATCH 2/3] accesses and runs file, needs type marshalling before succesful --- generate-schema-from-ddl.sh | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/generate-schema-from-ddl.sh b/generate-schema-from-ddl.sh index bf83c60..6822a5b 100755 --- a/generate-schema-from-ddl.sh +++ b/generate-schema-from-ddl.sh @@ -4,15 +4,27 @@ CONFIGFILE="config/cityscrape-config.sh" pushd $DDL_FILES -files=$(echo `ls *.mdb 2>/dev/null`) +mdb_files=$(echo `ls *.mdb 2>/dev/null`) -if [[ -z "$files" ]]; then - echo "No Schema Definitions Found, Exiting..." +if [[ -z "$mdb_files" ]]; then + echo "No MDB Schema Definitions Found, Exiting..." exit 3 else - for file in $files + for mdb in $mdb_files do - cat $file | tr -d "[]" > $file.sql + cat $mdb | tr -d "[]" > $mdb.sql + done +fi + +sql_files=$(echo `ls *.sql 2>/dev/null`) +if [[ -z "$sql_files" ]]; then + echo "No Postgres Schema DDL, Exiting..." 
+ exit 3 +else + for sql in $sql_files + do + psql -U vagrant -d city -a -w -f $sql + echo "[File: $sql] Completed Succesfully" done fi From 74718f45615e707c8beb136bf74b64c2cda75249 Mon Sep 17 00:00:00 2001 From: Dylan Raithel Date: Tue, 8 Sep 2015 20:07:52 -0700 Subject: [PATCH 3/3] ddl generation succeeded, generates tables --- generate-schema-from-ddl.sh | 4 +- marshall-datatypes.sh | 16 +++++++ run-cityscrape.sh | 19 +++++---- src/__init__.py | 0 src/marshall.py | 85 +++++++++++++++++++++++++++++++++++++ 5 files changed, 115 insertions(+), 9 deletions(-) create mode 100755 marshall-datatypes.sh create mode 100644 src/__init__.py create mode 100644 src/marshall.py diff --git a/generate-schema-from-ddl.sh b/generate-schema-from-ddl.sh index 6822a5b..e071c6a 100755 --- a/generate-schema-from-ddl.sh +++ b/generate-schema-from-ddl.sh @@ -6,6 +6,7 @@ pushd $DDL_FILES mdb_files=$(echo `ls *.mdb 2>/dev/null`) +# Create SQL files out of mdb files if [[ -z "$mdb_files" ]]; then echo "No MDB Schema Definitions Found, Exiting..." exit 3 @@ -16,7 +17,8 @@ else done fi -sql_files=$(echo `ls *.sql 2>/dev/null`) +# Pass SQL files to psql and execute against db +sql_files=$(echo `ls *postgres.sql 2>/dev/null`) if [[ -z "$sql_files" ]]; then echo "No Postgres Schema DDL, Exiting..." exit 3 diff --git a/marshall-datatypes.sh b/marshall-datatypes.sh new file mode 100755 index 0000000..8cfc968 --- /dev/null +++ b/marshall-datatypes.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +CONFIGFILE="config/cityscrape-config.sh" + +. $CONFIGFILE + +pushd $DDL_FILES + +sql=$(echo `ls *b.sql 2>/dev/null`) + +if [[ -z "$sql" ]]; then + echo "No DDL files found for marshalling, exiting..." + exit 3 +else + echo $sql | xargs -0 python ../../src/marshall.py +fi \ No newline at end of file diff --git a/run-cityscrape.sh b/run-cityscrape.sh index 7103ac8..a1c1129 100755 --- a/run-cityscrape.sh +++ b/run-cityscrape.sh @@ -4,17 +4,20 @@ CONFIGFILE="config/cityscrape-config.sh" . 
$CONFIGFILE -# echo "Step 1: Fetching Cityscrape data" -# ./get.sh +echo "Step 1: Fetching Cityscrape data" +./get.sh -# echo "Step 2: Unzipping archives" -# ./unzip.sh +echo "Step 2: Unzipping archives" +./unzip.sh -# echo "Step 3: Generating DDL files" -# ./generate-ddl.sh +echo "Step 3: Generating DDL files" +./generate-ddl.sh -# echo "Step 4: Generatign Shapefile load commands" -# ./generate-shapefile-manifest.sh +echo "Step 4: Generatign Shapefile load commands" +./generate-shapefile-manifest.sh + +echo "Step 5: DataType Marshalling with RegEx" +./marshall-datatypes.sh echo "Step 5: Generating Schema from ddl definitions" ./generate-schema-from-ddl.sh diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29
diff --git a/src/marshall.py b/src/marshall.py
new file mode 100644
index 0000000..a604d2f
--- /dev/null
+++ b/src/marshall.py
@@ -0,0 +1,120 @@
+'''
+@author dylan.raithel
+@date 2 Sep 2015
+
+"Marshall" the Access types to Postgresql
+
+'''
+
+import os
+import re
+import sys
+
+
+def test_typeMap():
+    '''
+    Smoke test: convert prcl.mdb.sql, which is assumed to already exist
+    in the ddl working directory.
+    '''
+    file_name = 'prcl.mdb.sql'
+
+    mapper = typeMap(file_name)
+
+    mapper.convert_file()
+
+
+class typeMap(object):
+    '''
+    Rewrite the Access column types in one ddl file as Postgresql types.
+
+    file_name is the bare name of a file inside PATH; the converted copy
+    is written next to it as file_name + "_" + SUFFIX.
+    '''
+
+    SUFFIX = 'postgres.sql'
+    # Resolved against this module's directory rather than the caller's
+    # working directory (see _ddl_dir).
+    PATH = '../workdir/ddl/'
+
+    def __init__(self, file_name):
+
+        self.file_name = file_name
+
+    def convert_file(self):
+        '''
+        Convert the single ddl file this instance was created for.
+        '''
+        self._handle_file(self.file_name)
+
+    def _ddl_dir(self):
+        '''
+        Return PATH as a normalised path anchored at this module's
+        directory, so the result does not depend on the caller's cwd.
+        '''
+        here = os.path.dirname(os.path.abspath(__file__))
+        return os.path.normpath(os.path.join(here, self.PATH))
+
+    def _handle_file(self, file_name):
+        '''
+        Read file_name from the ddl directory, replace every Access type
+        name with its Postgresql type and write the result beside the
+        input, which is where generate-schema-from-ddl.sh looks for
+        *postgres.sql files.
+
+        NOTE(review): the substitution is purely textual, so a column
+        whose name contains one of the type words is rewritten as well.
+        '''
+        self.mapper = self._access_to_postgres()
+        ddl_dir = self._ddl_dir()
+
+        # Longest names first so no alternative can shadow a longer one.
+        names = sorted(self.mapper, key=len, reverse=True)
+        pattern = re.compile("|".join(re.escape(name) for name in names))
+
+        with open(os.path.join(ddl_dir, file_name), 'r') as raw:
+            text_stream = raw.read()
+
+        text = pattern.sub(lambda m: self.mapper[m.group(0)], text_stream)
+
+        newfilename = '{}_{}'.format(file_name, self.SUFFIX)
+
+        with open(os.path.join(ddl_dir, newfilename), 'w') as newfile:
+            newfile.write(text)
+
+    def _access_to_postgres(self):
+        '''
+        Return a map of Access data types to Postgresql types.
+
+        Every listed type currently maps to Varchar.
+        '''
+        dictmap = {"Double": "Varchar", "Integer": "Varchar",
+                   "Byte": "Varchar", "Text (4)": "Varchar",
+                   "Long Integer": "Varchar", "DateTime": "Varchar",
+                   "Boolean NOT NULL": "Varchar",
+                   "Text (2)": "Varchar", "Single": "Varchar",
+                   "Text (22)": "Varchar",
+                   "Text (8)": "Varchar", "Text (80)": "Varchar",
+                   "Text (26)": "Varchar", "Text (18)": "Varchar",
+                   "Currency": "Varchar"}
+        return dictmap
+
+
+def main():
+    '''
+    Convert every ddl file named on the command line.
+
+    Each argument is split on whitespace because marshall-datatypes.sh
+    pipes a space separated list through "xargs -0", which passes the
+    whole list, trailing newline included, as a single argument.
+    '''
+    file_names = [name for arg in sys.argv[1:] for name in arg.split()]
+
+    if not file_names:
+        sys.exit('usage: marshall.py DDL_FILE [DDL_FILE ...]')
+
+    for file_name in file_names:
+        typeMap(file_name).convert_file()
+
+
+if __name__ == '__main__':
+    main()