From 0c91790df8fecabda39fd701a646e9060a4d947f Mon Sep 17 00:00:00 2001 From: madtibo Date: Thu, 16 Aug 2018 12:30:17 +0200 Subject: [PATCH 1/3] add foreign key support for users id and posts id using the "--foreign-keys" switch WARNING: when using the foreign keys option, some entries in votes and postlinks might be updated to enforce data integrity --- load_into_pg.py | 249 ++++++++++++++++++++++------------------- sql/Badges_fk.sql | 1 + sql/Comments_fk.sql | 2 + sql/Comments_post.sql | 2 +- sql/PostHistory_fk.sql | 2 + sql/PostLinks_fk.sql | 5 + sql/Posts_fk.sql | 3 + sql/Tags_fk.sql | 2 + sql/Users_fk.sql | 2 + sql/Votes_fk.sql | 4 + sql/Votes_pre.sql | 2 +- 11 files changed, 156 insertions(+), 118 deletions(-) create mode 100644 sql/Badges_fk.sql create mode 100644 sql/Comments_fk.sql create mode 100644 sql/PostHistory_fk.sql create mode 100644 sql/PostLinks_fk.sql create mode 100644 sql/Posts_fk.sql create mode 100644 sql/Tags_fk.sql create mode 100644 sql/Users_fk.sql create mode 100644 sql/Votes_fk.sql diff --git a/load_into_pg.py b/load_into_pg.py index 33be75c..6765c37 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -30,7 +30,7 @@ def _createCmdTuple(cursor, keys, templ, attribs, insertJson): """Use the cursor to mogrify a tuple of data. The passed data in `attribs` is augmented with default data (NULLs) and the order of data in the tuple is the same as in the list of `keys`. The - `cursor` is used toe mogrify the data and the `templ` is the template used + `cursor` is used to mogrify the data and the `templ` is the template used for the mogrification. 
""" defs = _makeDefValues(keys) @@ -45,8 +45,114 @@ def _createCmdTuple(cursor, keys, templ, attribs, insertJson): values_to_insert = cursor.mogrify(templ, defs) return cursor.mogrify(templ, defs) -def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword): +def _getTableKeys(table): + """Return an array of the keys for a given table""" + keys = None + if table == 'Users': + keys = [ + 'Id' + , 'Reputation' + , 'CreationDate' + , 'DisplayName' + , 'LastAccessDate' + , 'WebsiteUrl' + , 'Location' + , 'AboutMe' + , 'Views' + , 'UpVotes' + , 'DownVotes' + , 'ProfileImageUrl' + , 'Age' + , 'AccountId' + ] + elif table == 'Badges': + keys = [ + 'Id' + , 'UserId' + , 'Name' + , 'Date' + ] + elif table == 'PostLinks': + keys = [ + 'Id' + , 'CreationDate' + , 'PostId' + , 'RelatedPostId' + , 'LinkTypeId' + ] + elif table == 'Comments': + keys = [ + 'Id' + , 'PostId' + , 'Score' + , 'Text' + , 'CreationDate' + , 'UserId' + ] + elif table == 'Votes': + keys = [ + 'Id' + , 'PostId' + , 'VoteTypeId' + , 'UserId' + , 'CreationDate' + , 'BountyAmount' + ] + elif table == 'Posts': + keys = [ + 'Id' + , 'PostTypeId' + , 'AcceptedAnswerId' + , 'ParentId' + , 'CreationDate' + , 'Score' + , 'ViewCount' + , 'Body' + , 'OwnerUserId' + , 'LastEditorUserId' + , 'LastEditorDisplayName' + , 'LastEditDate' + , 'LastActivityDate' + , 'Title' + , 'Tags' + , 'AnswerCount' + , 'CommentCount' + , 'FavoriteCount' + , 'ClosedDate' + , 'CommunityOwnedDate' + ] + elif table == 'Tags': + keys = [ + 'Id' + , 'TagName' + , 'Count' + , 'ExcerptPostId' + , 'WikiPostId' + ] + elif table == 'PostHistory': + keys = [ + 'Id', + 'PostHistoryTypeId', + 'PostId', + 'RevisionGUID', + 'CreationDate', + 'UserId', + 'Text' + ] + elif table == 'Comments': + keys = [ + 'Id', + 'PostId', + 'Score', + 'Text', + 'CreationDate', + 'UserId', + ] + return keys + +def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword): """Handle the table including the 
post/pre processing.""" + keys = _getTableKeys(table) dbFile = mbDbFile if mbDbFile is not None else table + '.xml' tmpl = _createMogrificationTemplate(table, keys, insertJson) start_time = time.time() @@ -54,8 +160,9 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse try: pre = open('./sql/' + table + '_pre.sql').read() post = open('./sql/' + table + '_post.sql').read() + fk = open('./sql/' + table + '_fk.sql').read() except IOError as e: - six.print_("Could not load pre/post sql. Are you running from the correct path?", file=sys.stderr) + six.print_("Could not load pre/post/fk sql. Are you running from the correct path?", file=sys.stderr) sys.exit(-1) dbConnectionParam = "dbname={}".format(dbname) @@ -74,6 +181,7 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse if mbPassword is not None: dbConnectionParam += ' password={}'.format(mbPassword) + try: with pg.connect(dbConnectionParam) as conn: with conn.cursor() as cur: @@ -95,13 +203,12 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse for row_attribs in rows ] ) - if len(valuesStr) > 0: cmd = 'INSERT INTO ' + table + \ ' VALUES\n' + valuesStr + ';' cur.execute(cmd) conn.commit() - six.print_('Table processing took {:.1f} seconds'.format(time.time() - start_time)) + six.print_('Table processing took {1:.1f} seconds'.format(table, time.time() - start_time)) # Post-processing (creation of indexes) start_time = time.time() @@ -110,6 +217,14 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse cur.execute(post) conn.commit() six.print_('Post processing took {} seconds'.format(time.time() - start_time)) + if createFk: + # fk-processing (creation of foreign keys) + start_time = time.time() + six.print_('fk processing ...') + if post != '': + cur.execute(fk) + conn.commit() + six.print_('fk processing took {} seconds'.format(time.time() - start_time)) except IOError as e: six.print_("Could 
not read from file {}.".format(dbFile), file=sys.stderr) @@ -122,8 +237,6 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse six.print_("Warning from the database.", file=sys.stderr) six.print_("pg.Warning: {0}".format(str(w)), file=sys.stderr) - - ############################################################# parser = argparse.ArgumentParser() @@ -173,116 +286,16 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse , action = 'store_true' , default = False ) -args = parser.parse_args() -table = args.table -keys = None - -if table == 'Users': - keys = [ - 'Id' - , 'Reputation' - , 'CreationDate' - , 'DisplayName' - , 'LastAccessDate' - , 'WebsiteUrl' - , 'Location' - , 'AboutMe' - , 'Views' - , 'UpVotes' - , 'DownVotes' - , 'ProfileImageUrl' - , 'Age' - , 'AccountId' - ] -elif table == 'Badges': - keys = [ - 'Id' - , 'UserId' - , 'Name' - , 'Date' - ] -elif table == 'PostLinks': - keys = [ - 'Id' - , 'CreationDate' - , 'PostId' - , 'RelatedPostId' - , 'LinkTypeId' - ] -elif table == 'Comments': - keys = [ - 'Id' - , 'PostId' - , 'Score' - , 'Text' - , 'CreationDate' - , 'UserId' - ] -elif table == 'Votes': - keys = [ - 'Id' - , 'PostId' - , 'VoteTypeId' - , 'UserId' - , 'CreationDate' - , 'BountyAmount' - ] -elif table == 'Posts': - keys = [ - 'Id' - , 'PostTypeId' - , 'AcceptedAnswerId' - , 'ParentId' - , 'CreationDate' - , 'Score' - , 'ViewCount' - , 'Body' - , 'OwnerUserId' - , 'LastEditorUserId' - , 'LastEditorDisplayName' - , 'LastEditDate' - , 'LastActivityDate' - , 'Title' - , 'Tags' - , 'AnswerCount' - , 'CommentCount' - , 'FavoriteCount' - , 'ClosedDate' - , 'CommunityOwnedDate' - ] +parser.add_argument( '--foreign-keys' + , help = 'Create foreign keys.' 
+ , action = 'store_true' + , default = False + ) - # If the user has not explicitly asked for loading the body, we replace it with NULL - if not args.with_post_body: - specialRules[('Posts', 'Body')] = 'NULL' +args = parser.parse_args() -elif table == 'Tags': - keys = [ - 'Id' - , 'TagName' - , 'Count' - , 'ExcerptPostId' - , 'WikiPostId' - ] -elif table == 'PostHistory': - keys = [ - 'Id', - 'PostHistoryTypeId', - 'PostId', - 'RevisionGUID', - 'CreationDate', - 'UserId', - 'Text' - ] -elif table == 'Comments': - keys = [ - 'Id', - 'PostId', - 'Score', - 'Text', - 'CreationDate', - 'UserId', - ] +table = args.table try: # Python 2/3 compatibility @@ -290,10 +303,14 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse except NameError: pass -choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table)) +if table == 'Posts': + # If the user has not explicitly asked for loading the body, we replace it with NULL + if not args.with_post_body: + specialRules[('Posts', 'Body')] = 'NULL' + +choice = input('This will drop the {} table. 
Are you sure [y/n]?'.format(table)) if len(choice) > 0 and choice[0].lower() == 'y': handleTable(table, keys, args.insert_json, args.dbname, args.file, args.host, args.port, args.username, args.password) else: six.print_("Cancelled.") - diff --git a/sql/Badges_fk.sql b/sql/Badges_fk.sql new file mode 100644 index 0000000..b5a4e3f --- /dev/null +++ b/sql/Badges_fk.sql @@ -0,0 +1 @@ +ALTER TABLE badges ADD CONSTRAINT fk_badges_userid FOREIGN KEY (userid) REFERENCES users (id); diff --git a/sql/Comments_fk.sql b/sql/Comments_fk.sql new file mode 100644 index 0000000..aea00c9 --- /dev/null +++ b/sql/Comments_fk.sql @@ -0,0 +1,2 @@ +ALTER TABLE Comments ADD CONSTRAINT fk_comments_userid FOREIGN KEY (userid) REFERENCES users (id); +ALTER TABLE Comments ADD CONSTRAINT fk_comments_postid FOREIGN KEY (postid) REFERENCES posts (id); diff --git a/sql/Comments_post.sql b/sql/Comments_post.sql index e19e8b8..2c3e7a2 100644 --- a/sql/Comments_post.sql +++ b/sql/Comments_post.sql @@ -6,4 +6,4 @@ CREATE INDEX cmnts_postid_idx ON Comments USING hash (PostId) CREATE INDEX cmnts_creation_date_idx ON Comments USING btree (CreationDate) WITH (FILLFACTOR = 100); CREATE INDEX cmnts_userid_idx ON Comments USING btree (UserId) - WITH (FILLFACTOR = 100); \ No newline at end of file + WITH (FILLFACTOR = 100); diff --git a/sql/PostHistory_fk.sql b/sql/PostHistory_fk.sql new file mode 100644 index 0000000..91379eb --- /dev/null +++ b/sql/PostHistory_fk.sql @@ -0,0 +1,2 @@ +ALTER TABLE Posthistory ADD CONSTRAINT fk_posthistory_userid FOREIGN KEY (userid) REFERENCES users (id); +ALTER TABLE Posthistory ADD CONSTRAINT fk_posthistory_postid FOREIGN KEY (postid) REFERENCES posts (id); diff --git a/sql/PostLinks_fk.sql b/sql/PostLinks_fk.sql new file mode 100644 index 0000000..7d01d03 --- /dev/null +++ b/sql/PostLinks_fk.sql @@ -0,0 +1,5 @@ +-- impossible to enforce so set NULL +UPDATE Postlinks SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts); +ALTER TABLE Postlinks ADD 
CONSTRAINT fk_postlinks_postid FOREIGN KEY (postid) REFERENCES posts (id); +UPDATE Postlinks SET relatedpostid=NULL WHERE relatedpostid NOT IN (SELECT DISTINCT id FROM Posts); +ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_relatedpostid FOREIGN KEY (relatedpostid) REFERENCES posts (id); diff --git a/sql/Posts_fk.sql b/sql/Posts_fk.sql new file mode 100644 index 0000000..65fea37 --- /dev/null +++ b/sql/Posts_fk.sql @@ -0,0 +1,3 @@ +ALTER TABLE Posts ADD CONSTRAINT fk_posts_parentid FOREIGN KEY (parentid) REFERENCES posts (id); +ALTER TABLE Posts ADD CONSTRAINT fk_posts_owneruserid FOREIGN KEY (owneruserid) REFERENCES users (id); +ALTER TABLE Posts ADD CONSTRAINT fk_posts_lasteditoruserid FOREIGN KEY (lasteditoruserid) REFERENCES users (id); diff --git a/sql/Tags_fk.sql b/sql/Tags_fk.sql new file mode 100644 index 0000000..ca4ca40 --- /dev/null +++ b/sql/Tags_fk.sql @@ -0,0 +1,2 @@ +-- dummy query +SELECT 1; diff --git a/sql/Users_fk.sql b/sql/Users_fk.sql new file mode 100644 index 0000000..ca4ca40 --- /dev/null +++ b/sql/Users_fk.sql @@ -0,0 +1,2 @@ +-- dummy query +SELECT 1; diff --git a/sql/Votes_fk.sql b/sql/Votes_fk.sql new file mode 100644 index 0000000..37cdfb0 --- /dev/null +++ b/sql/Votes_fk.sql @@ -0,0 +1,4 @@ +ALTER TABLE Votes ADD CONSTRAINT fk_votes_userid FOREIGN KEY (userid) REFERENCES users (id); +-- impossible to enforce so set NULL +UPDATE Votes SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts); +ALTER TABLE Votes ADD CONSTRAINT fk_votes_postid FOREIGN KEY (postid) REFERENCES posts (id); diff --git a/sql/Votes_pre.sql b/sql/Votes_pre.sql index 29aebe0..3ed0b53 100644 --- a/sql/Votes_pre.sql +++ b/sql/Votes_pre.sql @@ -1,7 +1,7 @@ DROP TABLE IF EXISTS Votes CASCADE; CREATE TABLE Votes ( Id int PRIMARY KEY , - PostId int not NULL , + PostId int , -- not NULL , VoteTypeId int not NULL , UserId int , CreationDate timestamp not NULL , From b583f1f0f13f1cce9c9ed70595ff9e4c95a478ae Mon Sep 17 00:00:00 2001 From: madtibo Date: Mon, 21 
Jan 2019 14:31:51 +0100 Subject: [PATCH 2/3] Do not force constraint validation by setting them as 'not valid' --- load_into_pg.py | 6 +++--- sql/PostLinks_fk.sql | 18 +++++++++++++----- sql/Votes_fk.sql | 12 +++++++++--- sql/Votes_pre.sql | 2 +- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/load_into_pg.py b/load_into_pg.py index 6765c37..c452169 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -150,7 +150,7 @@ def _getTableKeys(table): ] return keys -def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword): +def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword): """Handle the table including the post/pre processing.""" keys = _getTableKeys(table) dbFile = mbDbFile if mbDbFile is not None else table + '.xml' @@ -309,8 +309,8 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas if not args.with_post_body: specialRules[('Posts', 'Body')] = 'NULL' -choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table)) +choice = input('This will drop the {} table. Are you sure [y/n]? 
'.format(table)) if len(choice) > 0 and choice[0].lower() == 'y': - handleTable(table, keys, args.insert_json, args.dbname, args.file, args.host, args.port, args.username, args.password) + handleTable(table, args.insert_json, args.foreign_keys, args.dbname, args.file, args.host, args.port, args.username, args.password) else: six.print_("Cancelled.") diff --git a/sql/PostLinks_fk.sql b/sql/PostLinks_fk.sql index 7d01d03..5c40cb4 100644 --- a/sql/PostLinks_fk.sql +++ b/sql/PostLinks_fk.sql @@ -1,5 +1,13 @@ --- impossible to enforce so set NULL -UPDATE Postlinks SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts); -ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_postid FOREIGN KEY (postid) REFERENCES posts (id); -UPDATE Postlinks SET relatedpostid=NULL WHERE relatedpostid NOT IN (SELECT DISTINCT id FROM Posts); -ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_relatedpostid FOREIGN KEY (relatedpostid) REFERENCES posts (id); +-- impossible to enforce these constraints, set as 'not valid' to disable +-- initial test. 
+-- +-- These constraints can be enforced by running the following queries: +-- ALTER TABLE postlinks ALTER postid DROP NOT NULL; +-- UPDATE postlinks SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts); +-- ALTER TABLE postlinks VALIDATE CONSTRAINT fk_postlinks_postid; +-- ALTER TABLE postlinks ALTER relatedpostid DROP NOT NULL; +-- UPDATE postlinks SET relatedpostid=NULL WHERE relatedpostid NOT IN (SELECT DISTINCT id FROM Posts); +-- ALTER TABLE postlinks VALIDATE CONSTRAINT fk_postlinks_relatedpostid; +-- +ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_postid FOREIGN KEY (postid) REFERENCES posts (id) NOT VALID; +ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_relatedpostid FOREIGN KEY (relatedpostid) REFERENCES posts (id) NOT VALID; diff --git a/sql/Votes_fk.sql b/sql/Votes_fk.sql index 37cdfb0..a52a2a1 100644 --- a/sql/Votes_fk.sql +++ b/sql/Votes_fk.sql @@ -1,4 +1,10 @@ ALTER TABLE Votes ADD CONSTRAINT fk_votes_userid FOREIGN KEY (userid) REFERENCES users (id); --- impossible to enforce so set NULL -UPDATE Votes SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts); -ALTER TABLE Votes ADD CONSTRAINT fk_votes_postid FOREIGN KEY (postid) REFERENCES posts (id); +-- impossible to enforce this constraint, set as 'not valid' to disable +-- initial test. 
+-- +-- This constraint can be enforced by running the following queries: +-- ALTER TABLE votes ALTER PostId DROP NOT NULL; +-- UPDATE votes SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts); +-- ALTER TABLE votes VALIDATE CONSTRAINT fk_votes_postid; +-- +ALTER TABLE Votes ADD CONSTRAINT fk_votes_postid FOREIGN KEY (postid) REFERENCES posts (id) NOT VALID; diff --git a/sql/Votes_pre.sql b/sql/Votes_pre.sql index 3ed0b53..29aebe0 100644 --- a/sql/Votes_pre.sql +++ b/sql/Votes_pre.sql @@ -1,7 +1,7 @@ DROP TABLE IF EXISTS Votes CASCADE; CREATE TABLE Votes ( Id int PRIMARY KEY , - PostId int , -- not NULL , + PostId int not NULL , VoteTypeId int not NULL , UserId int , CreationDate timestamp not NULL , From ce68bb18ffcf40c7a2022db812d53294e3823c08 Mon Sep 17 00:00:00 2001 From: madtibo Date: Wed, 23 Jan 2019 09:24:55 +0100 Subject: [PATCH 3/3] log table name --- load_into_pg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/load_into_pg.py b/load_into_pg.py index c452169..66b651d 100755 --- a/load_into_pg.py +++ b/load_into_pg.py @@ -208,7 +208,7 @@ def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, m ' VALUES\n' + valuesStr + ';' cur.execute(cmd) conn.commit() - six.print_('Table processing took {1:.1f} seconds'.format(table, time.time() - start_time)) + six.print_('Table {0} processing took {1:.1f} seconds'.format(table, time.time() - start_time)) # Post-processing (creation of indexes) start_time = time.time()