Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add foreign key support for users id and posts id #8

Merged
merged 3 commits into from
Jan 23, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
251 changes: 134 additions & 117 deletions load_into_pg.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def _createCmdTuple(cursor, keys, templ, attribs, insertJson):
"""Use the cursor to mogrify a tuple of data.
The passed data in `attribs` is augmented with default data (NULLs) and the
order of data in the tuple is the same as in the list of `keys`. The
`cursor` is used toe mogrify the data and the `templ` is the template used
`cursor` is used to mogrify the data and the `templ` is the template used
for the mogrification.
"""
defs = _makeDefValues(keys)
Expand All @@ -45,17 +45,124 @@ def _createCmdTuple(cursor, keys, templ, attribs, insertJson):
values_to_insert = cursor.mogrify(templ, defs)
return cursor.mogrify(templ, defs)

def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword):
def _getTableKeys(table):
"""Return an array of the keys for a given table"""
keys = None
if table == 'Users':
keys = [
'Id'
, 'Reputation'
, 'CreationDate'
, 'DisplayName'
, 'LastAccessDate'
, 'WebsiteUrl'
, 'Location'
, 'AboutMe'
, 'Views'
, 'UpVotes'
, 'DownVotes'
, 'ProfileImageUrl'
, 'Age'
, 'AccountId'
]
elif table == 'Badges':
keys = [
'Id'
, 'UserId'
, 'Name'
, 'Date'
]
elif table == 'PostLinks':
keys = [
'Id'
, 'CreationDate'
, 'PostId'
, 'RelatedPostId'
, 'LinkTypeId'
]
elif table == 'Comments':
keys = [
'Id'
, 'PostId'
, 'Score'
, 'Text'
, 'CreationDate'
, 'UserId'
]
elif table == 'Votes':
keys = [
'Id'
, 'PostId'
, 'VoteTypeId'
, 'UserId'
, 'CreationDate'
, 'BountyAmount'
]
elif table == 'Posts':
keys = [
'Id'
, 'PostTypeId'
, 'AcceptedAnswerId'
, 'ParentId'
, 'CreationDate'
, 'Score'
, 'ViewCount'
, 'Body'
, 'OwnerUserId'
, 'LastEditorUserId'
, 'LastEditorDisplayName'
, 'LastEditDate'
, 'LastActivityDate'
, 'Title'
, 'Tags'
, 'AnswerCount'
, 'CommentCount'
, 'FavoriteCount'
, 'ClosedDate'
, 'CommunityOwnedDate'
]
elif table == 'Tags':
keys = [
'Id'
, 'TagName'
, 'Count'
, 'ExcerptPostId'
, 'WikiPostId'
]
elif table == 'PostHistory':
keys = [
'Id',
'PostHistoryTypeId',
'PostId',
'RevisionGUID',
'CreationDate',
'UserId',
'Text'
]
elif table == 'Comments':
keys = [
'Id',
'PostId',
'Score',
'Text',
'CreationDate',
'UserId',
]
return keys

def handleTable(table, insertJson, createFk, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword):
"""Handle the table including the post/pre processing."""
keys = _getTableKeys(table)
dbFile = mbDbFile if mbDbFile is not None else table + '.xml'
tmpl = _createMogrificationTemplate(table, keys, insertJson)
start_time = time.time()

try:
pre = open('./sql/' + table + '_pre.sql').read()
post = open('./sql/' + table + '_post.sql').read()
fk = open('./sql/' + table + '_fk.sql').read()
except IOError as e:
six.print_("Could not load pre/post sql. Are you running from the correct path?", file=sys.stderr)
six.print_("Could not load pre/post/fk sql. Are you running from the correct path?", file=sys.stderr)
sys.exit(-1)

dbConnectionParam = "dbname={}".format(dbname)
Expand All @@ -74,6 +181,7 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse
if mbPassword is not None:
dbConnectionParam += ' password={}'.format(mbPassword)


try:
with pg.connect(dbConnectionParam) as conn:
with conn.cursor() as cur:
Expand All @@ -95,13 +203,12 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse
for row_attribs in rows
]
)

if len(valuesStr) > 0:
cmd = 'INSERT INTO ' + table + \
' VALUES\n' + valuesStr + ';'
cur.execute(cmd)
conn.commit()
six.print_('Table processing took {:.1f} seconds'.format(time.time() - start_time))
six.print_('Table processing took {1:.1f} seconds'.format(table, time.time() - start_time))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not 'Processing table {} took ...' ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might have had the idea to log it. Well spotted!
I added it.


# Post-processing (creation of indexes)
start_time = time.time()
Expand All @@ -110,6 +217,14 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse
cur.execute(post)
conn.commit()
six.print_('Post processing took {} seconds'.format(time.time() - start_time))
if createFk:
# fk-processing (creation of foreign keys)
start_time = time.time()
six.print_('fk processing ...')
if post != '':
cur.execute(fk)
conn.commit()
six.print_('fk processing took {} seconds'.format(time.time() - start_time))

except IOError as e:
six.print_("Could not read from file {}.".format(dbFile), file=sys.stderr)
Expand All @@ -122,8 +237,6 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse
six.print_("Warning from the database.", file=sys.stderr)
six.print_("pg.Warning: {0}".format(str(w)), file=sys.stderr)



#############################################################

parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -173,127 +286,31 @@ def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUse
, action = 'store_true'
, default = False
)
args = parser.parse_args()

table = args.table
keys = None

if table == 'Users':
keys = [
'Id'
, 'Reputation'
, 'CreationDate'
, 'DisplayName'
, 'LastAccessDate'
, 'WebsiteUrl'
, 'Location'
, 'AboutMe'
, 'Views'
, 'UpVotes'
, 'DownVotes'
, 'ProfileImageUrl'
, 'Age'
, 'AccountId'
]
elif table == 'Badges':
keys = [
'Id'
, 'UserId'
, 'Name'
, 'Date'
]
elif table == 'PostLinks':
keys = [
'Id'
, 'CreationDate'
, 'PostId'
, 'RelatedPostId'
, 'LinkTypeId'
]
elif table == 'Comments':
keys = [
'Id'
, 'PostId'
, 'Score'
, 'Text'
, 'CreationDate'
, 'UserId'
]
elif table == 'Votes':
keys = [
'Id'
, 'PostId'
, 'VoteTypeId'
, 'UserId'
, 'CreationDate'
, 'BountyAmount'
]
elif table == 'Posts':
keys = [
'Id'
, 'PostTypeId'
, 'AcceptedAnswerId'
, 'ParentId'
, 'CreationDate'
, 'Score'
, 'ViewCount'
, 'Body'
, 'OwnerUserId'
, 'LastEditorUserId'
, 'LastEditorDisplayName'
, 'LastEditDate'
, 'LastActivityDate'
, 'Title'
, 'Tags'
, 'AnswerCount'
, 'CommentCount'
, 'FavoriteCount'
, 'ClosedDate'
, 'CommunityOwnedDate'
]
parser.add_argument( '--foreign-keys'
, help = 'Create foreign keys.'
, action = 'store_true'
, default = False
)

# If the user has not explicitly asked for loading the body, we replace it with NULL
if not args.with_post_body:
specialRules[('Posts', 'Body')] = 'NULL'
args = parser.parse_args()

elif table == 'Tags':
keys = [
'Id'
, 'TagName'
, 'Count'
, 'ExcerptPostId'
, 'WikiPostId'
]
elif table == 'PostHistory':
keys = [
'Id',
'PostHistoryTypeId',
'PostId',
'RevisionGUID',
'CreationDate',
'UserId',
'Text'
]
elif table == 'Comments':
keys = [
'Id',
'PostId',
'Score',
'Text',
'CreationDate',
'UserId',
]
table = args.table

try:
# Python 2/3 compatibility
input = raw_input
except NameError:
pass

choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table))

if table == 'Posts':
# If the user has not explicitly asked for loading the body, we replace it with NULL
if not args.with_post_body:
specialRules[('Posts', 'Body')] = 'NULL'

choice = input('This will drop the {} table. Are you sure [y/n]? '.format(table))
if len(choice) > 0 and choice[0].lower() == 'y':
handleTable(table, keys, args.insert_json, args.dbname, args.file, args.host, args.port, args.username, args.password)
handleTable(table, args.insert_json, args.foreign_keys, args.dbname, args.file, args.host, args.port, args.username, args.password)
else:
six.print_("Cancelled.")

1 change: 1 addition & 0 deletions sql/Badges_fk.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ALTER TABLE badges ADD CONSTRAINT fk_badges_userid FOREIGN KEY (userid) REFERENCES users (id);
2 changes: 2 additions & 0 deletions sql/Comments_fk.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ALTER TABLE Comments ADD CONSTRAINT fk_comments_userid FOREIGN KEY (userid) REFERENCES users (id);
ALTER TABLE Comments ADD CONSTRAINT fk_comments_postid FOREIGN KEY (postid) REFERENCES posts (id);
2 changes: 1 addition & 1 deletion sql/Comments_post.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ CREATE INDEX cmnts_postid_idx ON Comments USING hash (PostId)
CREATE INDEX cmnts_creation_date_idx ON Comments USING btree (CreationDate)
WITH (FILLFACTOR = 100);
CREATE INDEX cmnts_userid_idx ON Comments USING btree (UserId)
WITH (FILLFACTOR = 100);
WITH (FILLFACTOR = 100);
2 changes: 2 additions & 0 deletions sql/PostHistory_fk.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ALTER TABLE Posthistory ADD CONSTRAINT fk_posthistory_userid FOREIGN KEY (userid) REFERENCES users (id);
ALTER TABLE Posthistory ADD CONSTRAINT fk_posthistory_postid FOREIGN KEY (postid) REFERENCES posts (id);
13 changes: 13 additions & 0 deletions sql/PostLinks_fk.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- impossible to enforce these constraints, set as 'not valid' to disable
-- initial test.
--
-- These constaints can be forced running the following queries:
-- ALTER TABLE postlinks ALTER postid DROP NOT NULL;
-- UPDATE postlinks SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts);
-- ALTER TABLE postlinks VALIDATE CONSTRAINT fk_postlinks_postid;
-- ALTER TABLE postlinks ALTER relatedpostid DROP NOT NULL;
-- UPDATE postlinks SET relatedpostid=NULL WHERE relatedpostid NOT IN (SELECT DISTINCT id FROM Posts);
-- ALTER TABLE postlinks VALIDATE CONSTRAINT fk_postlinks_relatedpostid;
--
ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_postid FOREIGN KEY (postid) REFERENCES posts (id) NOT VALID;
ALTER TABLE Postlinks ADD CONSTRAINT fk_postlinks_relatedpostid FOREIGN KEY (relatedpostid) REFERENCES posts (id) NOT VALID;
3 changes: 3 additions & 0 deletions sql/Posts_fk.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ALTER TABLE Posts ADD CONSTRAINT fk_posts_parentid FOREIGN KEY (parentid) REFERENCES posts (id);
ALTER TABLE Posts ADD CONSTRAINT fk_posts_owneruserid FOREIGN KEY (owneruserid) REFERENCES users (id);
ALTER TABLE Posts ADD CONSTRAINT fk_posts_lasteditoruserid FOREIGN KEY (lasteditoruserid) REFERENCES users (id);
2 changes: 2 additions & 0 deletions sql/Tags_fk.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- dummy query
SELECT 1;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this here because Postgres complains if we try to run just a file with a comment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, that is the reason.

2 changes: 2 additions & 0 deletions sql/Users_fk.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- dummy query
SELECT 1;
10 changes: 10 additions & 0 deletions sql/Votes_fk.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
ALTER TABLE Votes ADD CONSTRAINT fk_votes_userid FOREIGN KEY (userid) REFERENCES users (id);
-- impossible to enforce this constraint, set as 'not valid' to disable
-- initial test.
--
-- This constaint can be forced running the following queries:
-- ALTER TABLE votes ALTER PostId DROP NOT NULL;
-- UPDATE votes SET postid=NULL WHERE postid NOT IN (SELECT DISTINCT id FROM Posts);
-- ALTER TABLE votes VALIDATE CONSTRAINT fk_votes_postid;
--
ALTER TABLE Votes ADD CONSTRAINT fk_votes_postid FOREIGN KEY (postid) REFERENCES posts (id) NOT VALID;