Skip to content

Commit 619cadd

Browse files
committed
download and load a complete stackexchange project
Using the '-s' switch, download the compressed file from _https://ia800107.us.archive.org/27/items/stackexchange/_, then uncompress it and load all the files into the database. Add a '-n' switch to move the tables to a given schema. WARNING: because the script now uses the urllib.request module, it is set to run with python3.
1 parent ab92755 commit 619cadd

File tree

2 files changed

+164
-39
lines changed

2 files changed

+164
-39
lines changed

README.md

+22-8
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch
1818
`Badges.xml`, `Votes.xml`, `Posts.xml`, `Users.xml`, `Tags.xml`.
1919
- In some old dumps, the cases in the filenames are different.
2020
- Execute in the current folder (in parallel, if desired):
21-
- `python load_into_pg.py Badges`
22-
- `python load_into_pg.py Posts`
23-
- `python load_into_pg.py Tags` (not present in earliest dumps)
24-
- `python load_into_pg.py Users`
25-
- `python load_into_pg.py Votes`
26-
- `python load_into_pg.py PostLinks`
27-
- `python load_into_pg.py PostHistory`
28-
- `python load_into_pg.py Comments`
21+
- `python load_into_pg.py -t Badges`
22+
- `python load_into_pg.py -t Posts`
23+
- `python load_into_pg.py -t Tags` (not present in earliest dumps)
24+
- `python load_into_pg.py -t Users`
25+
- `python load_into_pg.py -t Votes`
26+
- `python load_into_pg.py -t PostLinks`
27+
- `python load_into_pg.py -t PostHistory`
28+
- `python load_into_pg.py -t Comments`
2929
- Finally, after all the initial tables have been created:
3030
- `psql stackoverflow < ./sql/final_post.sql`
3131
- If you used a different database name, make sure to use that instead of
@@ -34,6 +34,20 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch
3434
- `psql stackoverflow < ./sql/optional_post.sql`
3535
- Again, remember to use the correct database name here, if not `stackoverflow`.
3636

37+
## Loading a complete stackexchange project
38+
39+
You can use the script to download a given stackexchange compressed file from [archive.org](https://ia800107.us.archive.org/27/items/stackexchange/) and load all the tables at once, using the `-s` switch.
40+
41+
You will need the `urllib` and `libarchive` modules.
42+
43+
If you give a schema name using the `-n` switch, all the tables will be moved to the given schema. The script creates this schema if it does not already exist.
44+
45+
To load the _dba.stackexchange.com_ project in the `dba` schema, you would execute:
46+
`./load_into_pg.py -s dba -n dba`
47+
48+
The paths are not changed in the final scripts `sql/final_post.sql` and `sql/optional_post.sql`. To run them, first set the _search_path_ to your schema name:
49+
`SET search_path TO <myschema>;`
50+
3751
## Caveats and TODOs
3852

3953
- It prepares some indexes and views which may not be necessary for your analysis.

load_into_pg.py

+142-31
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22
import sys
33
import time
44
import argparse
55
import psycopg2 as pg
6+
import os
67
import row_processor as Processor
78
import six
89

@@ -11,6 +12,51 @@
1112
('Posts', 'ViewCount'): "NULLIF(%(ViewCount)s, '')::int"
1213
}
1314

15+
# part of the file already downloaded
16+
file_part = None
17+
18+
def show_progress(block_num, block_size, total_size):
    """Display the total size of the file to download and the progress in percent.

    The signature is the one expected for the ``reporthook`` argument of
    ``urllib.request.urlretrieve``. The module-level ``file_part`` holds the
    last percentage that was printed, or ``None`` when no download is being
    tracked.

    block_num  -- number of blocks transferred so far
    block_size -- size of one block, in bytes
    total_size -- total size of the file, in bytes; a negative value means
                  the size is unknown and no percentage is displayed
    """
    global file_part

    # A download always starts with block_num == 0, so the header is printed
    # again even if a previous download never reached its end.
    if file_part is None or block_num == 0:
        if total_size < 0:
            six.print_('Total file size is unknown, the progress cannot be displayed.')
        else:
            suffixes = ['B', 'KB', 'MB', 'GB', 'TB']
            suffixIndex = 0
            pp_size = float(total_size)
            # stop at the last suffix instead of indexing past the list
            while pp_size >= 1024 and suffixIndex < len(suffixes) - 1:
                suffixIndex += 1
                pp_size = pp_size / 1024.0
            six.print_('Total file size is: {0:.1f} {1}'.format(pp_size, suffixes[suffixIndex]))
            six.print_("0 % of the file downloaded ...\r", end="", flush=True)
        file_part = 0

    if total_size < 0:
        # without a total size there is no percentage to compute
        return

    downloaded = block_num * block_size
    if downloaded < total_size:
        percent = 100.0 * downloaded / total_size
        # only refresh the line when more than one percent was gained
        if percent - file_part > 1:
            file_part = percent
            six.print_("{0} % of the file downloaded ...\r".format(int(percent)), end="", flush=True)
    else:
        # download finished: reset the state and terminate the progress line
        file_part = None
        six.print_("")
42+
def buildConnectionString(dbname, mbHost, mbPort, mbUsername, mbPassword):
    """Build a libpq keyword/value connection string.

    dbname     -- name of the database (always emitted)
    mbHost     -- host name, or None to leave it out
    mbPort     -- port, or None to leave it out
    mbUsername -- user name, or None to leave it out
    mbPassword -- password, or None to leave it out

    Returns a string such as ``dbname=so port=5432 host=localhost``.
    Values that are empty or contain whitespace, a single quote or a
    backslash are wrapped in single quotes, with quotes and backslashes
    escaped by a backslash, as the libpq connection string syntax requires.
    Other values are emitted unchanged.
    """
    def _quote(value):
        text = str(value)
        needs_quotes = (not text) or any(ch.isspace() or ch in "'\\" for ch in text)
        if not needs_quotes:
            return text
        # escape backslashes first so the ones added for quotes are not doubled
        return "'" + text.replace('\\', '\\\\').replace("'", "\\'") + "'"

    dbConnectionParam = "dbname={}".format(_quote(dbname))

    optional = (
        ('port', mbPort),
        ('host', mbHost),
        ('user', mbUsername),
        ('password', mbPassword),
    )
    for keyword, value in optional:
        if value is not None:
            dbConnectionParam += ' {}={}'.format(keyword, _quote(value))

    return dbConnectionParam
59+
1460
def _makeDefValues(keys):
1561
"""Returns a dictionary containing None for all keys."""
1662
return dict(( (k, None) for k in keys ))
@@ -141,7 +187,7 @@ def _getTableKeys(table):
141187
]
142188
return keys
143189

144-
def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword):
190+
def handleTable(table, createFk, mbDbFile, dbConnectionParam):
145191
"""Handle the table including the post/pre processing."""
146192
keys = _getTableKeys(table)
147193
dbFile = mbDbFile if mbDbFile is not None else table + '.xml'
@@ -156,23 +202,6 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
156202
six.print_("Could not load pre/post/fk sql. Are you running from the correct path?", file=sys.stderr)
157203
sys.exit(-1)
158204

159-
dbConnectionParam = "dbname={}".format(dbname)
160-
161-
if mbPort is not None:
162-
dbConnectionParam += ' port={}'.format(mbPort)
163-
164-
if mbHost is not None:
165-
dbConnectionParam += ' host={}'.format(mbHost)
166-
167-
# TODO Is the escaping done here correct?
168-
if mbUsername is not None:
169-
dbConnectionParam += ' user={}'.format(mbUsername)
170-
171-
# TODO Is the escaping done here correct?
172-
if mbPassword is not None:
173-
dbConnectionParam += ' password={}'.format(mbPassword)
174-
175-
176205
try:
177206
with pg.connect(dbConnectionParam) as conn:
178207
with conn.cursor() as cur:
@@ -199,7 +228,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
199228
' VALUES\n' + valuesStr + ';'
200229
cur.execute(cmd)
201230
conn.commit()
202-
six.print_('Table processing took {1:.1f} seconds'.format(table, time.time() - start_time))
231+
six.print_('Table \'{0}\' processing took {1:.1f} seconds'.format(table, time.time() - start_time))
203232

204233
# Post-processing (creation of indexes)
205234
start_time = time.time()
@@ -228,12 +257,32 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
228257
six.print_("Warning from the database.", file=sys.stderr)
229258
six.print_("pg.Warning: {0}".format(str(w)), file=sys.stderr)
230259

260+
261+
def moveTableToSchema(table, schemaName, dbConnectionParam):
    """Move a table into the given schema, creating the schema if needed.

    table             -- name of the table to move
    schemaName        -- name of the target schema
    dbConnectionParam -- connection string passed to ``pg.connect``

    Both names are concatenated into the SQL text as given. Database errors
    and warnings are reported on stderr and are not re-raised.
    """
    try:
        with pg.connect(dbConnectionParam) as connection, connection.cursor() as cursor:
            # make sure the target schema exists
            createSchemaSql = 'CREATE SCHEMA IF NOT EXISTS ' + schemaName + ';'
            cursor.execute(createSchemaSql)
            connection.commit()

            # then relocate the table into it
            moveTableSql = 'ALTER TABLE ' + table + ' SET SCHEMA ' + schemaName + ';'
            cursor.execute(moveTableSql)
            connection.commit()
    except pg.Error as dbError:
        for message in (
            "Error in dealing with the database.",
            "pg.Error ({0}): {1}".format(dbError.pgcode, dbError.pgerror),
            str(dbError),
        ):
            six.print_(message, file=sys.stderr)
    except pg.Warning as dbWarning:
        six.print_("Warning from the database.", file=sys.stderr)
        six.print_("pg.Warning: {0}".format(str(dbWarning)), file=sys.stderr)
278+
231279
#############################################################
232280

233281
parser = argparse.ArgumentParser()
234-
parser.add_argument( 'table'
282+
parser.add_argument( '-t', '--table'
235283
, help = 'The table to work on.'
236284
, choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes', 'PostLinks', 'PostHistory', 'Comments']
285+
, default = None
237286
)
238287

239288
parser.add_argument( '-d', '--dbname'
@@ -246,6 +295,16 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
246295
, default = None
247296
)
248297

298+
parser.add_argument( '-s', '--so-project'
299+
, help = 'stackexchange project to load.'
300+
, default = None
301+
)
302+
303+
parser.add_argument( '--archive-url'
304+
, help = 'URL of the archive directory to retrieve.'
305+
, default = 'https://ia800107.us.archive.org/27/items/stackexchange'
306+
)
307+
249308
parser.add_argument( '-u', '--username'
250309
, help = 'Username for the database.'
251310
, default = None
@@ -272,6 +331,11 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
272331
, default = False
273332
)
274333

334+
parser.add_argument( '-n', '--schema-name'
335+
, help = 'Use specific schema.'
336+
, default = 'public'
337+
)
338+
275339
parser.add_argument( '--foreign-keys'
276340
, help = 'Create foreign keys.'
277341
, action = 'store_true'
@@ -280,22 +344,69 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
280344

281345
args = parser.parse_args()
282346

283-
table = args.table
284-
285347
try:
286348
# Python 2/3 compatibility
287349
input = raw_input
288350
except NameError:
289351
pass
290352

353+
dbConnectionParam = buildConnectionString(args.dbname, args.host, args.port, args.username, args.password)
354+
355+
# load given file in table
356+
if args.file and args.table:
357+
table = args.table
358+
359+
if table == 'Posts':
360+
# If the user has not explicitly asked for loading the body, we replace it with NULL
361+
if not args.with_post_body:
362+
specialRules[('Posts', 'Body')] = 'NULL'
363+
364+
choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table))
365+
if len(choice) > 0 and choice[0].lower() == 'y':
366+
handleTable(table, args.foreign_keys, args.file, dbConnectionParam)
367+
else:
368+
six.print_("Cancelled.")
369+
if args.schema_name != 'public':
370+
moveTableToSchema(table, args.schema_name, dbConnectionParam)
371+
exit(0)
372+
373+
# load a project
374+
elif args.so_project:
375+
import urllib.request
376+
import libarchive
377+
378+
# download the 7z archive in /tmp
379+
file_name = args.so_project + '.stackexchange.com.7z'
380+
url = '{0}/{1}'.format(args.archive_url, file_name)
381+
filepath = '/tmp/'+file_name
382+
six.print_('Downloading the archive, please be patient ...')
383+
try:
384+
urllib.request.urlretrieve(url, filepath, show_progress)
385+
except Exception as e:
386+
six.print_('Error: impossible to download the {0} archive ({1})'.format(url, e))
387+
exit(1)
291388

292-
if table == 'Posts':
293-
# If the user has not explicitly asked for loading the body, we replace it with NULL
294-
if not args.with_post_body:
295-
specialRules[('Posts', 'Body')] = 'NULL'
389+
try:
390+
libarchive.extract_file(filepath)
391+
except Exception as e:
392+
six.print_('Error: impossible to extract the {0} archive ({1})'.format(url, e))
393+
exit(1)
394+
395+
tables = [ 'Tags', 'Users', 'Badges', 'Posts', 'Comments', 'Votes', 'PostLinks', 'PostHistory' ]
396+
397+
for table in tables:
398+
six.print_('Load {0}.xml file'.format(table))
399+
handleTable(table, args.foreign_keys, args.file, dbConnectionParam)
400+
# remove file
401+
os.remove(table+'.xml')
402+
# remove archive
403+
os.remove(filepath)
404+
405+
if args.schema_name != 'public':
406+
for table in tables:
407+
moveTableToSchema(table, args.schema_name, dbConnectionParam)
408+
exit(0)
296409

297-
choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table))
298-
if len(choice) > 0 and choice[0].lower() == 'y':
299-
handleTable(table, keys, args.dbname, args.file, args.host, args.port, args.username, args.password)
300410
else:
301-
six.print_("Cancelled.")
411+
six.print_("Error: you must either use '-f' and '-t' arguments or the '-s' argument.")
412+
parser.print_help()

0 commit comments

Comments
 (0)