From eb674adaccc2fdb5c65e1396a5bc551c247194a4 Mon Sep 17 00:00:00 2001
From: Manos Tsagkias
Date: Wed, 25 Feb 2015 15:13:04 +0100
Subject: [PATCH 1/3] Refactored parse_wikidump to allow programmatic access
 to its functionality, e.g., initialize the db and initiate downloads via
 code.

---
 semanticizest/parse_wikidump/__main__.py | 121 ++++++++++++++---------
 1 file changed, 75 insertions(+), 46 deletions(-)

diff --git a/semanticizest/parse_wikidump/__main__.py b/semanticizest/parse_wikidump/__main__.py
index 3a01789..c23dd8c 100644
--- a/semanticizest/parse_wikidump/__main__.py
+++ b/semanticizest/parse_wikidump/__main__.py
@@ -17,10 +17,12 @@
 import re
 import sqlite3
 import sys
+import errno
 
 from six.moves.urllib.error import HTTPError
 from six.moves.urllib.request import urlretrieve
+import argparse
 from docopt import docopt
 
 from . import parse_dump
@@ -43,6 +45,41 @@ def __call__(self, n_blocks, blocksize, totalsize):
         self.threshold += .05
 
 
+class Db(object):
+    def __init__(self, fname):
+        self.db_fname = fname
+        self.db = ""
+
+    def connect(self):
+        try:
+            self.db = sqlite3.connect(self.db_fname)
+        except sqlite3.OperationalError as e:
+            if 'unable to open' in str(e):
+                # This exception doesn't store the path.
+                die("%s: %r" % (e, self.db_fname))
+            else:
+                raise
+
+    def disconnect(self):
+        if self.db:
+            self.db.close()
+
+    def setup(self):
+        logger.info("Creating database at %r" % self.db_fname)
+        with open(createtables_path()) as f:
+            create = f.read()
+
+        c = self.db.cursor()
+        try:
+            c.executescript(create)
+        except sqlite3.OperationalError as e:
+            if re.search(r'table .* already exists', str(e)):
+                die("database %r already populated" % self.db_fname)
+            else:
+                raise
+
+
 DUMP_TEMPLATE = (
     "https://dumps.wikimedia.org/{0}/latest/{0}-latest-pages-articles.xml.bz2")
@@ -52,53 +89,45 @@ def die(msg):
     sys.exit(1)
 
 
-def main(args):
-    args = docopt(__doc__, args)
-
-    if args["--download"]:
-        wikidump = args["--download"] + ".xml.bz2"
-    else:
-        wikidump = args['']
-
-    model_fname = args['']
-    ngram = args['--ngram']
-    if ngram == "None":
-        ngram = None
-    else:
-        ngram = int(ngram)
-
-    logger.info("Creating database at %r" % model_fname)
-    try:
-        db = sqlite3.connect(model_fname)
-    except sqlite3.OperationalError as e:
-        if 'unable to open' in str(e):
-            # This exception doesn't store the path.
-            die("%s: %r" % (e, model_fname))
-        else:
-            raise
-    with open(createtables_path()) as f:
-        create = f.read()
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(prog="semanticizest.parse_wikidump",
+                                     description="Semanticizest Wiki parser")
+    parser.add_argument('snapshot',
+                        help='Local Wikipedia snapshot to use.')
+    parser.add_argument('model',
+                        help='File to store the model.')
+    parser.add_argument('--download', dest='download', action="store_true",
+                        help='Download snapshot if it does not exist as snapshot.xml.bz2. '
+                             'The corpus file name should match that of snapshot.')
+    parser.add_argument('-N', '--ngram', dest='ngram', default=7, type=int,
+                        help='Maximum order of ngrams, set to None to disable [default: 7].')
+    args = parser.parse_args()
+
     try:
+        fh = open(args.snapshot, 'r')
+    except (IOError, OSError) as e:
+        if e.errno == errno.ENOENT and args.download:
+            m = re.match(r"(.+?)\.xml", args.snapshot)
+            if m:
+                args.snapshot = m.group(1)
+            url = DUMP_TEMPLATE.format(args.snapshot)
+            print(url)
+            args.snapshot = args.snapshot + ".xml.bz2"
+            try:
+                urlretrieve(url, args.snapshot, Progress())
+            except HTTPError as e:
+                die("Cannot download {0!r}: {1}".format(url, e))
         else:
             raise
-
-    if args["--download"]:
-        url = DUMP_TEMPLATE.format(args["--download"])
-        logger.info("Saving wikidump to %r", wikidump)
-        try:
-            urlretrieve(url, wikidump, Progress())
-        except HTTPError as e:
-            die("Cannot download {0!r}: {1}".format(url, e))
-
-    parse_dump(wikidump, db, N=ngram)
-    db.close()
-
-
-if __name__ == '__main__':
-    main(sys.argv[1:])
+    else:
+        fh.close()
+
+    # Init, connect to DB and setup db schema
+    db = Db(args.model)
+    db.connect()
+    db.setup()
+
+    # Parse wiki snapshot and store it to DB
+    parse_dump(args.snapshot, db.db, N=args.ngram)
+
+    # Close connection to DB and exit
+    db.disconnect()
+

From 79c76e89de4d2332641c41f8edd7ed43168e29c2 Mon Sep 17 00:00:00 2001
From: Manos Tsagkias
Date: Wed, 25 Feb 2015 15:43:23 +0100
Subject: [PATCH 2/3] Changed __doc__

---
 semanticizest/parse_wikidump/__main__.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/semanticizest/parse_wikidump/__main__.py b/semanticizest/parse_wikidump/__main__.py
index c23dd8c..11a6fa7 100644
--- a/semanticizest/parse_wikidump/__main__.py
+++ b/semanticizest/parse_wikidump/__main__.py
@@ -1,15 +1,9 @@
-"""parse_wikidump
-
-Usage:
-    parse_wikidump [options]
-    parse_wikidump --download=
-
-Options:
-    --download=wikiname    Download dump from dumps.wikimedia.org first
-    --ngram=, -N
-        Maximum order of ngrams, set to None to disable
-        [default: 7]
-    --help, -h             This help
+"""
+Parse Wikidump
+
+Reads in a Wikipedia snapshot file, or downloads it if it doesn't exist
+locally. Then it attempts to parse it and store it in an SQLite 3 database,
+which it first initializes.
 """
 
 from __future__ import print_function

From 251671b4232e87a6aff80a25f612fa67494dc8c7 Mon Sep 17 00:00:00 2001
From: Manos Tsagkias
Date: Wed, 25 Feb 2015 16:25:25 +0100
Subject: [PATCH 3/3] Moving logic from __main__ to __init__. We also need a
 "download()" function in __init__ that takes care of downloading.

---
 semanticizest/parse_wikidump/__init__.py | 35 ++++++++++++++++++++++
 semanticizest/parse_wikidump/__main__.py | 37 +-----------------------
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/semanticizest/parse_wikidump/__init__.py b/semanticizest/parse_wikidump/__init__.py
index e3d952a..bd67433 100644
--- a/semanticizest/parse_wikidump/__init__.py
+++ b/semanticizest/parse_wikidump/__init__.py
@@ -338,3 +338,38 @@ def parse_dump(dump, db, N=7, sentence_splitter=None, tokenizer=None):
     _logger.info("Dump parsing done: processed %d articles", i)
 
     db.commit()
+
+
+class Db(object):
+    def __init__(self, fname):
+        self.db_fname = fname
+        self.db = ""
+
+    def connect(self):
+        try:
+            self.db = sqlite3.connect(self.db_fname)
+        except sqlite3.OperationalError as e:
+            if 'unable to open' in str(e):
+                # This exception doesn't store the path.
+                die("%s: %r" % (e, self.db_fname))
+            else:
+                raise
+
+    def disconnect(self):
+        if self.db:
+            self.db.close()
+
+    def setup(self):
+        logger.info("Creating database at %r" % self.db_fname)
+        with open(createtables_path()) as f:
+            create = f.read()
+
+        c = self.db.cursor()
+        try:
+            c.executescript(create)
+        except sqlite3.OperationalError as e:
+            if re.search(r'table .* already exists', str(e)):
+                die("database %r already populated" % self.db_fname)
+            else:
+                raise
diff --git a/semanticizest/parse_wikidump/__main__.py b/semanticizest/parse_wikidump/__main__.py
index 11a6fa7..5743827 100644
--- a/semanticizest/parse_wikidump/__main__.py
+++ b/semanticizest/parse_wikidump/__main__.py
@@ -19,7 +19,7 @@
 import argparse
 from docopt import docopt
 
-from . import parse_dump
+from . import parse_dump, Db
 from .._semanticizer import createtables_path
 
 
@@ -39,41 +39,6 @@ def __call__(self, n_blocks, blocksize, totalsize):
         self.threshold += .05
 
 
-class Db(object):
-    def __init__(self, fname):
-        self.db_fname = fname
-        self.db = ""
-
-    def connect(self):
-        try:
-            self.db = sqlite3.connect(self.db_fname)
-        except sqlite3.OperationalError as e:
-            if 'unable to open' in str(e):
-                # This exception doesn't store the path.
-                die("%s: %r" % (e, self.db_fname))
-            else:
-                raise
-
-    def disconnect(self):
-        if self.db:
-            self.db.close()
-
-    def setup(self):
-        logger.info("Creating database at %r" % self.db_fname)
-        with open(createtables_path()) as f:
-            create = f.read()
-
-        c = self.db.cursor()
-        try:
-            c.executescript(create)
-        except sqlite3.OperationalError as e:
-            if re.search(r'table .* already exists', str(e)):
-                die("database %r already populated" % self.db_fname)
-            else:
-                raise
-
-
 DUMP_TEMPLATE = (
     "https://dumps.wikimedia.org/{0}/latest/{0}-latest-pages-articles.xml.bz2")
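
Note (not part of the series): the commit message above asks for a download() helper in
__init__ and motivates the refactoring with programmatic access. The sketch below shows how
that could look once these patches apply, assuming Db and parse_dump are importable from
semanticizest.parse_wikidump as patched; the download() function, the example wiki name and
the file names are illustrative only and do not exist in the code base.

# Sketch only: drive the parser from code instead of the command line.
import os

from six.moves.urllib.request import urlretrieve

from semanticizest.parse_wikidump import Db, parse_dump

DUMP_TEMPLATE = (
    "https://dumps.wikimedia.org/{0}/latest/{0}-latest-pages-articles.xml.bz2")


def download(wikiname, dest=None):
    """Fetch the latest pages-articles dump for wikiname (e.g. 'nlwiki').

    Skips the download when the file is already present and returns the
    local path of the .xml.bz2 snapshot.
    """
    dest = dest or wikiname + ".xml.bz2"
    if not os.path.exists(dest):
        urlretrieve(DUMP_TEMPLATE.format(wikiname), dest)
    return dest


if __name__ == '__main__':
    snapshot = download("nlwiki")

    db = Db("nlwiki.sqlite3")         # initialize the model database
    db.connect()
    db.setup()                        # create the schema
    parse_dump(snapshot, db.db, N=7)  # parse the snapshot into the model
    db.disconnect()

Passing db.db (the raw sqlite3 connection) to parse_dump keeps parse_dump working on a plain
connection object, which is what its current signature expects.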