diff --git a/mwrefs/bibs/__main__.py b/mwrefs/bibs/__main__.py
new file mode 100644
index 0000000..b9b65dd
--- /dev/null
+++ b/mwrefs/bibs/__main__.py
@@ -0,0 +1,116 @@
+import argparse
+import subprocess
+import codecs
+import os
+
+import mw.xml_dump
+import mwxml
+import pathlib
+
+from . import utils, processors
+
+
+def open_xml_file(path):
+ f = mw.xml_dump.functions.open_file(
+ mw.xml_dump.functions.file(path)
+ )
+ return f
+
+
+def compressor_7z(file_path):
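+ # Pipe the output through 7z: '-si' makes the archiver read the stream
+ # from stdin and write the archive to file_path. The returned handle
+ # accepts str and encodes it as UTF-8 into the pipe.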
+ p = subprocess.Popen(
+ ['7z', 'a', '-si', file_path],
+ stdin=subprocess.PIPE,
+ stderr=subprocess.DEVNULL,
+ stdout=subprocess.DEVNULL,
+ )
+ utf8writer = codecs.getwriter('utf-8')
+
+ return utf8writer(p.stdin)
+
+
+def output_writer(path, compression):
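+ # Choose the output sink: a 7z-compressed stream or a plain UTF-8 file.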
+ if compression == '7z':
+ return compressor_7z(path + '.7z')
+ else:
+ return open(path, 'wt', encoding='utf-8')
+
+
+def create_path(path):
+ path = pathlib.Path(path)
+ path.parent.mkdir(parents=True, exist_ok=True)
+
+
+def get_args():
+ parser = argparse.ArgumentParser(
+ prog='wikidump',
+ description='Wikidump features extractor.',
+ )
+ parser.add_argument('files',
+ metavar='FILE',
+ type=pathlib.Path,
+ nargs='+',
+ help='XML Wikidump file to parse. Only 7z files are accepted.'
+ )
+ parser.add_argument('output_dir_path',
+ metavar='OUTPUT_DIR',
+ type=pathlib.Path,
+ help='XML output directory.',
+ )
+ parser.add_argument('--output-compression',
+ choices={None, '7z'},
+ required=False,
+ default=None,
+ help='Output compression format.',
+ )
+ parser.add_argument('--dry-run', '-n',
+ action='store_true',
+ help="Don't write any file",
+ )
+
+ subparsers = parser.add_subparsers(help='sub-commands help')
+ processors.bibliography_extractor.configure_subparsers(subparsers)
+ processors.identifiers_extractor.configure_subparsers(subparsers)
+ processors.sections_counter.configure_subparsers(subparsers)
+
+ parsed_args = parser.parse_args()
+ if 'func' not in parsed_args:
+ parser.print_usage()
+ parser.exit(1)
+
+ return parsed_args
+
+
+def main():
+ args = get_args()
+
+ args.output_dir_path.mkdir(parents=True, exist_ok=True)
+
+ for input_file_path in args.files:
+ utils.log("Analyzing {}...".format(input_file_path))
+
+ dump = mwxml.Dump.from_file(open_xml_file(str(input_file_path)))
+
+ basename = input_file_path.name
+
+ if args.dry_run:
+ pages_output = open(os.devnull, 'wt')
+ stats_output = open(os.devnull, 'wt')
+ else:
+ pages_output = output_writer(
+ path=str(args.output_dir_path/(basename + '.features.xml')),
+ compression=args.output_compression,
+ )
+ stats_output = output_writer(
+ path=str(args.output_dir_path/(basename + '.stats.xml')),
+ compression=args.output_compression,
+ )
+ args.func(dump,
+ pages_output,
+ stats_output,
+ args,
+ )
+
+
+if __name__ == '__main__':
+ main()
diff --git a/mwrefs/bibs/dumper.py b/mwrefs/bibs/dumper.py
new file mode 100644
index 0000000..7e78a8b
--- /dev/null
+++ b/mwrefs/bibs/dumper.py
@@ -0,0 +1,99 @@
+import mako.runtime
+import mako.template
+
+pages_revisions_template = '''
+<%!
+ from itertools import groupby
+ def groupby_action(diff):
+ return groupby(diff, lambda d: d.action)
+%>
+
+ % for page in pages:
+
+ ${page.title}
+ ${page.id}
+
+ % for revision in page.revisions:
+
+ ${revision.id}
+
+ ${revision.timestamp}
+
+ % for key, group in groupby_action(revision.references_diff):
+
+ % for _, text in group:
+ ${text}
+ % endfor
+
+ % endfor
+
+
+ % for key, group in groupby_action(revision.publication_identifiers_diff):
+
+ % for _, identifier in group:
+
+ % endfor
+
+ % endfor
+
+
+ % for section in revision.sections:
+
+ % endfor
+
+ ${revision.bibliography}
+
+ %endfor
+
+
+ % endfor
+
+'''
+
+stats_template = '''
+
+
+ ${stats['performance']['start_time']}
+ ${stats['performance']['end_time']}
+ ${stats['performance']['revisions_analyzed']}
+ ${stats['performance']['pages_analyzed']}
+
+
+ % for key in ['global', 'last_revision']:
+ <${key}>
+ % for where, count in stats['identifiers'][key].items():
+
+ % endfor
+ </${key}>
+ % endfor
+
+
+'''
+
+
+def render_template(template, output_handler, default_filters=None, **kwargs):
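+ # Render a Mako template straight into output_handler via a runtime
+ # Context, so the result is streamed instead of built up in memory.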
+ ctx = mako.runtime.Context(output_handler, **kwargs)
+
+ xml_template = mako.template.Template(
+ template,
+ default_filters=default_filters,
+ )
+ xml_template.render_context(ctx)
+
+
+def serialize_page_revisions(pages, output_handler):
+ render_template(
+ pages_revisions_template,
+ output_handler,
+ default_filters=['x'], # XML escaping
+ pages=pages,
+ )
+
+
+def serialize_stats(stats, output_handler):
+ render_template(
+ stats_template,
+ output_handler,
+ default_filters=['x'], # XML escaping
+ stats=stats,
+ )
diff --git a/mwrefs/bibs/languages.py b/mwrefs/bibs/languages.py
new file mode 100644
index 0000000..655e06f
--- /dev/null
+++ b/mwrefs/bibs/languages.py
@@ -0,0 +1,82 @@
+supported = {'en', 'it'}
+
+bibliography = {
+ 'en': {
+ 'bibliography',
+ 'references',
+ 'reference',
+ 'further reading',
+ 'notes',
+ 'sources',
+ 'footnotes',
+ 'citations',
+ 'publications',
+ 'publication history',
+ 'literature',
+ },
+ 'it': {'bibliografia'},
+}
+
+citation = {
+ 'en': {'Citation', 'cite', 'vcite'},
+}
+
+"""
+What I mean by:
+* References: a section containing footnotes for works cited in the text.
+* Bibliography: a section containing articles and journals.
+* Further reading: like `Bibliography`, but contains references not used in the text.
+* Footnotes: a section containing explanations of concepts.
+
+From now on, words in backquotes (`) are to be interpreted as concepts using the above definitions, while words in double quotes (") are to be interpreted as terms found in the text of the articles.
+
+"References" (term) is commonly used as `Bibliography` (concept), i.e. articles and journals without backref to the text.
+And, of course, "Bibliography" (term) is sometimes used as `References` (concept).
+* https://en.wikipedia.org/w/index.php?title=Anabaptists&oldid=49953891 "References" interpreted as `Bibliography`
+* https://en.wikipedia.org/w/index.php?title=Alcopop&oldid=296736852 "References" interpreted as `Bibliography`
+* https://en.wikipedia.org/w/index.php?title=Amu%20Darya&oldid=66374611 "References" interpreted as `Bibliography`
+
+"Citations" (term) sometimes used as synonym for "References" or "Bibliography" (terms):
+* https://en.wikipedia.org/w/index.php?title=Augustine_of_Canterbury&oldid=676642624 "Citations" used as `References`, "References" used as `Bibliography`
+* https://en.wikipedia.org/w/index.php?title=Anemometer&oldid=674186492#Citations "Citations" used as `References`
+
+"Notes and References" and "References and Notes" (terms) are used as synonyms for "References" (term):
+* https://en.wikipedia.org/w/index.php?title=Ackermann%20function&oldid=335603599#Notes_and_references "Notes and References" converted to "References" (term) and interpreted as `References`
+* https://en.wikipedia.org/w/index.php?title=albanians&oldid=391045161#Notes_and_references "Notes and References" is a wrapper around "Notes" (interpreted as `footnotes`) and "References" (interpreted as `References`)
+* https://en.wikipedia.org/w/index.php?title=assassination&oldid=678057527#Notes_and_references interpreted as `References`
+
+"Sources" seems to be interpreted as `Bibliography` or `References`, and sometimes then converted by users to "References" or "Bibliography"
+* https://en.wikipedia.org/w/index.php?title=artemis&diff=next&oldid=565871969 "Sources" has been converted to "References and sources"
+* https://en.wikipedia.org/w/index.php?title=Amakusa&direction=next&oldid=667294099 "Sources" used as `Bibliography`
+* https://en.wikipedia.org/w/index.php?title=A%20Doll's%20House&oldid=676505492#Sources "Sources" used as `Bibliography`
+* https://en.wikipedia.org/w/index.php?title=A.%20E.%20Housman&diff=next&oldid=678259900#Sources "Sources" used as `Bibliography`
+
+"Footnotes" is commonly interpreted as `References`, with the following terms: "References" and "Citations"
+* https://en.wikipedia.org/w/index.php?title=Augustine%20of%20Canterbury&oldid=459457206#Footnotes "Footnotes" is used as `References`; "Footnotes" is then converted to "Citations", used as `References`
+* https://en.wikipedia.org/w/index.php?title=Amoxicillin&diff=next&oldid=423375138 "Footnotes" used as and converted to `References`
+* https://en.wikipedia.org/w/index.php?title=Anabaptists&oldid=49953891#Footnotes_and_references "Footnotes" interpreted as `References`. The next revision converts "Footnotes" to "Footnotes and References".
+* https://en.wikipedia.org/w/index.php?title=Alcopop&oldid=296736852#Footnotes "Footnotes" used as `References`
+* https://en.wikipedia.org/w/index.php?title=Archaeopteryx&diff=next&oldid=326796096 "Footnotes" interpreted as and then converted to `References` (term and concept)
+* https://en.wikipedia.org/w/index.php?title=Al%20Capp&oldid=590148186#Footnotes "Footnotes" interpreted as `References`. It is then converted to "Notes"
+* https://en.wikipedia.org/w/index.php?title=Amu%20Darya&oldid=66374611#Footnotes "Footnotes" interpreted as `References`. Later converted to "Notes"
+* https://en.wikipedia.org/w/index.php?title=Albert%20Brooks&oldid=150996845#Footnotes "Footnotes" used as and then converted to `References` (term and concept)
+
+"Literature" is used most of the times as a subsection for things like "Culture", and in some cases is a replacement for "bibliography":
+* https://en.wikipedia.org/w/index.php?title=Alexandria&oldid=678355005 "Literature" used as subsection of "Culture"
+* https://en.wikipedia.org/w/index.php?title=Bible&oldid=23508742#Literature "Literature" used as `Bibliography`
+* https://en.wikipedia.org/w/index.php?title=Board_game&oldid=7131437#Literature "Literature" used as "Bibliography", then converted to "References" (used as "Bibliography")
+* https://en.wikipedia.org/w/index.php?title=Ahuitzotl&oldid=118183827 "Literature" interpreted as `Bibliography`
+
+"Publications" and "Publication history" are used as a subsection for the "Biography" with the works of the person described.
+
+"Reference" is almost always converted to "References" in a successive revision.
+
+
+"Notes" is sometimes interpreted as `References` or `Footnotes`
+* https://en.wikipedia.org/w/index.php?title=Ahuitzotl&oldid=118183827 "Notes" used as `Footnotes`
+* https://en.wikipedia.org/w/index.php?title=Archaeoastronomy&oldid=678777218#Notes "Notes" used as `References`
+* https://en.wikipedia.org/w/index.php?title=Alexander_of_Hales&oldid=661215939#Other_historical_works "Notes" interpreted as `References`
+
+"See also" and "Related pages" usually contain links to other wikipedia pages.
+"""
+
diff --git a/mwrefs/bibs/processors/__init__.py b/mwrefs/bibs/processors/__init__.py
new file mode 100644
index 0000000..d613340
--- /dev/null
+++ b/mwrefs/bibs/processors/__init__.py
@@ -0,0 +1 @@
+from . import bibliography_extractor, identifiers_extractor, sections_counter
diff --git a/mwrefs/bibs/processors/bibliography_extractor.py b/mwrefs/bibs/processors/bibliography_extractor.py
new file mode 100644
index 0000000..8285a65
--- /dev/null
+++ b/mwrefs/bibs/processors/bibliography_extractor.py
@@ -0,0 +1,202 @@
+import collections
+import functools
+import datetime
+
+import more_itertools
+import fuzzywuzzy.process
+
+from .. import utils, extractors, dumper, languages
+
+FUZZY_MATCH_CUTOFF = 91  # between 0 and 100
+
+features_template = '''
+<%!
+ from itertools import groupby
+ def groupby_action(diff):
+ return groupby(diff, lambda d: d.action)
+%>
+<%def name="attribute_if_exists(name, text)" filter="trim">
+ % if text is not None:
+ ${name}="${text | x}"
+ % endif
+</%def>
+<%def name="tag_user_if_exists(user)" filter="trim">
+ % if user:
+
+ % endif
+</%def>
+
+ % for page in pages:
+
+ ${page.title | x}
+ ${page.id | x}
+
+ % for revision in page.revisions:
+
+ ${revision.id | x}
+ ${tag_user_if_exists(revision.user)}
+ ${revision.timestamp | x}
+
+ % for section in revision.sections:
+
+ % endfor
+
+
+ % endfor
+
+
+ % endfor
+
+'''
+
+stats_template = '''
+
+
+ ${stats['performance']['start_time'] | x}
+ ${stats['performance']['end_time'] | x}
+ ${stats['performance']['revisions_analyzed'] | x}
+ ${stats['performance']['pages_analyzed'] | x}
+
+
+ % for key in ['global', 'last_revision']:
+ <${key}>
+ % for section_name, count in stats['section_names'][key].most_common():
+
+ % endfor
+ </${key}>
+ % endfor
+
+
+'''
+
+Page = collections.namedtuple('Page', [
+ 'id',
+ 'title',
+ 'revisions',
+])
+Revision = collections.namedtuple('Revision', [
+ 'id',
+ 'user',
+ 'timestamp',
+ 'sections',
+])
+
+
+# TODO: instead of comparing section_name to a bib synonym,
+# search all the possible bib synonyms in the section name
+@functools.lru_cache(maxsize=500)
+def is_section_bibliography(section_name, language, score_cutoff=FUZZY_MATCH_CUTOFF):
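+ # Fuzzy-match the section title against the bibliography synonyms for the
+ # given language; extractOne returns None when no synonym reaches score_cutoff.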
+ bibliography_synonyms = languages.bibliography[language]
+ match = fuzzywuzzy.process.extractOne(
+ section_name,
+ bibliography_synonyms,
+ score_cutoff=score_cutoff,
+ )
+ return bool(match)
+
+
+def extract_revisions(mw_page, language, stats, only_last_revision):
+ section_names_stats = stats['section_names']
+ revisions = more_itertools.peekable(mw_page)
+ for mw_revision in revisions:
+ utils.dot()
+
+ is_last_revision = not utils.has_next(revisions)
+ if only_last_revision and not is_last_revision:
+ continue
+
+ text = utils.remove_comments(mw_revision.text or '')
+
+ sections = (section for section, _ in extractors.sections(text))
+
+ bibliography_sections = list(section
+ for section in sections
+ if is_section_bibliography(section.name, language))
+
+ for section in bibliography_sections:
+ section_names_stats['global'][section.name] += 1
+ if is_last_revision:
+ section_names_stats['last_revision'][section.name] += 1
+
+ yield Revision(
+ id=mw_revision.id,
+ user=mw_revision.user,
+ timestamp=mw_revision.timestamp.to_json(),
+ sections=bibliography_sections,
+ )
+
+ stats['performance']['revisions_analyzed'] += 1
+
+
+def extract_pages(dump, language, stats, only_last_revision):
+ for mw_page in dump:
+ utils.log("Processing", mw_page.title)
+
+ # Skip non-articles
+ if mw_page.namespace != 0:
+ utils.log('Skipped (namespace != 0)')
+ continue
+
+ revisions_generator = extract_revisions(
+ mw_page,
+ language=language,
+ stats=stats,
+ only_last_revision=only_last_revision,
+ )
+
+ yield Page(
+ id=mw_page.id,
+ title=mw_page.title,
+ revisions=revisions_generator,
+ )
+ stats['performance']['pages_analyzed'] += 1
+
+
+def configure_subparsers(subparsers):
+ parser = subparsers.add_parser('extract-bibliography',
+ help='Extract only sections that may be a bibliography')
+ parser.add_argument('-l', '--language',
+ choices=languages.supported,
+ required=True,
+ help='The language of the dump.',
+ )
+ parser.add_argument('--only-last-revision',
+ action='store_true',
+ help='Consider only the last revision for each page.',
+ )
+ parser.set_defaults(func=main)
+
+
+def main(dump, features_output_h, stats_output_h, args):
+ stats = {
+ 'performance': {
+ 'start_time': None,
+ 'end_time': None,
+ 'revisions_analyzed': 0,
+ 'pages_analyzed': 0,
+ },
+ 'section_names': {
+ 'global': collections.Counter(),
+ 'last_revision': collections.Counter(),
+ },
+ }
+ pages_generator = extract_pages(dump,
+ language=args.language,
+ stats=stats,
+ only_last_revision=args.only_last_revision,
+ )
+ with features_output_h:
+ stats['performance']['start_time'] = datetime.datetime.utcnow()
+ dumper.render_template(
+ features_template,
+ output_handler=features_output_h,
+ pages=pages_generator,
+ )
+ stats['performance']['end_time'] = datetime.datetime.utcnow()
+
+ with stats_output_h:
+ dumper.render_template(
+ stats_template,
+ stats_output_h,
+ stats=stats,
+ )
diff --git a/mwrefs/bibs/processors/identifiers_extractor.py b/mwrefs/bibs/processors/identifiers_extractor.py
new file mode 100644
index 0000000..9ba5961
--- /dev/null
+++ b/mwrefs/bibs/processors/identifiers_extractor.py
@@ -0,0 +1,220 @@
+import collections
+import datetime
+import more_itertools
+
+from .. import utils, extractors, dumper
+
+features_template = '''
+<%!
+ from itertools import groupby
+ def groupby_action(diff):
+ return groupby(diff, lambda d: d.action)
+%>
+<%def name="attribute_if_exists(name, text)" filter="trim">
+ % if text is not None:
+ ${name}="${text | x}"
+ % endif
+</%def>
+<%def name="tag_user_if_exists(user)" filter="trim">
+ % if user:
+
+ % endif
+</%def>
+
+ % for page in pages:
+
+ ${page.title | x}
+ ${page.id | x}
+
+ % for revision in page.revisions:
+
+ ${revision.id | x}
+ ${tag_user_if_exists(revision.user)}
+ ${revision.timestamp | x}
+
+ % for key, group in groupby_action(revision.publication_identifiers_diff):
+
+ % for _, identifier in group:
+
+ % endfor
+
+ % endfor
+
+
+ %endfor
+
+
+ % endfor
+
+'''
+
+stats_template = '''
+
+
+ ${stats['performance']['start_time']}
+ ${stats['performance']['end_time']}
+ ${stats['performance']['revisions_analyzed']}
+ ${stats['performance']['pages_analyzed']}
+
+
+ % for key in ['global', 'last_revision']:
+ <${key}>
+ % for where, count in stats['identifiers'][key].items():
+
+ % endfor
+ </${key}>
+ % endfor
+
+
+'''
+
+Page = collections.namedtuple('Page', [
+ 'id',
+ 'title',
+ 'revisions',
+])
+Revision = collections.namedtuple('Revision', [
+ 'id',
+ 'user',
+ 'timestamp',
+ 'publication_identifiers_diff',
+])
+
+def IdentifierStatsDict():
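+ # Counters for where an identifier appears: only in the raw text, only
+ # inside a ref tag, only inside a citation template, or in both a ref tag
+ # and a template.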
+ return {
+ 'only_in_raw_text': 0,
+ 'only_in_tag_ref': 0,
+ 'only_in_template': 0,
+ 'in_tag_ref_and_template': 0,
+ }
+
+
+@utils.listify(wrapper=set)
+def where_appears(span, **spans):
+ span_le = extractors.Span.__le__
+ for key, span_list in spans.items():
+ # if any(span <= other_span for other_span in span_list):
+ # HACK: the following is more efficient. Sorry :(
+ if any(span_le(span, other_span) for other_span in span_list):
+ yield key
+
+
+def identifier_appearance_stat_key(appearances):
+ if {'templates', 'references'} <= appearances:
+ return 'in_tag_ref_and_template'
+ elif 'templates' in appearances:
+ return 'only_in_template'
+ elif 'references' in appearances:
+ return 'only_in_tag_ref'
+ else:
+ return 'only_in_raw_text'
+
+
+def extract_revisions(page, stats, only_last_revision):
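+ # Walk the revisions in order, remembering the identifiers found in the
+ # previous revision so each yielded Revision carries an added/removed diff.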
+ revisions = more_itertools.peekable(page)
+
+ prev_identifiers = set()
+ for mw_revision in revisions:
+ utils.dot()
+
+ is_last_revision = not utils.has_next(revisions)
+ if only_last_revision and not is_last_revision:
+ continue
+
+ text = utils.remove_comments(mw_revision.text or '')
+
+ references_captures = list(extractors.references(text))
+
+ templates_captures = list(extractors.templates(text))
+
+ identifiers_captures = list(extractors.pub_identifiers(text))
+ identifiers = [identifier for identifier, _ in identifiers_captures]
+
+ for identifier, span in identifiers_captures:
+ appearances = where_appears(span,
+ references=(span for _, span in references_captures),
+ templates=(span for _, span in templates_captures),
+ )
+ key_to_increment = identifier_appearance_stat_key(appearances)
+
+ stats['identifiers']['global'][key_to_increment] += 1
+ if is_last_revision:
+ stats['identifiers']['last_revision'][key_to_increment] += 1
+
+ yield Revision(
+ id=mw_revision.id,
+ user=mw_revision.user,
+ timestamp=mw_revision.timestamp.to_json(),
+ publication_identifiers_diff=utils.diff(prev_identifiers,
+ identifiers),
+ )
+
+ stats['performance']['revisions_analyzed'] += 1
+ prev_identifiers = identifiers
+
+
+def extract_pages(dump, stats, only_last_revision):
+ for mw_page in dump:
+ utils.log("Processing", mw_page.title)
+
+ # Skip non-articles
+ if mw_page.namespace != 0:
+ utils.log('Skipped (namespace != 0)')
+ continue
+
+ revisions_generator = extract_revisions(
+ mw_page,
+ stats=stats,
+ only_last_revision=only_last_revision,
+ )
+
+ yield Page(
+ id=mw_page.id,
+ title=mw_page.title,
+ revisions=revisions_generator,
+ )
+ stats['performance']['pages_analyzed'] += 1
+
+
+def configure_subparsers(subparsers):
+ parser = subparsers.add_parser('extract-identifiers',
+ help='Extract the identifiers from the text (doi, isbn, arxiv and pubmed).')
+ parser.add_argument('--only-last-revision',
+ action='store_true',
+ help='Consider only the last revision for each page.',
+ )
+ parser.set_defaults(func=main)
+
+
+def main(dump, features_output_h, stats_output_h, args):
+ stats = {
+ 'performance': {
+ 'start_time': None,
+ 'end_time': None,
+ 'revisions_analyzed': 0,
+ 'pages_analyzed': 0,
+ },
+ 'identifiers': {
+ 'global': IdentifierStatsDict(),
+ 'last_revision': IdentifierStatsDict(),
+ },
+ }
+ pages_generator = extract_pages(dump,
+ stats=stats,
+ only_last_revision=args.only_last_revision,
+ )
+ with features_output_h:
+ stats['performance']['start_time'] = datetime.datetime.utcnow()
+ dumper.render_template(
+ features_template,
+ output_handler=features_output_h,
+ pages=pages_generator,
+ )
+ stats['performance']['end_time'] = datetime.datetime.utcnow()
+
+ with stats_output_h:
+ dumper.render_template(
+ stats_template,
+ stats_output_h,
+ stats=stats,
+ )
diff --git a/mwrefs/bibs/processors/sections_counter.py b/mwrefs/bibs/processors/sections_counter.py
new file mode 100644
index 0000000..1ae3141
--- /dev/null
+++ b/mwrefs/bibs/processors/sections_counter.py
@@ -0,0 +1,136 @@
+import collections
+import datetime
+
+import more_itertools
+
+from .. import utils, extractors, dumper
+
+
+stats_template = '''
+
+
+ ${stats['performance']['start_time']}
+ ${stats['performance']['end_time']}
+ ${stats['performance']['revisions_analyzed']}
+ ${stats['performance']['pages_analyzed']}
+
+
+ % for key in ['global', 'last_revision']:
+ <${key}>
+ % for section_name, count in stats['section_names_per_revision'][key].most_common():
+
+ % endfor
+ </${key}>
+ % endfor
+
+
+ % for key in ['global', 'last_revision']:
+ <${key}>
+ % for sections_in_revision, count in stats['sections_per_revision'][key].most_common():
+
+ % endfor
+ </${key}>
+ % endfor
+
+
+
+
+
+
+'''
+
+
+def analyze_revisions(page, stats, only_last_revision):
+ revisions = more_itertools.peekable(page)
+
+ section_names_stats = stats['section_names_per_revision']
+ sections_stats = stats['sections_per_revision']
+
+ for mw_revision in revisions:
+ utils.dot()
+
+ is_last_revision = not utils.has_next(revisions)
+ if only_last_revision and not is_last_revision:
+ continue
+
+ text = utils.remove_comments(mw_revision.text or '')
+
+ section_names = [section.name.strip().lower()
+ for section, _ in extractors.sections(text)]
+ sections_count = len(section_names)
+
+ for section_name in section_names:
+ section_names_stats['global'][section_name] += 1
+ if is_last_revision:
+ section_names_stats['last_revision'][section_name] += 1
+
+ sections_stats['global'][sections_count] += 1
+ if is_last_revision:
+ sections_stats['last_revision'][sections_count] += 1
+
+ stats['revisions']['global'] += 1
+ if is_last_revision:
+ stats['revisions']['last_revision'] += 1
+
+ stats['performance']['revisions_analyzed'] += 1
+
+
+def analyze_pages(dump, stats, only_last_revision):
+ for mw_page in dump:
+ utils.log("Processing", mw_page.title)
+
+ # Skip non-articles
+ if mw_page.namespace != 0:
+ utils.log('Skipped (namespace != 0)')
+ continue
+
+ analyze_revisions(
+ mw_page,
+ stats=stats,
+ only_last_revision=only_last_revision,
+ )
+
+ stats['performance']['pages_analyzed'] += 1
+
+
+def configure_subparsers(subparsers):
+ parser = subparsers.add_parser('count-sections',
+ help='Count the number of sections and the section names of the dump.')
+ parser.add_argument('--only-last-revision',
+ action='store_true',
+ help='Consider only the last revision for each page.',
+ )
+ parser.set_defaults(func=main)
+
+
+def main(dump, features_output_h, stats_output_h, args):
+ stats = {
+ 'sections_per_revision': {
+ 'global': collections.Counter(),
+ 'last_revision': collections.Counter(),
+ },
+ 'section_names_per_revision': {
+ 'global': collections.Counter(),
+ 'last_revision': collections.Counter(),
+ },
+ 'revisions': collections.Counter(),
+ 'performance': {
+ 'start_time': None,
+ 'end_time': None,
+ 'revisions_analyzed': 0,
+ 'pages_analyzed': 0,
+ }
+ }
+ stats['performance']['start_time'] = datetime.datetime.utcnow()
+ analyze_pages(dump,
+ stats=stats,
+ only_last_revision=args.only_last_revision,
+ )
+ stats['performance']['end_time'] = datetime.datetime.utcnow()
+
+ with stats_output_h:
+ dumper.render_template(
+ stats_template,
+ stats_output_h,
+ stats=stats,
+ )
diff --git a/mwrefs/bibs/utils.py b/mwrefs/bibs/utils.py
new file mode 100644
index 0000000..3651180
--- /dev/null
+++ b/mwrefs/bibs/utils.py
@@ -0,0 +1,91 @@
+import functools
+import collections
+import sys
+import regex as re
+
+
+Diff = collections.namedtuple("Diff", "action data")
+
+
+def diff(previous, current):
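+ # Compare the previous and current iterables as sets and report which
+ # elements were added and which were removed.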
+ # previous = [ref.text for ref in previous]
+ # current = [ref.text for ref in current]
+
+ added = set(current) - set(previous)
+ removed = set(previous) - set(current)
+
+ diff = (
+ [Diff('added', el) for el in added]
+ + [Diff('removed', el) for el in removed]
+ )
+
+ return diff
+
+
+# https://github.com/shazow/unstdlib.py/blob/master/unstdlib/standard/list_.py#L149
+def listify(fn=None, wrapper=list):
+ """
+ A decorator which wraps a function's return value in ``list(...)``.
+
+ Useful when an algorithm can be expressed more cleanly as a generator but
+ the function should return a list.
+
+ Example::
+
+ >>> @listify
+ ... def get_lengths(iterable):
+ ... for i in iterable:
+ ... yield len(i)
+ >>> get_lengths(["spam", "eggs"])
+ [4, 4]
+ >>>
+ >>> @listify(wrapper=tuple)
+ ... def get_lengths_tuple(iterable):
+ ... for i in iterable:
+ ... yield len(i)
+ >>> get_lengths_tuple(["foo", "bar"])
+ (3, 3)
+ """
+ def listify_return(fn):
+ @functools.wraps(fn)
+ def listify_helper(*args, **kw):
+ return wrapper(fn(*args, **kw))
+ return listify_helper
+ if fn is None:
+ return listify_return
+ return listify_return(fn)
+
+
+def iter_with_prev(iterable):
+ last = None
+ for el in iterable:
+ yield last, el
+ last = el
+
+
+def dot(num=None):
+ if not num:
+ what = '.'
+ elif num < 10:
+ what = str(num)
+ else:
+ what = '>'
+ print(what, end='', file=sys.stderr, flush=True)
+
+
+def log(*args):
+ first, *rest = args
+ print('\n' + str(first), *rest, end='', file=sys.stderr, flush=True)
+
+
+def remove_comments(source):
+ # Strip HTML comments (<!-- ... -->), which may span multiple lines.
+ pattern = re.compile(r'<!--.*?-->', re.MULTILINE | re.DOTALL)
+ return pattern.sub('', source)
+
+
+def has_next(peekable):
+ try:
+ peekable.peek()
+ return True
+ except StopIteration:
+ return False
diff --git a/mwrefs/ids/__init__.py b/mwrefs/ids/__init__.py
new file mode 100644
index 0000000..e5d4c7d
--- /dev/null
+++ b/mwrefs/ids/__init__.py
@@ -0,0 +1,3 @@
+from .identifier import Identifier
+
+__version__ = "0.2.0"
diff --git a/mwrefs/ids/extractors/__init__.py b/mwrefs/ids/extractors/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mwrefs/ids/extractors/arxiv.py b/mwrefs/ids/extractors/arxiv.py
new file mode 100644
index 0000000..c7c52c6
--- /dev/null
+++ b/mwrefs/ids/extractors/arxiv.py
@@ -0,0 +1,17 @@
+import re
+
+from ..identifier import Identifier
+
+# From http://arxiv.org/help/arxiv_identifier
+old_id = r"-?(?P([a-z]+(.[a-z]+)/)?[0-9]{4}[0-9]+)"
+new_id = r"(?P[0-9]{4}.[0-9]+)(v[0-9]+)?"
+
+prefixes=["arxiv\s*=\s*", "//arxiv\.org/(abs/)?", "arxiv:\s?"]
+
+ARXIV_RE = re.compile(r"({0})".format("|".join(prefixes)) +
+ r"({0}|{1})".format(old_id, new_id), re.I|re.U)
+
+def extract(text):
+ for match in ARXIV_RE.finditer(text):
+ id = match.group('new_id') or match.group("old_id")
+ yield Identifier("arxiv", id.lower())
diff --git a/mwrefs/ids/extractors/doi.py b/mwrefs/ids/extractors/doi.py
new file mode 100644
index 0000000..c76bb3d
--- /dev/null
+++ b/mwrefs/ids/extractors/doi.py
@@ -0,0 +1,150 @@
+import re
+from collections import defaultdict
+
+from more_itertools import peekable
+
+from ..identifier import Identifier
+
+DOI_START_RE = re.compile(r'10\.[0-9]{4,}/')
+
+HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+ 'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
+ 'pre']
+
+TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + r')(\s[^>\n\r]+)?>', re.I)
+
+'''
+DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)')
+
+def extract_regex(text):
+ for match in DOI_RE.finditer(text):
+ id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
+ yield Identifier("doi", id)
+
+import mwparserfromhell as mwp
+def extract_mwp(text):
+ no_tags = mwp.parse(text).strip_code()
+ for match in DOI_RE.finditer(no_tags):
+ id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
+ yield Identifier("doi", id)
+'''
+
+LEXICON = [
+ (DOI_START_RE.pattern, 'doi_start'),
+ (r'\(', 'open_paren'),
+ (r'\)', 'close_paren'),
+ (r'\[', 'open_bracket'),
+ (r'\]', 'close_bracket'),
+ (r'<!--', 'comment_start'),
+ (r'-->', 'comment_end'),
+ (TAGS_RE.pattern, 'tag'),
+ (r'<', 'open_angle'),
+ (r'>', 'close_angle'),
+ (r'\{', 'open_curly'),
+ (r'\}', 'close_curly'),
+ (r'\|', 'pipe'),
+ (r'[,\.;!]', 'punct'),
+ (r'[\?#]', 'url_end'),
+ (r'[\n\r]+', 'break'),
+ (r'\s+', 'whitespace'),
+ (r'\w+', 'word'),
+ (r'.', 'etc')
+]
+
+def extract_island(text):
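+ # "Island" parsing: tokenize the text and, whenever a doi_start token is
+ # seen, hand the token stream to read_doi() to consume the identifier.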
+ tokens = tokenize_finditer(text, LEXICON)
+ tokens = peekable(tokens)
+
+ while tokens.peek(None) is not None:
+
+ if tokens.peek()[0] == 'doi_start':
+ yield ('doi', read_doi(tokens))
+
+ next(tokens)
+
+
+def tokenize_finditer(text, lexicon=LEXICON):
+ pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
+ for pattern, name in lexicon)
+
+ group_regex = re.compile(pattern, re.I|re.U|re.M)
+
+ for match in group_regex.finditer(text):
+ yield match.lastgroup, match.group(0)
+
+
+"""
+def tokenize_scanner(text, lexicon=LEXICON):
+ scanner = re.Scanner(lexicon)
+ tokens, remainder = scanner.scan(text)
+ return tokens
+"""
+
+#from mwcites.extractors.doi import tokenize_scan
+#list(tokenize_scan("foo bar baz.{}"))
+
+def read_doi(tokens):
+ assert tokens.peek()[0] == 'doi_start'
+
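+ # Track bracket/brace nesting so a DOI may contain balanced brackets
+ # (e.g. "10.1170/foo(herp)derp[waffles]") but stops at an unmatched
+ # closing bracket or brace.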
+ depth = defaultdict(lambda: 0)
+
+ doi_buffer = [next(tokens)[1]]
+
+ while tokens.peek(None) is not None:
+ name, match = tokens.peek()
+
+ if name in ('url_end', 'break', 'whitespace', 'tag', 'pipe',
+ 'comment_start', 'comment_end'):
+ break
+ elif name == 'open_bracket':
+ depth['bracket'] += 1
+ doi_buffer.append(next(tokens)[1])
+ elif name == 'open_curly':
+ depth['curly'] += 1
+ doi_buffer.append(next(tokens)[1])
+ elif name == 'close_bracket':
+ if depth['bracket'] > 0:
+ depth['bracket'] -= 1
+ doi_buffer.append(next(tokens)[1])
+ else:
+ break
+ elif name == 'close_curly':
+ if depth['curly'] > 0:
+ depth['curly'] -= 1
+ doi_buffer.append(next(tokens)[1])
+ else:
+ break
+ else:
+ doi_buffer.append(next(tokens)[1])
+
+
+ # Do not return a doi with punctuation at the end
+ return re.sub(r'[\.,!]+$', '', ''.join(doi_buffer))
+
+
+
+def tokenize_search(text, start, lexicon=LEXICON):
+ pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
+ for pattern, name in lexicon)
+
+ group_regex = re.compile(pattern, re.I|re.U)
+
+ match = group_regex.search(text, start)
+ while match is not None:
+ yield match.lastgroup, match.group(0)
+ match = group_regex.search(text, match.span()[1])
+
+def extract_search(text, lexicon=LEXICON):
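+ # Cheap pre-scan: find DOI prefixes with DOI_START_RE, then tokenize only
+ # from each match onward instead of tokenizing the whole text.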
+
+ last_end = 0
+ for match in DOI_START_RE.finditer(text):
+ if match.span()[0] > last_end:
+ tokens = tokenize_search(text, match.span()[0], lexicon=lexicon)
+ tokens = peekable(tokens)
+ doi = read_doi(tokens)
+ last_end = match.span()[0] + len(doi)
+ yield Identifier('doi', doi)
+ else:
+ last_end = max(match.span()[1], last_end)
+
+extract = extract_search # Setting the default to the best method
diff --git a/mwrefs/ids/extractors/isbn.py b/mwrefs/ids/extractors/isbn.py
new file mode 100644
index 0000000..12883f3
--- /dev/null
+++ b/mwrefs/ids/extractors/isbn.py
@@ -0,0 +1,8 @@
+import re
+from ..identifier import Identifier
+
+ISBN_RE = re.compile(r'isbn\s?=?\s?([0-9\-Xx]+)', re.I)
+
+def extract(text):
+ for match in ISBN_RE.finditer(text):
+ yield Identifier('isbn', match.group(1).replace('-', ''))
diff --git a/mwrefs/ids/extractors/pubmed.py b/mwrefs/ids/extractors/pubmed.py
new file mode 100644
index 0000000..5fbaf67
--- /dev/null
+++ b/mwrefs/ids/extractors/pubmed.py
@@ -0,0 +1,22 @@
+import re
+
+from ..identifier import Identifier
+
+TEMPLATE_RE = re.compile(r"\b(pmid|pmc)\s*=\s*(pmc)?([0-9]+)\b", re.I)
+
+PMURL_RE = re.compile(r"//www\.ncbi\.nlm\.nih\.gov" +
+ r"/pubmed/([0-9]+)\b", re.I)
+PMCURL_RE = re.compile(r"//www\.ncbi\.nlm\.nih\.gov" +
+ r"/pmc/articles/PMC([0-9]+)\b", re.I)
+
+def extract(text):
+ text = str(text or "")
+
+ for match in TEMPLATE_RE.finditer(text):
+ yield Identifier(match.group(1).lower(), match.group(3))
+
+ for match in PMURL_RE.finditer(text):
+ yield Identifier("pmid", match.group(1))
+
+ for match in PMCURL_RE.finditer(text):
+ yield Identifier("pmc", match.group(1))
diff --git a/mwrefs/ids/extractors/tests/__init__.py b/mwrefs/ids/extractors/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mwrefs/ids/extractors/tests/test_arxiv.py b/mwrefs/ids/extractors/tests/test_arxiv.py
new file mode 100644
index 0000000..ee0e7a6
--- /dev/null
+++ b/mwrefs/ids/extractors/tests/test_arxiv.py
@@ -0,0 +1,43 @@
+import pprint
+
+from nose.tools import eq_
+
+from .. import arxiv
+from ...identifier import Identifier
+
+INPUT_TEXT = """
+This is a doi randomly placed in the text 10.0000/m1
+Here's a typo that might be construed as a doi 10.60 people were there.
+{{cite|...|arxiv=0706.0001v1|pmid=10559875}}
+[Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012).
+The rise and decline of an open collaboration system: How Wikipedia’s
+reaction to popularity is causing its decline.
+American Behavioral Scientist,
+0002764212469365 arxiv:0706.0002v1]. Hats pants and banana
+[http://arxiv.org/0706.0003]
+[http://arxiv.org/abs/0706.0004v1]
+[https://arxiv.org/abs/0706.0005v1]
+[https://arxiv.org/abs/math.GT/0309001]
+[https://arxiv.org/abs/-math.gs/0309002]
+{{cite|...|arxiv=foobar.hats/0101003|issue=1656}}
+http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom=
+10.2387/234310.2347/39423
+
+"""
+EXPECTED = [
+ Identifier('arxiv', "0706.0001"),
+ Identifier('arxiv', "0706.0002"),
+ Identifier('arxiv', "0706.0003"),
+ Identifier('arxiv', "0706.0004"),
+ Identifier('arxiv', "0706.0005"),
+ Identifier('arxiv', "math.gt/0309001"),
+ Identifier('arxiv', "math.gs/0309002"),
+ Identifier('arxiv', "foobar.hats/0101003")
+]
+
+def test_extract():
+ ids = list(arxiv.extract(INPUT_TEXT))
+ pprint.pprint(ids)
+ pprint.pprint(EXPECTED)
+ eq_(ids, EXPECTED)
diff --git a/mwrefs/ids/extractors/tests/test_doi.py b/mwrefs/ids/extractors/tests/test_doi.py
new file mode 100644
index 0000000..05b85fa
--- /dev/null
+++ b/mwrefs/ids/extractors/tests/test_doi.py
@@ -0,0 +1,67 @@
+import pprint
+
+from nose.tools import eq_
+
+from .. import doi
+from ...identifier import Identifier
+
+INPUT_TEXT = """
+This is a doi randomly placed in the text 10.0000/m1
+Here's a typo that might be construed as a doi 10.60 people were there.
+{{cite|...|doi=10.0000/m2|pmid=10559875}}
+[Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012).
+The rise and decline of an open collaboration system: How Wikipedia’s
+reaction to popularity is causing its decline.
+American Behavioral Scientist,
+0002764212469365 doi: 10.1177/0002764212469365]. Hats pants and banana
+[http://dx.doi.org/10.1170/foo(herp)derp]
+[http://dx.doi.org/10.1170/foo(herp)derp[waffles]]
+{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}}
+http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom=
+10.2387/234310.2347/39423
+
+"""
+EXPECTED = [
+ Identifier('doi', "10.0000/m1"),
+ Identifier('doi', "10.0000/m2"),
+ Identifier('doi', "10.1177/0002764212469365"),
+ Identifier('doi', "10.1170/foo(herp)derp"),
+ Identifier('doi', "10.1170/foo(herp)derp[waffles]"),
+ Identifier('doi', "10.1098/rspb.2008.1131"),
+ Identifier('doi', "10.2387/234310.2347/39423"),
+ Identifier('doi', "10.2387/234310.2347/39423")
+]
+
+"""
+def test_extract_regex():
+ ids = list(doi.extract_regex(INPUT_TEXT))
+ pprint.pprint(ids)
+ pprint.pprint(EXPECTED)
+ eq_(ids, EXPECTED)
+
+def test_extract_mwp():
+ ids = list(doi.extract_mwp(INPUT_TEXT))
+ pprint.pprint(ids)
+ pprint.pprint(EXPECTED)
+ eq_(ids, EXPECTED)
+"""
+
+def test_extract():
+ ids = list(doi.extract(INPUT_TEXT))
+ pprint.pprint(ids)
+ pprint.pprint(EXPECTED)
+ eq_(ids, EXPECTED)
+
+def test_extract_island():
+ ids = list(doi.extract_island(INPUT_TEXT))
+ pprint.pprint(ids)
+ pprint.pprint(EXPECTED)
+ eq_(ids, EXPECTED)
+
+def test_extract_search():
+ ids = list(doi.extract_search(INPUT_TEXT))
+ pprint.pprint(ids)
+ pprint.pprint(EXPECTED)
+ #pprint.pprint(list(doi.tokenize_finditer(INPUT_TEXT)))
+ eq_(ids, EXPECTED)
diff --git a/mwrefs/ids/extractors/tests/test_isbn.py b/mwrefs/ids/extractors/tests/test_isbn.py
new file mode 100644
index 0000000..cd41776
--- /dev/null
+++ b/mwrefs/ids/extractors/tests/test_isbn.py
@@ -0,0 +1,44 @@
+import pprint
+from nose.tools import eq_
+
+from .. import isbn
+from ...identifier import Identifier
+
+INPUT_TEXT = """
+ | publisher=Academic Press | isbn=0124366031
+ | isbn=3540206310
+ | accessdate=2008-02-05 | isbn=0-618-34342-3
+ | isbn=978-0-140-27666-4
+ | isbn = 0-13-054091-9
+ | isbn=0195305736 }}</ref> schlug [[Irving Langmuir]] 1919 vor, dass das Elektronen in einem Atom verbunden oder verklumpt seien. Elektronengruppen beset
+ | ISBN=978-3-7046-5112-9
+ * Peter L. Bergen: ''Heiliger Krieg, Inc.: Osama bin Ladens Terrornetz''. Siedler, Berlin 2001, ISBN 3-88680-752-5.
+ * Marwan Abou-Taam, Ruth Bigalke (Hgg) ''Die Reden des Osama bin Laden''. Diederichs, München 2006, ISBN 3-72052-773-5. (Reden und Ansprachen des b.L. im Original - ''Rezensionen: '' [http://www.sicherheit-heute.de/index.php?cccpage=readpolitik&set_z_artikel=221 ]und [http://www.fr-online.de/in_und_ausland/kultur_und_medien/buecher/?em_cnt=868715&sid=f55727] Frankf. Rundschau 26. April 2006)
+ * Michael Pekler, Andreas Ungerböck: ''Ang Lee und seine Filme''. Schüren, Marburg 2009, ISBN 978-3-89472-665-2.
+ <ref name="flos1">{{Literatur | Autor = René Flosdorff, Günther Hilgarth | Titel = Elektrische Energieverteilung | Verlag = Teubner | Auflage = 8. | Jahr = 2003 | Kapitel = Kapitel 1.2.2.4 | ISBN = 3-519-26424-2 }}</ref>
+ Bei einer [[Sprungtemperatur]] von 1,2 K wird reines Aluminium [[Supraleiter|supraleitend]].<ref>{{Literatur | Autor = Ilschner | first = Bernhard | Titel = Werkstoffwissenschaften und Fertigungstechnik Eigenschaften, Vorgänge, Technologien | Verlag = Springer | Ort = Berlin | Jahr = 2010 | ISBN = 978-3-642-01734-6 | Seiten = 277}}</ref>
+ * {{Literatur | Autor=Michael J. Padilla, Ioannis Miaoulis, Martha Cyr | Jahr = 2002 | Titel = Prentice Hall Science Explorer: Chemical Building Blocks | Verlag = Prentice-Hall, Inc. | Ort = Upper Saddle River, New Jersey USA | ISBN = 0-13-054091-9 | |Originalsprache=en}}
+ """
+
+
+EXPECTED = [
+ Identifier('isbn', '0124366031'),
+ Identifier('isbn', '3540206310'),
+ Identifier('isbn', '0618343423'),
+ Identifier('isbn', '9780140276664'),
+ Identifier('isbn', '0130540919'),
+ Identifier('isbn', '0195305736'),
+ Identifier('isbn', '9783704651129'),
+ Identifier('isbn', '3886807525'),
+ Identifier('isbn', '3720527735'),
+ Identifier('isbn', '9783894726652'),
+ Identifier('isbn', '3519264242'),
+ Identifier('isbn', '9783642017346'),
+ Identifier('isbn', '0130540919'),
+]
+
+def test_extract():
+ ids = list(isbn.extract(INPUT_TEXT))
+ pprint.pprint(ids)
+ pprint.pprint(EXPECTED)
+ eq_(ids, EXPECTED)
diff --git a/mwrefs/ids/extractors/tests/test_pubmed.py b/mwrefs/ids/extractors/tests/test_pubmed.py
new file mode 100644
index 0000000..48f98d9
--- /dev/null
+++ b/mwrefs/ids/extractors/tests/test_pubmed.py
@@ -0,0 +1,27 @@
+from nose.tools import eq_
+
+from .. import pubmed
+from ...identifier import Identifier
+
+def test_extract():
+
+ text = """
+ This is some text with a template cite. {{cite|...|...|pmid=1}}.
+ This is some text with a template cite. {{cite|...|...|pmid = 2|...}}.
+ This is some text with a template cite. {{cite|...|...|pmc = 3|...}}.
+ This is some text with a template cite. {{cite|...|...|pmc = pmc4|...}}.
+ This is some text with a link [http://www.ncbi.nlm.nih.gov/pubmed/5 ID]
+ Another link [https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6 ID]
+ """
+ ids = list(pubmed.extract(text))
+ expected = [
+ Identifier('pmid', "1"),
+ Identifier('pmid', "2"),
+ Identifier('pmc', "3"),
+ Identifier('pmc', "4"),
+ Identifier('pmid', "5"),
+ Identifier('pmc', "6")
+ ]
+ print(ids)
+ print(expected)
+ eq_(ids, expected)
diff --git a/mwrefs/ids/fetchers/doi.py b/mwrefs/ids/fetchers/doi.py
new file mode 100644
index 0000000..eb963d3
--- /dev/null
+++ b/mwrefs/ids/fetchers/doi.py
@@ -0,0 +1,48 @@
+import requests
+
+CITOID_HOST = 'https://citoid.wikimedia.org'
+
+
+def lookup_via_citoid(doi):
+ url = CITOID_HOST + "/api"
+ params = {
+ 'format': "mediawiki",
+ 'search': doi
+ }
+ response = requests.get(url, params=params)
+ doc = response.json()
+ if 'Error' in doc:
+ raise RuntimeError(doc['Error'])
+ else:
+ return doc
+
+
+def lookup_via_doidotorg(doi):
+ url = "http://doi.org"
+ data = {
+ "hdl": doi
+ }
+ response = requests.post(
+ url, data=data, headers={'Accept': "application/json"})
+ if response.status_code == 404:
+ raise RuntimeError("DOI not found")
+ elif response.status_code == 200:
+ return response.json()
+ else:
+ raise RuntimeError("Unknown error")
+
+METHODS = {
+ 'doi.org': lookup_via_doidotorg,
+ 'citoid.wikimedia.org': lookup_via_citoid
+}
+
+
+def lookup(doi, methods=['doi.org']):
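+ # Try each configured lookup method in order; if one raises, fall back to
+ # the next and only re-raise when the last method also fails.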
+ for i, method in enumerate(methods):
+ try:
+ return METHODS[method](doi)
+ except RuntimeError as e:
+ if i+1 == len(methods):
+ raise e
+ else:
+ continue
diff --git a/mwrefs/ids/identifier.py b/mwrefs/ids/identifier.py
new file mode 100644
index 0000000..44f9b03
--- /dev/null
+++ b/mwrefs/ids/identifier.py
@@ -0,0 +1,3 @@
+from collections import namedtuple
+
+Identifier = namedtuple("Identifier", ['type', 'id'])
diff --git a/mwrefs/ids/utilities/__init__.py b/mwrefs/ids/utilities/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mwrefs/ids/utilities/extract.py b/mwrefs/ids/utilities/extract.py
new file mode 100644
index 0000000..1dfa0ca
--- /dev/null
+++ b/mwrefs/ids/utilities/extract.py
@@ -0,0 +1,160 @@
+"""
+Extracts academic citations from the history of Wikipedia articles by
+processing a pages-meta-history XML dump and matching regular
+expressions to revision content.
+
+Currently supported identifiers include:
+
+ * PubMed
+ * DOI
+ * ISBN
+ * arXiv
+
+Outputs a TSV file with the following fields:
+
+ * page_id: The identifier of the Wikipedia article (int), e.g. 1325125
+ * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell
+ * rev_id: The Wikipedia revision where the citation was first added (int),
+ e.g. 282470030
+ * timestamp: The timestamp of the revision where the citation was first added.
+ (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z
+ * type: The type of identifier, e.g. pmid, pmcid, doi, arxiv or isbn
+ * id: The id of the cited scholarly article (utf-8),
+ e.g 10.1183/09031936.00213411
+
+Usage:
+ extract -h | --help
+ extract <dump-file>... [--extractor=<path>...]
+
+Options:
+ -h --help Shows this documentation
+ <dump-file> The path to a set of dump files to process. If no
+ files are specified, <stdin> will be read.
+ --extractor=<path> The class path to set of extractors to apply
+ [default: <all>]
+"""
+import sys
+from importlib import import_module
+from itertools import chain
+
+import docopt
+import mwxml
+
+import mysqltsv
+
+from ..extractors import arxiv, doi, isbn, pubmed
+
+ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv]
+
+HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id")
+
+def main(argv=None):
+ args = docopt.docopt(__doc__, argv=argv)
+ dump_files = args['<dump-file>']
+
+ if args['--extractor'] == ['<all>']:
+ extractors = ALL_EXTRACTORS
+ else:
+ extractors = [import_from_path(path.lower())
+ for path in args['--extractor']]
+
+ run(dump_files, extractors)
+
+def run(dump_files, extractors):
+ writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)
+
+ cites = extract(dump_files, extractors=extractors)
+ for page_id, title, rev_id, timestamp, type, id in cites:
+ writer.write(page_id, title, rev_id, timestamp.long_format(), type, id)
+
+def extract(dump_files, extractors=ALL_EXTRACTORS):
+ """
+ Extracts cites from a set of `dump_files`.
+
+ :Parameters:
+ dump_files : str | `file`
+ A set of files MediaWiki XML dump files
+ (expects: pages-meta-history)
+ extractors : `list`(`extractor`)
+ A list of extractors to apply to the text
+
+ :Returns:
+ `iterable` -- a generator of extracted cites
+
+ """
+ # Dump processor function
+ def process_dump(dump, path):
+ for page in dump:
+ if page.namespace != 0: continue
+ else:
+ for cite in extract_cite_history(page, extractors):
+ yield cite
+
+ # Map call
+ return mwxml.map(process_dump, dump_files)
+
+def extract_cite_history(page, extractors):
+ """
+ Extracts cites from the history of a `page` (`mwxml.Page`).
+
+ :Parameters:
+ page : `iterable`(`mwxml.Revision`)
+ The page to extract cites from
+ extractors : `list`(`extractor`)
+ A list of extractors to apply to the text
+
+ :Returns:
+ `iterable` -- a generator of extracted cites
+
+ """
+ appearances = {} # For tracking the first appearance of an ID
+ ids = set() # For holding onto the ids in the last revision.
+ for revision in page:
+ ids = set(extract_ids(revision.text, extractors))
+
+ # For each ID, check to see if we have seen it before
+ for id in ids:
+ if id not in appearances:
+ appearances[id] = (revision.id, revision.timestamp)
+
+ for id in ids: #For the ids in the last version of the page
+ rev_id, timestamp = appearances[id]
+ yield (page.id, page.title, rev_id, timestamp, id.type, id.id)
+
+def extract_ids(text, extractors):
+ """
+ Uses `extractors` to extract citation identifiers from a text.
+
+ :Parameters:
+ text : str
+ The text to process
+ extractors : `list`(`extractor`)
+ A list of extractors to apply to the text
+
+ :Returns:
+ `iterable` -- a generator of extracted identifiers
+ """
+ for extractor in extractors:
+ for id in extractor.extract(text):
+ yield id
+
+def import_from_path(path):
+ """
+ Imports a specific attribute from a module based on a class path.
+
+ :Parameters:
+ path : str
+ A dot delimited string representing the import path of the desired
+ object.
+
+ :Returns:
+ object -- An imported object
+ """
+ parts = path.split(".")
+ module_path = ".".join(parts[:-1])
+ attribute_name = parts[-1]
+
+ module = import_module(module_path)
+
+ attribute = getattr(module, attribute_name)
+
+ return attribute
diff --git a/mwrefs/ids/utilities/tests/__init__.py b/mwrefs/ids/utilities/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mwrefs/ids/utilities/tests/test_extract.py b/mwrefs/ids/utilities/tests/test_extract.py
new file mode 100644
index 0000000..e4353ca
--- /dev/null
+++ b/mwrefs/ids/utilities/tests/test_extract.py
@@ -0,0 +1,41 @@
+from collections import namedtuple
+
+from mw import Timestamp
+from nose.tools import eq_
+
+from ..extract import extract_cite_history
+from ...identifier import Identifier
+
+
+def test_extract_cite_history():
+ FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text'])
+
+ FakeExtractor = namedtuple("Extractor", ['extract'])
+
+ class FakePage:
+ def __init__(self, id, title):
+ self.id = id
+ self.title = title
+ def __iter__(self):
+ return iter([
+ FakeRevision(1, Timestamp(1), "id1 id2"),
+ FakeRevision(2, Timestamp(2), "id1 id3"),
+ FakeRevision(3, Timestamp(3), "id1 id2 id3"),
+ FakeRevision(4, Timestamp(4), "id1 id2 id4"),
+ FakeRevision(5, Timestamp(5), "id1 id2 id4"),
+ ])
+
+ fake_page = FakePage(1, "Title")
+
+ def extract(text):
+ return (Identifier('fake', id) for id in text.split(" "))
+ extractor = FakeExtractor(extract)
+
+ expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"),
+ (1, "Title", 1, Timestamp(1), "fake", "id2"),
+ (1, "Title", 4, Timestamp(4), "fake", "id4")]
+
+ citations = list(extract_cite_history(fake_page, [extractor]))
+ eq_(len(citations), len(expected))
+ for cite in extract_cite_history(fake_page, [extractor]):
+ assert cite in expected
diff --git a/mwrefs/extract.py b/mwrefs/refs/extract.py
similarity index 100%
rename from mwrefs/extract.py
rename to mwrefs/refs/extract.py
diff --git a/mwrefs/tests/test_extract.py b/mwrefs/tests/test_extract.py
index ca2a660..7dd8600 100644
--- a/mwrefs/tests/test_extract.py
+++ b/mwrefs/tests/test_extract.py
@@ -41,15 +41,15 @@ def test_extract():
eq_(refs,
['[{{cite web\n |url=http://topics.info.com/Who-coined-the-' +
- 'term-biology_716 |title=Who coined\n the term biology? |work=' +
- 'Info.com|accessdate=2012-06-03}}]',
+ 'term-biology_716 |title=Who coined\n the term biology? ' +
+ '|work=Info.com|accessdate=2012-06-03}}',
'[{{cite web|title=biology\n |url=http://' +
- 'www.etymonline.com/index.php?term=biology&allowed_in_frame=0\n ' +
- ' |publisher=[[Online Etymology Dictionary]]}}]',
+ 'www.etymonline.com/index.php?term=biology&allowed_in_frame=0\n ' +
+ ' |publisher=[[Online Etymology Dictionary]]}}',
'', '',
'[\n {{cite book|last=Richards|first=Robert J.' +
- '|title=The Romantic Conception of\n Life: Science and ' +
- 'Philosophy in the Age of Goethe|year=2002\n |publisher=' +
- 'University of Chicago Press|isbn=0-226-71210-9\n ' +
- '|url=http://books.google.cocover#v=onepage&q&f=false}}]',
+ '|title=The Romantic Conception of\n Life: Science and ' +
+ 'Philosophy in the Age of Goethe|year=2002\n |publisher=' +
+ 'University of Chicago Press|isbn=0-226-71210-9\n ' +
+ '|url=http://books.google.cocover#v=onepage&q&f=false}}',
'[foobar]'])
diff --git a/mwrefs/utilities/diffs.py b/mwrefs/utilities/diff_ref_tags.py
similarity index 100%
rename from mwrefs/utilities/diffs.py
rename to mwrefs/utilities/diff_ref_tags.py
diff --git a/mwrefs/utilities/extract_ids.py b/mwrefs/utilities/extract_ids.py
new file mode 100644
index 0000000..8b58b24
--- /dev/null
+++ b/mwrefs/utilities/extract_ids.py
@@ -0,0 +1,166 @@
+"""
+Extracts academic citations from the history of Wikipedia articles by
+processing a pages-meta-history XML dump and matching regular
+expressions to revision content.
+
+Currently supported identifiers include:
+
+ * PubMed
+ * DOI
+ * ISBN
+ * arXiv
+
+Outputs a TSV file with the following fields:
+
+ * page_id: The identifier of the Wikipedia article (int), e.g. 1325125
+ * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell
+ * rev_id: The Wikipedia revision where the citation was first added (int),
+ e.g. 282470030
+ * timestamp: The timestamp of the revision where the citation was first added.
+ (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z
+ * type: The type of identifier, e.g. pmid, pmcid, doi, arxiv or isbn
+ * id: The id of the cited scholarly article (utf-8),
+ e.g 10.1183/09031936.00213411
+
+Usage:
+ extract -h | --help
+ extract <dump-file>... [--extractor=<path>...]
+
+Options:
+ -h --help Shows this documentation
+ <dump-file> The path to a set of dump files to process. If no
+ files are specified, <stdin> will be read.
+ --extractor=<path> The class path to set of extractors to apply
+ [default: <all>]
+"""
+import sys
+from importlib import import_module
+from itertools import chain
+
+import docopt
+import mwxml
+
+import mysqltsv
+
+from ..extractors import arxiv, doi, isbn, pubmed
+
+ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv]
+
+HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id")
+
+
+def main(argv=None):
+ args = docopt.docopt(__doc__, argv=argv)
+ dump_files = args['<dump-file>']
+
+ if args['--extractor'] == ['<all>']:
+ extractors = ALL_EXTRACTORS
+ else:
+ extractors = [import_from_path(path.lower())
+ for path in args['--extractor']]
+
+ run(dump_files, extractors)
+
+
+def run(dump_files, extractors):
+ writer = mysqltsv.Writer(sys.stdout, headers=HEADERS)
+
+ cites = extract(dump_files, extractors=extractors)
+ for page_id, title, rev_id, timestamp, type, id in cites:
+ writer.write(page_id, title, rev_id, timestamp.long_format(), type, id)
+
+
+def extract(dump_files, extractors=ALL_EXTRACTORS):
+ """
+ Extracts cites from a set of `dump_files`.
+
+ :Parameters:
+ dump_files : str | `file`
+ A set of files MediaWiki XML dump files
+ (expects: pages-meta-history)
+ extractors : `list`(`extractor`)
+ A list of extractors to apply to the text
+
+ :Returns:
+ `iterable` -- a generator of extracted cites
+
+ """
+ # Dump processor function
+ def process_dump(dump, path):
+ for page in dump:
+ if page.namespace != 0: continue
+ else:
+ for cite in extract_cite_history(page, extractors):
+ yield cite
+
+ # Map call
+ return mwxml.map(process_dump, dump_files)
+
+
+def extract_cite_history(page, extractors):
+ """
+ Extracts cites from the history of a `page` (`mwxml.Page`).
+
+ :Parameters:
+ page : `iterable`(`mwxml.Revision`)
+ The page to extract cites from
+ extractors : `list`(`extractor`)
+ A list of extractors to apply to the text
+
+ :Returns:
+ `iterable` -- a generator of extracted cites
+
+ """
+ appearances = {} # For tracking the first appearance of an ID
+ ids = set() # For holding onto the ids in the last revision.
+ for revision in page:
+ ids = set(extract_ids(revision.text, extractors))
+
+ # For each ID, check to see if we have seen it before
+ for id in ids:
+ if id not in appearances:
+ appearances[id] = (revision.id, revision.timestamp)
+
+ for id in ids: #For the ids in the last version of the page
+ rev_id, timestamp = appearances[id]
+ yield (page.id, page.title, rev_id, timestamp, id.type, id.id)
+
+def extract_ids(text, extractors):
+ """
+ Uses `extractors` to extract citation identifiers from a text.
+
+ :Parameters:
+ text : str
+ The text to process
+ extractors : `list`(`extractor`)
+ A list of extractors to apply to the text
+
+ :Returns:
+ `iterable` -- a generator of extracted identifiers
+ """
+ for extractor in extractors:
+ for id in extractor.extract(text):
+ yield id
+
+
+def import_from_path(path):
+ """
+ Imports a specific attribute from a module based on a class path.
+
+ :Parameters:
+ path : str
+ A dot delimited string representing the import path of the desired
+ object.
+
+ :Returns:
+ object -- An imported object
+ """
+ parts = path.split(".")
+ module_path = ".".join(parts[:-1])
+ attribute_name = parts[-1]
+
+ module = import_module(module_path)
+
+ attribute = getattr(module, attribute_name)
+
+ return attribute
diff --git a/mwrefs/utilities/extract.py b/mwrefs/utilities/extract_ref_tags.py
similarity index 100%
rename from mwrefs/utilities/extract.py
rename to mwrefs/utilities/extract_ref_tags.py
diff --git a/mwrefs/utilities/fetch_metadata.py b/mwrefs/utilities/fetch_metadata.py
new file mode 100644
index 0000000..e69de29
diff --git a/mwrefs/utilities/tests/__init__.py b/mwrefs/utilities/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mwrefs/utilities/tests/test_extract_ids.py b/mwrefs/utilities/tests/test_extract_ids.py
new file mode 100644
index 0000000..e4353ca
--- /dev/null
+++ b/mwrefs/utilities/tests/test_extract_ids.py
@@ -0,0 +1,41 @@
+from collections import namedtuple
+
+from mw import Timestamp
+from nose.tools import eq_
+
+from ..extract_ids import extract_cite_history
+from ...identifier import Identifier
+
+
+def test_extract_cite_history():
+ FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text'])
+
+ FakeExtractor = namedtuple("Extractor", ['extract'])
+
+ class FakePage:
+ def __init__(self, id, title):
+ self.id = id
+ self.title = title
+ def __iter__(self):
+ return iter([
+ FakeRevision(1, Timestamp(1), "id1 id2"),
+ FakeRevision(2, Timestamp(2), "id1 id3"),
+ FakeRevision(3, Timestamp(3), "id1 id2 id3"),
+ FakeRevision(4, Timestamp(4), "id1 id2 id4"),
+ FakeRevision(5, Timestamp(5), "id1 id2 id4"),
+ ])
+
+ fake_page = FakePage(1, "Title")
+
+ def extract(text):
+ return (Identifier('fake', id) for id in text.split(" "))
+ extractor = FakeExtractor(extract)
+
+ expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"),
+ (1, "Title", 1, Timestamp(1), "fake", "id2"),
+ (1, "Title", 4, Timestamp(4), "fake", "id4")]
+
+ citations = list(extract_cite_history(fake_page, [extractor]))
+ eq_(len(citations), len(expected))
+ for cite in extract_cite_history(fake_page, [extractor]):
+ assert cite in expected