diff --git a/mwrefs/bibs/__main__.py b/mwrefs/bibs/__main__.py new file mode 100644 index 0000000..b9b65dd --- /dev/null +++ b/mwrefs/bibs/__main__.py @@ -0,0 +1,116 @@ +import argparse +import subprocess +import codecs +import os + +import mw.xml_dump +import mwxml +import pathlib + +from . import utils, processors + + +def open_xml_file(path): + f = mw.xml_dump.functions.open_file( + mw.xml_dump.functions.file(path) + ) + return f + + +def compressor_7z(file_path): + p = subprocess.Popen( + ['7z', 'a', '-si', file_path], + stdin=subprocess.PIPE, + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + ) + utf8writer = codecs.getwriter('utf-8') + + return utf8writer(p.stdin) + + +def output_writer(path, compression): + if compression == '7z': + return compressor_7z(path + '.7z') + else: + return open(path, 'wt', encoding='utf-8') + + +def create_path(path): + path = pathlib.Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + + +def get_args(): + parser = argparse.ArgumentParser( + prog='wikidump', + description='Wikidump features extractor.', + ) + parser.add_argument('files', + metavar='FILE', + type=pathlib.Path, + nargs='+', + help='XML Wikidump file to parse. It accepts only 7z.' + ) + parser.add_argument('output_dir_path', + metavar='OUTPUT_DIR', + type=pathlib.Path, + help='XML output directory.', + ) + parser.add_argument('--output-compression', + choices={None, '7z'}, + required=False, + default=None, + help='Output compression format.', + ) + parser.add_argument('--dry-run', '-n', + action='store_true', + help="Don't write any file", + ) + + subparsers = parser.add_subparsers(help='sub-commands help') + processors.bibliography_extractor.configure_subparsers(subparsers) + processors.identifiers_extractor.configure_subparsers(subparsers) + processors.sections_counter.configure_subparsers(subparsers) + + parsed_args = parser.parse_args() + if 'func' not in parsed_args: + parser.print_usage() + parser.exit(1) + + return parsed_args + + +def main(): + args = get_args() + + args.output_dir_path.mkdir(parents=True, exist_ok=True) + + for input_file_path in args.files: + utils.log("Analyzing {}...".format(input_file_path)) + + dump = mwxml.Dump.from_file(open_xml_file(str(input_file_path))) + + basename = input_file_path.name + + if args.dry_run: + pages_output = open(os.devnull, 'wt') + stats_output = open(os.devnull, 'wt') + else: + pages_output = output_writer( + path=str(args.output_dir_path/(basename + '.features.xml')), + compression=args.output_compression, + ) + stats_output = output_writer( + path=str(args.output_dir_path/(basename + '.stats.xml')), + compression=args.output_compression, + ) + args.func(dump, + pages_output, + stats_output, + args, + ) + + +if __name__ == '__main__': + main() diff --git a/mwrefs/bibs/dumper.py b/mwrefs/bibs/dumper.py new file mode 100644 index 0000000..7e78a8b --- /dev/null +++ b/mwrefs/bibs/dumper.py @@ -0,0 +1,99 @@ +import mako.runtime +import mako.template + +pages_revisions_template = ''' +<%! 
+ from itertools import groupby + def groupby_action(diff): + return groupby(diff, lambda d: d.action) +%> + + % for page in pages: + + ${page.title} + ${page.id} + + % for revision in page.revisions: + + ${revision.id} + + ${revision.timestamp} + + % for key, group in groupby_action(revision.references_diff): + + % for _, text in group: + ${text} + % endfor + + % endfor + + + % for key, group in groupby_action(revision.publication_identifiers_diff): + + % for _, identifier in group: + + % endfor + + % endfor + + + % for section in revision.sections: +
${section.name}
+ % endfor +
+ ${revision.bibliography} +
+ %endfor +
+
+ % endfor +
+''' + +stats_template = ''' + + + ${stats['performance']['start_time']} + ${stats['performance']['end_time']} + ${stats['performance']['revisions_analyzed']} + ${stats['performance']['pages_analyzed']} + + + % for key in ['global', 'last_revision']: + <${key}> + % for where, count in stats['identifiers'][key].items(): + + % endfor + + % endfor + + +''' + + +def render_template(template, output_handler, default_filters=None, **kwargs): + ctx = mako.runtime.Context(output_handler, **kwargs) + + xml_template = mako.template.Template( + template, + default_filters=default_filters, + ) + xml_template.render_context(ctx) + + +def serialize_page_revisions(pages, output_handler): + render_template( + pages_revisions_template, + output_handler, + default_filters=['x'], # XML escaping + pages=pages, + ) + + +def serialize_stats(stats, output_handler): + render_template( + stats_template, + output_handler, + default_filters=['x'], # XML escaping + stats=stats, + ) diff --git a/mwrefs/bibs/languages.py b/mwrefs/bibs/languages.py new file mode 100644 index 0000000..655e06f --- /dev/null +++ b/mwrefs/bibs/languages.py @@ -0,0 +1,82 @@ +supported = {'en', 'it'} + +bibliography = { + 'en': { + 'bibliography', + 'references', + 'reference', + 'further reading', + 'notes', + 'sources', + 'footnotes', + 'citations', + 'publications', + 'publication history', + 'literature', + }, + 'it': {'bibliografia'}, +} + +citation = { + 'en': {'Citation', 'cite', 'vcite'}, +} + +""" +What I mean by: +* References: a section containing footnotes for works cited in the text. +* Bibliography: a section containing articles and journals. +* Further reading: like `Bibliography`, but contains references not used in the text. +* Footnotes: a section containing explanations of concepts. + +From now on, words in backquotes (`) are to be interpreted as concepts using the above definitions, while words in double quotes (") are to be interpreted as terms found in the text of the articles. + +"References" (term) is commonly used as `Bibliography` (concept), i.e. articles and journals without back-references to the text. +And, of course, "Bibliography" (term) is sometimes used as `References` (concept).
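The synonym sets above are what the bibliography extractor fuzzily matches section titles against (see `is_secion_bibliography` in processors/bibliography_extractor.py, which uses a score cutoff of 91). A minimal sketch of that check; the helper name below is illustrative and not part of this patch:

```python
import fuzzywuzzy.process

# Canonical bibliography-like section titles (mirrors languages.bibliography['en']).
BIBLIOGRAPHY_SYNONYMS = {
    'bibliography', 'references', 'reference', 'further reading', 'notes',
    'sources', 'footnotes', 'citations', 'publications', 'literature',
}

def looks_like_bibliography(section_name, score_cutoff=91):
    # extractOne returns the best (synonym, score) pair, or None when nothing
    # reaches score_cutoff; its default processor lower-cases the title and
    # strips punctuation, so case variants and small misspellings still match.
    match = fuzzywuzzy.process.extractOne(
        section_name,
        BIBLIOGRAPHY_SYNONYMS,
        score_cutoff=score_cutoff,
    )
    return match is not None

print(looks_like_bibliography('References'))  # True
print(looks_like_bibliography('Early life'))  # False
```

Fuzzy matching with a cutoff, rather than exact string equality, is what lets the extractor cope with the noisy section naming documented in the examples below: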
+* https://en.wikipedia.org/w/index.php?title=Anabaptists&oldid=49953891 "References" interpreted as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=Alcopop&oldid=296736852 "References" interpreted as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=Amu%20Darya&oldid=66374611 "References" interpreted as `Bibliography` + +"Citations" (term) is sometimes used as a synonym for "References" or "Bibliography" (terms): +* https://en.wikipedia.org/w/index.php?title=Augustine_of_Canterbury&oldid=676642624 "Citations" used as `References`, "References" used as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=Anemometer&oldid=674186492#Citations "Citations" used as `References` + +"Notes and References" and "References and Notes" (terms) are used as synonyms for "References" (term): +* https://en.wikipedia.org/w/index.php?title=Ackermann%20function&oldid=335603599#Notes_and_references "Notes and References" converted to "References" (term) and interpreted as `References` +* https://en.wikipedia.org/w/index.php?title=albanians&oldid=391045161#Notes_and_references "Notes and References" is a wrapper around "Notes" (interpreted as `footnotes`) and "References" (interpreted as `References`) +* https://en.wikipedia.org/w/index.php?title=assassination&oldid=678057527#Notes_and_references interpreted as `References` + +"Sources" seems to be interpreted as `Bibliography` or `References`, and sometimes then converted by users to "References" or "Bibliography": +* https://en.wikipedia.org/w/index.php?title=artemis&diff=next&oldid=565871969 "Sources" has been converted to "References and sources" +* https://en.wikipedia.org/w/index.php?title=Amakusa&direction=next&oldid=667294099 "Sources" used as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=A%20Doll's%20House&oldid=676505492#Sources "Sources" used as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=A.%20E.%20Housman&diff=next&oldid=678259900#Sources "Sources" used as `Bibliography` + +"Footnotes" is commonly interpreted as `References`, with the following terms: "References" and "Citations" +* https://en.wikipedia.org/w/index.php?title=Augustine%20of%20Canterbury&oldid=459457206#Footnotes "Footnotes" is used as `References`; "Footnotes" is then converted to "Citations", used as `References` +* https://en.wikipedia.org/w/index.php?title=Amoxicillin&diff=next&oldid=423375138 "Footnotes" used as and converted to `References` +* https://en.wikipedia.org/w/index.php?title=Anabaptists&oldid=49953891#Footnotes_and_references "Footnotes" interpreted as `References`. The next revision converts "Footnotes" to "Footnotes and References". +* https://en.wikipedia.org/w/index.php?title=Alcopop&oldid=296736852#Footnotes "Footnotes" used as `References` +* https://en.wikipedia.org/w/index.php?title=Archaeopteryx&diff=next&oldid=326796096 "Footnotes" interpreted as and then converted to `References` (term and concept) +* https://en.wikipedia.org/w/index.php?title=Al%20Capp&oldid=590148186#Footnotes "Footnotes" interpreted as `References`. It is then converted to "Notes" +* https://en.wikipedia.org/w/index.php?title=Amu%20Darya&oldid=66374611#Footnotes "Footnotes" interpreted as `References`.
Later converted to "Notes" +* https://en.wikipedia.org/w/index.php?title=Albert%20Brooks&oldid=150996845#Footnotes "Footnotes" used as and then converted to `References` (term and concept) + +"Literature" is used most of the times as a subsection for things like "Culture", and in some cases is a replacement for "bibliography": +* https://en.wikipedia.org/w/index.php?title=Alexandria&oldid=678355005 "Literature" used as subsection of "Culture" +* https://en.wikipedia.org/w/index.php?title=Bible&oldid=23508742#Literature "Literature" used as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=Board_game&oldid=7131437#Literature "Literature" used as "Bibliography", then converted to "References" (used as "Bibliography") +* https://en.wikipedia.org/w/index.php?title=Ahuitzotl&oldid=118183827 "Literature" interpreted as `Bibliography` + +"Publications" and "Publication history" are used as a subsection for the "Biography" with the works of the person described. + +"Reference" is almost always converted to "References" in a successive revision. + + +"Notes" is sometimes interpreted as `References` or `Footnotes` +* https://en.wikipedia.org/w/index.php?title=Ahuitzotl&oldid=118183827 "Notes" used as `Footnotes` +* https://en.wikipedia.org/w/index.php?title=Archaeoastronomy&oldid=678777218#Notes "Notes" used as `References` +* https://en.wikipedia.org/w/index.php?title=Alexander_of_Hales&oldid=661215939#Other_historical_works "Notes" interpreted as `References` + +"See also" and "Related pages" usually contain links to other wikipedia pages. +""" + diff --git a/mwrefs/bibs/processors/__init__.py b/mwrefs/bibs/processors/__init__.py new file mode 100644 index 0000000..d613340 --- /dev/null +++ b/mwrefs/bibs/processors/__init__.py @@ -0,0 +1 @@ +from . import bibliography_extractor, identifiers_extractor, sections_counter diff --git a/mwrefs/bibs/processors/bibliography_extractor.py b/mwrefs/bibs/processors/bibliography_extractor.py new file mode 100644 index 0000000..8285a65 --- /dev/null +++ b/mwrefs/bibs/processors/bibliography_extractor.py @@ -0,0 +1,202 @@ +import collections +import functools +import datetime + +import more_itertools +import fuzzywuzzy.process + +from .. import utils, extractors, dumper, languages + +FUZZY_MATCH_CUTOFF = 91 # between 0, 100 + +features_template = ''' +<%! + from itertools import groupby + def groupby_action(diff): + return groupby(diff, lambda d: d.action) +%> +<%def name="attribute_if_exists(name, text)" filter="trim"> + % if text is not None: + ${name}="${text | x}" + % endif + +<%def name="tag_user_if_exists(user)" filter="trim"> + % if user: + + % endif + + + % for page in pages: + + ${page.title | x} + ${page.id | x} + + % for revision in page.revisions: + + ${revision.id | x} + ${tag_user_if_exists(revision.user)} + ${revision.timestamp | x} + + % for section in revision.sections: +
${section.body | x}
+ % endfor +
+
+ % endfor +
+
+ % endfor +
+''' + +stats_template = ''' + + + ${stats['performance']['start_time'] | x} + ${stats['performance']['end_time'] | x} + ${stats['performance']['revisions_analyzed'] | x} + ${stats['performance']['pages_analyzed'] | x} + + + % for key in ['global', 'last_revision']: + <${key}> + % for section_name, count in stats['section_names'][key].most_common(): +
+ % endfor + + % endfor + + +''' + +Page = collections.namedtuple('Page', [ + 'id', + 'title', + 'revisions', +]) +Revision = collections.namedtuple('Revision', [ + 'id', + 'user', + 'timestamp', + 'sections', +]) + + +# TODO: instead of comparing section_name to a bib synonym, +# search all the possible bib synonyms in the section name +@functools.lru_cache(maxsize=500) +def is_secion_bibliography(section_name, language, score_cutoff=FUZZY_MATCH_CUTOFF): + bibliography_synonyms = languages.bibliography[language] + match = fuzzywuzzy.process.extractOne( + section_name, + bibliography_synonyms, + score_cutoff=score_cutoff, + ) + return bool(match) + + +def extract_revisions(mw_page, language, stats, only_last_revision): + section_names_stats = stats['section_names'] + revisions = more_itertools.peekable(mw_page) + for mw_revision in revisions: + utils.dot() + + is_last_revision = not utils.has_next(revisions) + if only_last_revision and not is_last_revision: + continue + + text = utils.remove_comments(mw_revision.text or '') + + sections = (section for section, _ in extractors.sections(text)) + + bibliography_sections = list(section + for section in sections + if is_secion_bibliography(section.name, language)) + + for section in bibliography_sections: + section_names_stats['global'][section.name] += 1 + if is_last_revision: + section_names_stats['last_revision'][section.name] += 1 + + yield Revision( + id=mw_revision.id, + user=mw_revision.user, + timestamp=mw_revision.timestamp.to_json(), + sections=bibliography_sections, + ) + + stats['performance']['revisions_analyzed'] += 1 + + +def extract_pages(dump, language, stats, only_last_revision): + for mw_page in dump: + utils.log("Processing", mw_page.title) + + # Skip non-articles + if mw_page.namespace != 0: + utils.log('Skipped (namespace != 0)') + continue + + revisions_generator = extract_revisions( + mw_page, + language=language, + stats=stats, + only_last_revision=only_last_revision, + ) + + yield Page( + id=mw_page.id, + title=mw_page.title, + revisions=revisions_generator, + ) + stats['performance']['pages_analyzed'] += 1 + + +def configure_subparsers(subparsers): + parser = subparsers.add_parser('extract-bibliography', + help='Extract only sections may be a bibliography') + parser.add_argument('-l', '--language', + choices=languages.supported, + required=True, + help='The language of the dump.', + ) + parser.add_argument('--only-last-revision', + action='store_true', + help='Consider only the last revision for each page.', + ) + parser.set_defaults(func=main) + + +def main(dump, features_output_h, stats_output_h, args): + stats = { + 'performance': { + 'start_time': None, + 'end_time': None, + 'revisions_analyzed': 0, + 'pages_analyzed': 0, + }, + 'section_names': { + 'global': collections.Counter(), + 'last_revision': collections.Counter(), + }, + } + pages_generator = extract_pages(dump, + language=args.language, + stats=stats, + only_last_revision=args.only_last_revision, + ) + with features_output_h: + stats['performance']['start_time'] = datetime.datetime.utcnow() + dumper.render_template( + features_template, + output_handler=features_output_h, + pages=pages_generator, + ) + stats['performance']['end_time'] = datetime.datetime.utcnow() + + with stats_output_h: + dumper.render_template( + stats_template, + stats_output_h, + stats=stats, + ) diff --git a/mwrefs/bibs/processors/identifiers_extractor.py b/mwrefs/bibs/processors/identifiers_extractor.py new file mode 100644 index 0000000..9ba5961 --- /dev/null +++ 
b/mwrefs/bibs/processors/identifiers_extractor.py @@ -0,0 +1,220 @@ +import collections +import datetime +import more_itertools + +from .. import utils, extractors, dumper + +features_template = ''' +<%! + from itertools import groupby + def groupby_action(diff): + return groupby(diff, lambda d: d.action) +%> +<%def name="attribute_if_exists(name, text)" filter="trim"> + % if text is not None: + ${name}="${text | x}" + % endif + +<%def name="tag_user_if_exists(user)" filter="trim"> + % if user: + + % endif + + + % for page in pages: + + ${page.title | x} + ${page.id | x} + + % for revision in page.revisions: + + ${revision.id | x} + ${tag_user_if_exists(revision.user)} + ${revision.timestamp | x} + + % for key, group in groupby_action(revision.publication_identifiers_diff): + + % for _, identifier in group: + + % endfor + + % endfor + + + %endfor + + + % endfor + +''' + +stats_template = ''' + + + ${stats['performance']['start_time']} + ${stats['performance']['end_time']} + ${stats['performance']['revisions_analyzed']} + ${stats['performance']['pages_analyzed']} + + + % for key in ['global', 'last_revision']: + <${key}> + % for where, count in stats['identifiers'][key].items(): + + % endfor + + % endfor + + +''' + +Page = collections.namedtuple('Page', [ + 'id', + 'title', + 'revisions', +]) +Revision = collections.namedtuple('Revision', [ + 'id', + 'user', + 'timestamp', + 'publication_identifiers_diff', +]) + +def IdentifierStatsDict(): + return { + 'only_in_raw_text': 0, + 'only_in_tag_ref': 0, + 'only_in_template': 0, + 'in_tag_ref_and_template': 0, + } + + +@utils.listify(wrapper=set) +def where_appears(span, **spans): + span_le = extractors.Span.__le__ + for key, span_list in spans.items(): + # if any(span <= other_span) for other_span in span_list): + # HACK: the following is more efficient. 
Sorry :( + if any(span_le(span, other_span) for other_span in span_list): + yield key + + +def identifier_appearance_stat_key(appearances): + if {'templates', 'references'} <= appearances: + return 'in_tag_ref_and_template' + elif 'templates' in appearances: + return 'only_in_template' + elif 'references' in appearances: + return 'only_in_tag_ref' + else: + return 'only_in_raw_text' + + +def extract_revisions(page, stats, only_last_revision): + revisions = more_itertools.peekable(page) + + prev_identifiers = set() + for mw_revision in revisions: + utils.dot() + + is_last_revision = not utils.has_next(revisions) + if only_last_revision and not is_last_revision: + continue + + text = utils.remove_comments(mw_revision.text or '') + + references_captures = list(extractors.references(text)) + + templates_captures = list(extractors.templates(text)) + + identifiers_captures = list(extractors.pub_identifiers(text)) + identifiers = [identifier for identifier, _ in identifiers_captures] + + for identifier, span in identifiers_captures: + appearances = where_appears(span, + references=(span for _, span in references_captures), + templates=(span for _, span in templates_captures), + ) + key_to_increment = identifier_appearance_stat_key(appearances) + + stats['identifiers']['global'][key_to_increment] += 1 + if is_last_revision: + stats['identifiers']['last_revision'][key_to_increment] += 1 + + yield Revision( + id=mw_revision.id, + user=mw_revision.user, + timestamp=mw_revision.timestamp.to_json(), + publication_identifiers_diff=utils.diff(prev_identifiers, + identifiers), + ) + + stats['performance']['revisions_analyzed'] += 1 + prev_identifiers = identifiers + + +def extract_pages(dump, stats, only_last_revision): + for mw_page in dump: + utils.log("Processing", mw_page.title) + + # Skip non-articles + if mw_page.namespace != 0: + utils.log('Skipped (namespace != 0)') + continue + + revisions_generator = extract_revisions( + mw_page, + stats=stats, + only_last_revision=only_last_revision, + ) + + yield Page( + id=mw_page.id, + title=mw_page.title, + revisions=revisions_generator, + ) + stats['performance']['pages_analyzed'] += 1 + + +def configure_subparsers(subparsers): + parser = subparsers.add_parser('extract-identifiers', + help='Extract the identifiers from the text (doi, isbn, arxiv and pubmed.') + parser.add_argument('--only-last-revision', + action='store_true', + help='Consider only the last revision for each page.', + ) + parser.set_defaults(func=main) + + +def main(dump, features_output_h, stats_output_h, args): + stats = { + 'performance': { + 'start_time': None, + 'end_time': None, + 'revisions_analyzed': 0, + 'pages_analyzed': 0, + }, + 'identifiers': { + 'global': IdentifierStatsDict(), + 'last_revision': IdentifierStatsDict(), + }, + } + pages_generator = extract_pages(dump, + stats=stats, + only_last_revision=args.only_last_revision, + ) + with features_output_h: + stats['performance']['start_time'] = datetime.datetime.utcnow() + dumper.render_template( + features_template, + output_handler=features_output_h, + pages=pages_generator, + ) + stats['performance']['end_time'] = datetime.datetime.utcnow() + + with stats_output_h: + dumper.render_template( + stats_template, + stats_output_h, + stats=stats, + ) diff --git a/mwrefs/bibs/processors/sections_counter.py b/mwrefs/bibs/processors/sections_counter.py new file mode 100644 index 0000000..1ae3141 --- /dev/null +++ b/mwrefs/bibs/processors/sections_counter.py @@ -0,0 +1,136 @@ +import collections +import datetime + +import 
more_itertools + +from .. import utils, extractors, dumper + + +stats_template = ''' + + + ${stats['performance']['start_time']} + ${stats['performance']['end_time']} + ${stats['performance']['revisions_analyzed']} + ${stats['performance']['pages_analyzed']} + + + % for key in ['global', 'last_revision']: + <${key}> + % for section_name, count in stats['section_names_per_revision'][key].most_common(): +
+ % endfor + + % endfor + + + % for key in ['global', 'last_revision']: + <${key}> + % for sections_in_revision, count in stats['sections_per_revision'][key].most_common(): + + % endfor + + % endfor + + + + + + +''' + + +def analyze_revisions(page, stats, only_last_revision): + revisions = more_itertools.peekable(page) + + section_names_stats = stats['section_names_per_revision'] + sections_stats = stats['sections_per_revision'] + + for mw_revision in revisions: + utils.dot() + + is_last_revision = not utils.has_next(revisions) + if only_last_revision and not is_last_revision: + continue + + text = utils.remove_comments(mw_revision.text or '') + + section_names = [section.name.strip().lower() + for section, _ in extractors.sections(text)] + sections_count = len(section_names) + + for section_name in section_names: + section_names_stats['global'][section_name] += 1 + if is_last_revision: + section_names_stats['last_revision'][section_name] += 1 + + sections_stats['global'][sections_count] += 1 + if is_last_revision: + sections_stats['last_revision'][sections_count] += 1 + + stats['revisions']['global'] += 1 + if is_last_revision: + stats['revisions']['last_revision'] += 1 + + stats['performance']['revisions_analyzed'] += 1 + + +def analyze_pages(dump, stats, only_last_revision): + for mw_page in dump: + utils.log("Processing", mw_page.title) + + # Skip non-articles + if mw_page.namespace != 0: + utils.log('Skipped (namespace != 0)') + continue + + analyze_revisions( + mw_page, + stats=stats, + only_last_revision=only_last_revision, + ) + + stats['performance']['pages_analyzed'] += 1 + + +def configure_subparsers(subparsers): + parser = subparsers.add_parser('count-sections', + help='Count the number of sections and the section names of the dump.') + parser.add_argument('--only-last-revision', + action='store_true', + help='Consider only the last revision for each page.', + ) + parser.set_defaults(func=main) + + +def main(dump, features_output_h, stats_output_h, args): + stats = { + 'sections_per_revision': { + 'global': collections.Counter(), + 'last_revision': collections.Counter(), + }, + 'section_names_per_revision': { + 'global': collections.Counter(), + 'last_revision': collections.Counter(), + }, + 'revisions': collections.Counter(), + 'performance': { + 'start_time': None, + 'end_time': None, + 'revisions_analyzed': 0, + 'pages_analyzed': 0, + } + } + stats['performance']['start_time'] = datetime.datetime.utcnow() + analyze_pages(dump, + stats=stats, + only_last_revision=args.only_last_revision, + ) + stats['performance']['end_time'] = datetime.datetime.utcnow() + + with stats_output_h: + dumper.render_template( + stats_template, + stats_output_h, + stats=stats, + ) diff --git a/mwrefs/bibs/utils.py b/mwrefs/bibs/utils.py new file mode 100644 index 0000000..3651180 --- /dev/null +++ b/mwrefs/bibs/utils.py @@ -0,0 +1,91 @@ +import functools +import collections +import sys +import regex as re + + +Diff = collections.namedtuple("Diff", "action data") + + +def diff(previous, current): + # previous = [ref.text for ref in previous] + # current = [ref.text for ref in current] + + added = set(current) - set(previous) + removed = set(previous) - set(current) + + diff = ( + [Diff('added', el) for el in added] + + [Diff('removed', el) for el in removed] + ) + + return diff + + +# https://github.com/shazow/unstdlib.py/blob/master/unstdlib/standard/list_.py#L149 +def listify(fn=None, wrapper=list): + """ + A decorator which wraps a function's return value in ``list(...)``. 
+ + Useful when an algorithm can be expressed more cleanly as a generator but + the function should return a list. + + Example:: + + >>> @listify + ... def get_lengths(iterable): + ... for i in iterable: + ... yield len(i) + >>> get_lengths(["spam", "eggs"]) + [4, 4] + >>> + >>> @listify(wrapper=tuple) + ... def get_lengths_tuple(iterable): + ... for i in iterable: + ... yield len(i) + >>> get_lengths_tuple(["foo", "bar"]) + (3, 3) + """ + def listify_return(fn): + @functools.wraps(fn) + def listify_helper(*args, **kw): + return wrapper(fn(*args, **kw)) + return listify_helper + if fn is None: + return listify_return + return listify_return(fn) + + +def iter_with_prev(iterable): + last = None + for el in iterable: + yield last, el + last = el + + +def dot(num=None): + if not num: + what = '.' + elif num < 10: + what = str(num) + else: + what = '>' + print(what, end='', file=sys.stderr, flush=True) + + +def log(*args): + first, *rest = args + print('\n' + str(first), *rest, end='', file=sys.stderr, flush=True) + + +def remove_comments(source): + pattern = re.compile(r'<!--.*?-->', re.MULTILINE | re.DOTALL) + return pattern.sub('', source) + + +def has_next(peekable): + try: + peekable.peek() + return True + except StopIteration: + return False diff --git a/mwrefs/ids/__init__.py b/mwrefs/ids/__init__.py new file mode 100644 index 0000000..e5d4c7d --- /dev/null +++ b/mwrefs/ids/__init__.py @@ -0,0 +1,3 @@ +from .identifier import Identifier + +__version__ = "0.2.0" diff --git a/mwrefs/ids/extractors/__init__.py b/mwrefs/ids/extractors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/ids/extractors/arxiv.py b/mwrefs/ids/extractors/arxiv.py new file mode 100644 index 0000000..c7c52c6 --- /dev/null +++ b/mwrefs/ids/extractors/arxiv.py @@ -0,0 +1,17 @@ +import re + +from ..identifier import Identifier + +# From http://arxiv.org/help/arxiv_identifier +old_id = r"-?(?P<old_id>([a-z]+(.[a-z]+)/)?[0-9]{4}[0-9]+)" +new_id = r"(?P<new_id>[0-9]{4}.[0-9]+)(v[0-9]+)?"
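# Illustrative usage note, not part of the original patch: the two patterns
# above rely on the named groups old_id / new_id; extract() below combines
# them with the URL and template prefixes that follow and yields lower-cased
# Identifier tuples, roughly:
#
#   >>> list(extract("see arxiv:0706.0001v1 or http://arxiv.org/abs/math.GT/0309001"))
#   [Identifier(type='arxiv', id='0706.0001'),
#    Identifier(type='arxiv', id='math.gt/0309001')]
#
# The optional version suffix ("v1") sits outside the named groups, so it is
# dropped from the extracted id.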
+ +prefixes=["arxiv\s*=\s*", "//arxiv\.org/(abs/)?", "arxiv:\s?"] + +ARXIV_RE = re.compile(r"({0})".format("|".join(prefixes)) + + r"({0}|{1})".format(old_id, new_id), re.I|re.U) + +def extract(text): + for match in ARXIV_RE.finditer(text): + id = match.group('new_id') or match.group("old_id") + yield Identifier("arxiv", id.lower()) diff --git a/mwrefs/ids/extractors/doi.py b/mwrefs/ids/extractors/doi.py new file mode 100644 index 0000000..c76bb3d --- /dev/null +++ b/mwrefs/ids/extractors/doi.py @@ -0,0 +1,150 @@ +import re +from collections import defaultdict + +from more_itertools import peekable + +from ..identifier import Identifier + +DOI_START_RE = re.compile(r'10\.[0-9]{4,}/') + +HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + 'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote', + 'pre'] + +TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + ')(\s[^>\n\r]+)?>', re.I) + +''' +DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)') + +def extract_regex(text): + for match in DOI_RE.finditer(text): + id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".") + yield Identifier("doi", id) + +import mwparserfromhell as mwp +def extract_mwp(text): + no_tags = mwp.parse(text).strip_code() + for match in DOI_RE.finditer(no_tags): + id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".") + yield Identifier("doi", id) +''' + +LEXICON = [ + (DOI_START_RE.pattern, 'doi_start'), + (r'\(', 'open_paren'), + (r'\)', 'close_paren'), + (r'\[', 'open_bracket'), + (r'\]', 'close_bracket'), + (r'', 'comment_end'), + (TAGS_RE.pattern, 'tag'), + (r'<', 'open_angle'), + (r'>', 'close_angle'), + (r'\{', 'open_curly'), + (r'\}', 'close_curly'), + (r'\|', 'pipe'), + (r'[,\.;!]', 'punct'), + (r'[\?#]', 'url_end'), + (r'[\n\r]+', 'break'), + (r'\s+', 'whitespace'), + (r'\w+', 'word'), + (r'.', 'etc') +] + +def extract_island(text): + tokens = tokenize_finditer(text, LEXICON) + tokens = peekable(tokens) + + while tokens.peek(None) is not None: + + if tokens.peek()[0] == 'doi_start': + yield ('doi', read_doi(tokens)) + + next(tokens) + + +def tokenize_finditer(text, lexicon=LEXICON): + pattern = '|'.join("(?P<{0}>{1})".format(name, pattern) + for pattern, name in lexicon) + + group_regex = re.compile(pattern, re.I|re.U|re.M) + + for match in group_regex.finditer(text): + yield match.lastgroup, match.group(0) + + +""" +def tokenize_scanner(text, lexicon=LEXICON): + scanner = re.Scanner(lexicon) + tokens, remainder = scanner.scan(text) + return tokens +""" + +#from mwcites.extractors.doi import tokenize_scan +#list(tokenize_scan("foo bar baz.{}")) + +def read_doi(tokens): + assert tokens.peek()[0] == 'doi_start' + + depth = defaultdict(lambda: 0) + + doi_buffer = [next(tokens)[1]] + + while tokens.peek(None) is not None: + name, match = tokens.peek() + + if name in ('url_end', 'break', 'whitespace', 'tag', 'pipe', + 'comment_start', 'comment_end'): + break + elif name == 'open_bracket': + depth['bracket'] += 1 + doi_buffer.append(next(tokens)[1]) + elif name == 'open_curly': + depth['curly'] += 1 + doi_buffer.append(next(tokens)[1]) + elif name == 'close_bracket': + if depth['bracket'] > 0: + depth['bracket'] -= 1 + doi_buffer.append(next(tokens)[1]) + else: + break + elif name == 'close_curly': + if depth['curly'] > 0: + depth['curly'] -= 1 + doi_buffer.append(next(tokens)[1]) + else: + break + else: + doi_buffer.append(next(tokens)[1]) + + + # Do not return a doi with punctuation at the end + return re.sub(r'[\.,!]+$', '', ''.join(doi_buffer)) + + + +def tokenize_search(text, 
start, lexicon=LEXICON): + pattern = '|'.join("(?P<{0}>{1})".format(name, pattern) + for pattern, name in lexicon) + + group_regex = re.compile(pattern, re.I|re.U) + + match = group_regex.search(text, start) + while match is not None: + yield match.lastgroup, match.group(0) + match = group_regex.search(text, match.span()[1]) + +def extract_search(text, lexicon=LEXICON): + + last_end = 0 + for match in DOI_START_RE.finditer(text): + if match.span()[0] > last_end: + tokens = tokenize_search(text, match.span()[0], lexicon=lexicon) + tokens = peekable(tokens) + doi = read_doi(tokens) + last_end = match.span()[0] + len(doi) + yield Identifier('doi', doi) + else: + last_end = max(match.span()[1], last_end) + +extract = extract_search # Setting the default to the best method diff --git a/mwrefs/ids/extractors/isbn.py b/mwrefs/ids/extractors/isbn.py new file mode 100644 index 0000000..12883f3 --- /dev/null +++ b/mwrefs/ids/extractors/isbn.py @@ -0,0 +1,8 @@ +import re +from ..identifier import Identifier + +ISBN_RE = re.compile('isbn\s?=?\s?([0-9\-Xx]+)', re.I) + +def extract(text): + for match in ISBN_RE.finditer(text): + yield Identifier('isbn', match.group(1).replace('-', '')) diff --git a/mwrefs/ids/extractors/pubmed.py b/mwrefs/ids/extractors/pubmed.py new file mode 100644 index 0000000..5fbaf67 --- /dev/null +++ b/mwrefs/ids/extractors/pubmed.py @@ -0,0 +1,22 @@ +import re + +from ..identifier import Identifier + +TEMPLATE_RE = re.compile(r"\b(pmid|pmc)\s*=\s*(pmc)?([0-9]+)\b", re.I) + +PMURL_RE = re.compile(r"//www\.ncbi\.nlm\.nih\.gov" + + r"/pubmed/([0-9]+)\b", re.I) +PMCURL_RE = re.compile(r"//www\.ncbi\.nlm\.nih\.gov" + + r"/pmc/articles/PMC([0-9]+)\b", re.I) + +def extract(text): + text = str(text or "") + + for match in TEMPLATE_RE.finditer(text): + yield Identifier(match.group(1).lower(), match.group(3)) + + for match in PMURL_RE.finditer(text): + yield Identifier("pmid", match.group(1)) + + for match in PMCURL_RE.finditer(text): + yield Identifier("pmc", match.group(1)) diff --git a/mwrefs/ids/extractors/tests/__init__.py b/mwrefs/ids/extractors/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/ids/extractors/tests/test_arxiv.py b/mwrefs/ids/extractors/tests/test_arxiv.py new file mode 100644 index 0000000..ee0e7a6 --- /dev/null +++ b/mwrefs/ids/extractors/tests/test_arxiv.py @@ -0,0 +1,43 @@ +import pprint + +from nose.tools import eq_ + +from .. import arxiv +from ...identifier import Identifier + +INPUT_TEXT = """ +This is a doi randomly placed in the text 10.0000/m1 +Here's a typo that might be construed as a doi 10.60 people were there. +{{cite|...|arxiv=0706.0001v1|pmid=10559875}} +Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). +The rise and decline of an open collaboration system: How Wikipedia’s +reaction to popularity is causing its decline. +American Behavioral Scientist, +0002764212469365 arxiv:0706.0002v1. 
Hats pants and banana +[http://arxiv.org/0706.0003] +[http://arxiv.org/abs/0706.0004v1] +[https://arxiv.org/abs/0706.0005v1] +[https://arxiv.org/abs/math.GT/0309001] +[https://arxiv.org/abs/-math.gs/0309002] +{{cite|...|arxiv=foobar.hats/0101003|issue=1656}} +http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= +10.2387/234310.2347/39423 + +""" +EXPECTED = [ + Identifier('arxiv', "0706.0001"), + Identifier('arxiv', "0706.0002"), + Identifier('arxiv', "0706.0003"), + Identifier('arxiv', "0706.0004"), + Identifier('arxiv', "0706.0005"), + Identifier('arxiv', "math.gt/0309001"), + Identifier('arxiv', "math.gs/0309002"), + Identifier('arxiv', "foobar.hats/0101003") +] + +def test_extract(): + ids = list(arxiv.extract(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) diff --git a/mwrefs/ids/extractors/tests/test_doi.py b/mwrefs/ids/extractors/tests/test_doi.py new file mode 100644 index 0000000..05b85fa --- /dev/null +++ b/mwrefs/ids/extractors/tests/test_doi.py @@ -0,0 +1,67 @@ +import pprint + +from nose.tools import eq_ + +from .. import doi +from ...identifier import Identifier + +INPUT_TEXT = """ +This is a doi randomly placed in the text 10.0000/m1 +Here's a typo that might be construed as a doi 10.60 people were there. +{{cite|...|doi=10.0000/m2|pmid=10559875}} +Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). +The rise and decline of an open collaboration system: How Wikipedia’s +reaction to popularity is causing its decline. +American Behavioral Scientist, +0002764212469365 doi: 10.1177/0002764212469365. Hats pants and banana +[http://dx.doi.org/10.1170/foo(herp)derp] +[http://dx.doi.org/10.1170/foo(herp)derp[waffles]] +{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}} +http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= +10.2387/234310.2347/39423 + +""" +EXPECTED = [ + Identifier('doi', "10.0000/m1"), + Identifier('doi', "10.0000/m2"), + Identifier('doi', "10.1177/0002764212469365"), + Identifier('doi', "10.1170/foo(herp)derp"), + Identifier('doi', "10.1170/foo(herp)derp[waffles]"), + Identifier('doi', "10.1098/rspb.2008.1131"), + Identifier('doi', "10.2387/234310.2347/39423"), + Identifier('doi', "10.2387/234310.2347/39423") +] + +""" +def test_extract_regex(): + ids = list(doi.extract_regex(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) + +def test_extract_mwp(): + ids = list(doi.extract_mwp(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) +""" + +def test_extract(): + ids = list(doi.extract(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) + +def test_extract_island(): + ids = list(doi.extract_island(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) + +def test_extract_search(): + ids = list(doi.extract_search(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + #pprint.pprint(list(doi.tokenize_finditer(INPUT_TEXT))) + eq_(ids, EXPECTED) diff --git a/mwrefs/ids/extractors/tests/test_isbn.py b/mwrefs/ids/extractors/tests/test_isbn.py new file mode 100644 index 0000000..cd41776 --- /dev/null +++ b/mwrefs/ids/extractors/tests/test_isbn.py @@ -0,0 +1,44 @@ +import pprint +from nose.tools import eq_ + +from .. 
import isbn +from ...identifier import Identifier + +INPUT_TEXT = """ + | publisher=Academic Press | isbn=0124366031 + | isbn=3540206310 + | accessdate=2008-02-05 | isbn=0-618-34342-3 + | isbn=978-0-140-27666-4 + | isbn = 0-13-054091-9 + | isbn=0195305736 }}</ref> schlug [[Irving Langmuir]] 1919 vor, dass das Elektronen in einem Atom verbunden oder verklumpt seien. Elektronengruppen beset + | ISBN=978-3-7046-5112-9 + * Peter L. Bergen: ''Heiliger Krieg, Inc.: Osama bin Ladens Terrornetz''. Siedler, Berlin 2001, ISBN 3-88680-752-5. + * Marwan Abou-Taam, Ruth Bigalke (Hgg) ''Die Reden des Osama bin Laden''. Diederichs, München 2006, ISBN 3-72052-773-5. (Reden und Ansprachen des b.L. im Original - ''Rezensionen: '' [http://www.sicherheit-heute.de/index.php?cccpage=readpolitik&set_z_artikel=221 ]und [http://www.fr-online.de/in_und_ausland/kultur_und_medien/buecher/?em_cnt=868715&sid=f55727] Frankf. Rundschau 26. April 2006) + * Michael Pekler, Andreas Ungerböck: ''Ang Lee und seine Filme''. Schüren, Marburg 2009, ISBN 978-3-89472-665-2. + <ref name="flos1">{{Literatur | Autor = René Flosdorff, Günther Hilgarth | Titel = Elektrische Energieverteilung | Verlag = Teubner | Auflage = 8. | Jahr = 2003 | Kapitel = Kapitel 1.2.2.4 | ISBN = 3-519-26424-2 }}</ref> + Bei einer [[Sprungtemperatur]] von 1,2&nbsp;K wird reines Aluminium [[Supraleiter|supraleitend]].<ref>{{Literatur | Autor = Ilschner | first = Bernhard | Titel = Werkstoffwissenschaften und Fertigungstechnik Eigenschaften, Vorgänge, Technologien | Verlag = Springer | Ort = Berlin | Jahr = 2010 | ISBN = 978-3-642-01734-6 | Seiten = 277}}</ref> + * {{Literatur | Autor=Michael J. Padilla, Ioannis Miaoulis, Martha Cyr | Jahr = 2002 | Titel = Prentice Hall Science Explorer: Chemical Building Blocks | Verlag = Prentice-Hall, Inc. | Ort = Upper Saddle River, New Jersey USA | ISBN = 0-13-054091-9 | |Originalsprache=en}} + """ + + +EXPECTED = [ + Identifier('isbn', '0124366031'), + Identifier('isbn', '3540206310'), + Identifier('isbn', '0618343423'), + Identifier('isbn', '9780140276664'), + Identifier('isbn', '0130540919'), + Identifier('isbn', '0195305736'), + Identifier('isbn', '9783704651129'), + Identifier('isbn', '3886807525'), + Identifier('isbn', '3720527735'), + Identifier('isbn', '9783894726652'), + Identifier('isbn', '3519264242'), + Identifier('isbn', '9783642017346'), + Identifier('isbn', '0130540919'), +] + +def test_extract(): + ids = list(isbn.extract(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) diff --git a/mwrefs/ids/extractors/tests/test_pubmed.py b/mwrefs/ids/extractors/tests/test_pubmed.py new file mode 100644 index 0000000..48f98d9 --- /dev/null +++ b/mwrefs/ids/extractors/tests/test_pubmed.py @@ -0,0 +1,27 @@ +from nose.tools import eq_ + +from .. import pubmed +from ...identifier import Identifier + +def test_extract(): + + text = """ + This is some text with a template cite. {{cite|...|...|pmid=1}}. + This is some text with a template cite. {{cite|...|...|pmid = 2|...}}. + This is some text with a template cite. {{cite|...|...|pmc = 3|...}}. + This is some text with a template cite. {{cite|...|...|pmc = pmc4|...}}. 
+ This is some text with a link [http://www.ncbi.nlm.nih.gov/pubmed/5 ID] + Another link [https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6 ID] + """ + ids = list(pubmed.extract(text)) + expected = [ + Identifier('pmid', "1"), + Identifier('pmid', "2"), + Identifier('pmc', "3"), + Identifier('pmc', "4"), + Identifier('pmid', "5"), + Identifier('pmc', "6") + ] + print(ids) + print(expected) + eq_(ids, expected) diff --git a/mwrefs/ids/fetchers/doi.py b/mwrefs/ids/fetchers/doi.py new file mode 100644 index 0000000..eb963d3 --- /dev/null +++ b/mwrefs/ids/fetchers/doi.py @@ -0,0 +1,48 @@ +import requests + +CITOID_HOST = 'https://citoid.wikimedia.org' + + +def lookup_via_citoid(doi): + url = CITOID_HOST + "/api" + params = { + 'format': "mediawiki", + 'search': doi + } + response = requests.get(url, params=params) + doc = response.json() + if 'Error' in doc: + raise RuntimeError(doc['Error']) + else: + return doc + + +def lookup_via_doidotorg(doi): + url = "http://doi.org" + data = { + "hdl": doi + } + response = requests.post( + url, data=data, headers={'Accept': "application/json"}) + if response.status_code == 404: + raise RuntimeError("DOI not found") + elif response.status_code == 200: + return response.json() + else: + raise RuntimeError("Unknown error") + +METHODS = { + 'doi.org': lookup_via_doidotorg, + 'citoid.wikimedia.org': lookup_via_citoid +} + + +def lookup(doi, methods=['doi.org']): + for i, method in enumerate(methods): + try: + return METHODS[method](doi) + except RuntimeError as e: + if i+1 == len(methods): + raise e + else: + continue diff --git a/mwrefs/ids/identifier.py b/mwrefs/ids/identifier.py new file mode 100644 index 0000000..44f9b03 --- /dev/null +++ b/mwrefs/ids/identifier.py @@ -0,0 +1,3 @@ +from collections import namedtuple + +Identifier = namedtuple("Identifier", ['type', 'id']) diff --git a/mwrefs/ids/utilities/__init__.py b/mwrefs/ids/utilities/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/ids/utilities/extract.py b/mwrefs/ids/utilities/extract.py new file mode 100644 index 0000000..1dfa0ca --- /dev/null +++ b/mwrefs/ids/utilities/extract.py @@ -0,0 +1,160 @@ +""" +Extracts academic citations from articles from the history of Wikipedia +articles by processing a pages-meta-history XML dump and matching regular +expressions to revision content. + +Currently supported identifies include: + + * PubMed + * DOI + * ISBN + * arXiv + +Outputs a TSV file with the following fields: + + * page_id: The identifier of the Wikipedia article (int), e.g. 1325125 + * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell + * rev_id: The Wikipedia revision where the citation was first added (int), + e.g. 282470030 + * timestamp: The timestamp of the revision where the citation was first added. + (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z + * type: The type of identifier, e.g. pmid, pmcid, doi, arxiv or isbn + * id: The id of the cited scholarly article (utf-8), + e.g 10.1183/09031936.00213411 + +Usage: + extract -h | --help + extract ... [--extractor=...] + +Options: + -h --help Shows this documentation + The path to a set of dump files to process. If no + files are specified, will be read. 
+ --extractor= The class path to set of extractors to apply + [default: ] +""" +import sys +from itertools import chain + +import docopt +import mwxml + +import mysqltsv + +from ..extractors import arxiv, doi, isbn, pubmed + +ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv] + +HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id") + +def main(argv=None): + args = docopt.docopt(__doc__, argv=argv) + dump_files = args[''] + + if args['--extractor'] == ['']: + extractors = ALL_EXTRACTORS + else: + extractors = [import_from_path(path.lower) + for path in args['--extractor']] + + run(dump_files, extractors) + +def run(dump_files, extractors): + writer = mysqltsv.Writer(sts.stdout, headers=HEADERS) + + cites = extract(dump_files, extractors=extractors) + for page_id, title, rev_id, timestamp, type, id in cites: + writer.write(page_id, title, rev_id, timestamp.long_format(), type, id) + +def extract(dump_files, extractors=ALL_EXTRACTORS): + """ + Extracts cites from a set of `dump_files`. + + :Parameters: + dump_files : str | `file` + A set of files MediaWiki XML dump files + (expects: pages-meta-history) + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted cites + + """ + # Dump processor function + def process_dump(dump, path): + for page in dump: + if page.namespace != 0: continue + else: + for cite in extract_cite_history(page, extractors): + yield cite + + # Map call + return mwxml.map(process_dump, dump_files) + +def extract_cite_history(page, extractors): + """ + Extracts cites from the history of a `page` (`mwxml.Page`). + + :Parameters: + page : `iterable`(`mwxml.Revision`) + The page to extract cites from + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted cites + + """ + appearances = {} # For tracking the first appearance of an ID + ids = set() # For holding onto the ids in the last revision. + for revision in page: + ids = set(extract_ids(revision.text, extractors)) + + # For each ID, check to see if we have seen it before + for id in ids: + if id not in appearances: + appearances[id] = (revision.id, revision.timestamp) + + for id in ids: #For the ids in the last version of the page + rev_id, timestamp = appearances[id] + yield (page.id, page.title, rev_id, timestamp, id.type, id.id) + +def extract_ids(text, extractors): + """ + Uses `extractors` to extract citation identifiers from a text. + + :Parameters: + text : str + The text to process + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted identifiers + """ + for extractor in extractors: + for id in extractor.extract(text): + yield id + +def import_from_path(path): + """ + Imports a specific attribute from a module based on a class path. + + :Parameters: + path : str + A dot delimited string representing the import path of the desired + object. 
+ + :Returns: + object -- An imported object + """ + parts = path.split(".") + module_path = ".".join(parts[:-1]) + attribute_name = parts[-1] + + module = import_module(module_path) + + attribute = getattr(module, attribute_name) + + return attribute diff --git a/mwrefs/ids/utilities/tests/__init__.py b/mwrefs/ids/utilities/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/ids/utilities/tests/test_extract.py b/mwrefs/ids/utilities/tests/test_extract.py new file mode 100644 index 0000000..e4353ca --- /dev/null +++ b/mwrefs/ids/utilities/tests/test_extract.py @@ -0,0 +1,41 @@ +from collections import namedtuple + +from mw import Timestamp +from nose.tools import eq_ + +from ..extract import extract_cite_history +from ...identifier import Identifier + + +def test_extract_cite_history(): + FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text']) + + FakeExtractor = namedtuple("Extractor", ['extract']) + + class FakePage: + def __init__(self, id, title): + self.id = id + self.title = title + def __iter__(self): + return iter([ + FakeRevision(1, Timestamp(1), "id1 id2"), + FakeRevision(2, Timestamp(2), "id1 id3"), + FakeRevision(3, Timestamp(3), "id1 id2 id3"), + FakeRevision(4, Timestamp(4), "id1 id2 id4"), + FakeRevision(5, Timestamp(5), "id1 id2 id4"), + ]) + + fake_page = FakePage(1, "Title") + + def extract(text): + return (Identifier('fake', id) for id in text.split(" ")) + extractor = FakeExtractor(extract) + + expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"), + (1, "Title", 1, Timestamp(1), "fake", "id2"), + (1, "Title", 4, Timestamp(4), "fake", "id4")] + + citations = list(extract_cite_history(fake_page, [extractor])) + eq_(len(citations), len(expected)) + for cite in extract_cite_history(fake_page, [extractor]): + assert cite in expected diff --git a/mwrefs/extract.py b/mwrefs/refs/extract.py similarity index 100% rename from mwrefs/extract.py rename to mwrefs/refs/extract.py diff --git a/mwrefs/tests/test_extract.py b/mwrefs/tests/test_extract.py index ca2a660..7dd8600 100644 --- a/mwrefs/tests/test_extract.py +++ b/mwrefs/tests/test_extract.py @@ -41,15 +41,15 @@ def test_extract(): eq_(refs, ['{{cite web\n |url=http://topics.info.com/Who-coined-the-' + - 'term-biology_716 |title=Who coined\n the term biology? |work=' + - 'Info.com|accessdate=2012-06-03}}', + 'term-biology_716 |title=Who coined\n the term biology? ' + + '|work=Info.com|accessdate=2012-06-03}}', '{{cite web|title=biology\n |url=http://' + - 'www.etymonline.com/index.php?term=biology&allowed_in_frame=0\n ' + - ' |publisher=[[Online Etymology Dictionary]]}}', + 'www.etymonline.com/index.php?term=biology&allowed_in_frame=0\n ' + + ' |publisher=[[Online Etymology Dictionary]]}}', '', '', '\n {{cite book|last=Richards|first=Robert J.' 
+ - '|title=The Romantic Conception of\n Life: Science and ' + - 'Philosophy in the Age of Goethe|year=2002\n |publisher=' + - 'University of Chicago Press|isbn=0-226-71210-9\n ' + - '|url=http://books.google.cocover#v=onepage&q&f=false}}', + '|title=The Romantic Conception of\n Life: Science and ' + + 'Philosophy in the Age of Goethe|year=2002\n |publisher=' + + 'University of Chicago Press|isbn=0-226-71210-9\n ' + + '|url=http://books.google.cocover#v=onepage&q&f=false}}', 'foobar']) diff --git a/mwrefs/utilities/diffs.py b/mwrefs/utilities/diff_ref_tags.py similarity index 100% rename from mwrefs/utilities/diffs.py rename to mwrefs/utilities/diff_ref_tags.py diff --git a/mwrefs/utilities/extract_ids.py b/mwrefs/utilities/extract_ids.py new file mode 100644 index 0000000..8b58b24 --- /dev/null +++ b/mwrefs/utilities/extract_ids.py @@ -0,0 +1,166 @@ +""" +Extracts academic citations from articles from the history of Wikipedia +articles by processing a pages-meta-history XML dump and matching regular +expressions to revision content. + +Currently supported identifies include: + + * PubMed + * DOI + * ISBN + * arXiv + +Outputs a TSV file with the following fields: + + * page_id: The identifier of the Wikipedia article (int), e.g. 1325125 + * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell + * rev_id: The Wikipedia revision where the citation was first added (int), + e.g. 282470030 + * timestamp: The timestamp of the revision where the citation was first added. + (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z + * type: The type of identifier, e.g. pmid, pmcid, doi, arxiv or isbn + * id: The id of the cited scholarly article (utf-8), + e.g 10.1183/09031936.00213411 + +Usage: + extract -h | --help + extract ... [--extractor=...] + +Options: + -h --help Shows this documentation + The path to a set of dump files to process. If no + files are specified, will be read. + --extractor= The class path to set of extractors to apply + [default: ] +""" +import sys +from importlib import import_module +from itertools import chain + +import docopt +import mwxml + +import mysqltsv + +from ..extractors import arxiv, doi, isbn, pubmed + +ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv] + +HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id") + + +def main(argv=None): + args = docopt.docopt(__doc__, argv=argv) + dump_files = args[''] + + if args['--extractor'] == ['']: + extractors = ALL_EXTRACTORS + else: + extractors = [import_from_path(path.lower) + for path in args['--extractor']] + + run(dump_files, extractors) + + +def run(dump_files, extractors): + writer = mysqltsv.Writer(sys.stdout, headers=HEADERS) + + cites = extract(dump_files, extractors=extractors) + for page_id, title, rev_id, timestamp, type, id in cites: + writer.write(page_id, title, rev_id, timestamp.long_format(), type, id) + + +def extract(dump_files, extractors=ALL_EXTRACTORS): + """ + Extracts cites from a set of `dump_files`. 
+ + :Parameters: + dump_files : str | `file` + A set of files MediaWiki XML dump files + (expects: pages-meta-history) + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted cites + + """ + # Dump processor function + def process_dump(dump, path): + for page in dump: + if page.namespace != 0: continue + else: + for cite in extract_cite_history(page, extractors): + yield cite + + # Map call + return mwxml.map(process_dump, dump_files) + + +def extract_cite_history(page, extractors): + """ + Extracts cites from the history of a `page` (`mwxml.Page`). + + :Parameters: + page : `iterable`(`mwxml.Revision`) + The page to extract cites from + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted cites + + """ + appearances = {} # For tracking the first appearance of an ID + ids = set() # For holding onto the ids in the last revision. + for revision in page: + ids = set(extract_ids(revision.text, extractors)) + + # For each ID, check to see if we have seen it before + for id in ids: + if id not in appearances: + appearances[id] = (revision.id, revision.timestamp) + + for id in ids: #For the ids in the last version of the page + rev_id, timestamp = appearances[id] + yield (page.id, page.title, rev_id, timestamp, id.type, id.id) + +def extract_ids(text, extractors): + """ + Uses `extractors` to extract citation identifiers from a text. + + :Parameters: + text : str + The text to process + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted identifiers + """ + for extractor in extractors: + for id in extractor.extract(text): + yield id + + +def import_from_path(path): + """ + Imports a specific attribute from a module based on a class path. + + :Parameters: + path : str + A dot delimited string representing the import path of the desired + object. 
+ + :Returns: + object -- An imported object + """ + parts = path.split(".") + module_path = ".".join(parts[:-1]) + attribute_name = parts[-1] + + module = import_module(module_path) + + attribute = getattr(module, attribute_name) + + return attribute diff --git a/mwrefs/utilities/extract.py b/mwrefs/utilities/extract_ref_tags.py similarity index 100% rename from mwrefs/utilities/extract.py rename to mwrefs/utilities/extract_ref_tags.py diff --git a/mwrefs/utilities/fetch_metadata.py b/mwrefs/utilities/fetch_metadata.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/utilities/tests/__init__.py b/mwrefs/utilities/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/utilities/tests/test_extract_ids.py b/mwrefs/utilities/tests/test_extract_ids.py new file mode 100644 index 0000000..e4353ca --- /dev/null +++ b/mwrefs/utilities/tests/test_extract_ids.py @@ -0,0 +1,41 @@ +from collections import namedtuple + +from mw import Timestamp +from nose.tools import eq_ + +from ..extract import extract_cite_history +from ...identifier import Identifier + + +def test_extract_cite_history(): + FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text']) + + FakeExtractor = namedtuple("Extractor", ['extract']) + + class FakePage: + def __init__(self, id, title): + self.id = id + self.title = title + def __iter__(self): + return iter([ + FakeRevision(1, Timestamp(1), "id1 id2"), + FakeRevision(2, Timestamp(2), "id1 id3"), + FakeRevision(3, Timestamp(3), "id1 id2 id3"), + FakeRevision(4, Timestamp(4), "id1 id2 id4"), + FakeRevision(5, Timestamp(5), "id1 id2 id4"), + ]) + + fake_page = FakePage(1, "Title") + + def extract(text): + return (Identifier('fake', id) for id in text.split(" ")) + extractor = FakeExtractor(extract) + + expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"), + (1, "Title", 1, Timestamp(1), "fake", "id2"), + (1, "Title", 4, Timestamp(4), "fake", "id4")] + + citations = list(extract_cite_history(fake_page, [extractor])) + eq_(len(citations), len(expected)) + for cite in extract_cite_history(fake_page, [extractor]): + assert cite in expected
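The key piece of logic in extract_cite_history() above (both here and in mwrefs/ids/utilities/extract.py) is the first-appearance bookkeeping: every identifier is stamped with the revision that first introduced it, but only identifiers still present in the last revision are reported, which is exactly what test_extract_cite_history asserts. A self-contained sketch of that behaviour, using hypothetical revision and identifier names:

```python
def first_appearances(revisions):
    """revisions: iterable of (rev_id, set_of_ids) pairs, oldest first."""
    appearances = {}   # id -> rev_id of the revision that first added it
    ids = set()        # ids seen in the most recent revision
    for rev_id, ids in revisions:
        for id_ in ids:
            # setdefault keeps the earliest revision, even if the id was
            # removed and later re-added.
            appearances.setdefault(id_, rev_id)
    # Report only the ids that survived into the last revision.
    return {id_: appearances[id_] for id_ in sorted(ids)}


history = [
    (1, {'id1', 'id2'}),
    (2, {'id1', 'id3'}),          # id2 temporarily removed
    (3, {'id1', 'id2', 'id3'}),   # id2 re-added, keeps revision 1
    (4, {'id1', 'id2', 'id4'}),   # id3 removed for good
]
print(first_appearances(history))  # {'id1': 1, 'id2': 1, 'id4': 4}
```

The real implementation records (revision.id, revision.timestamp) pairs instead of bare revision numbers and emits one TSV row per surviving identifier.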