diff --git a/mwrefs/bibs/__main__.py b/mwrefs/bibs/__main__.py new file mode 100644 index 0000000..b9b65dd --- /dev/null +++ b/mwrefs/bibs/__main__.py @@ -0,0 +1,116 @@ +import argparse +import subprocess +import codecs +import os + +import mw.xml_dump +import mwxml +import pathlib + +from . import utils, processors + + +def open_xml_file(path): + f = mw.xml_dump.functions.open_file( + mw.xml_dump.functions.file(path) + ) + return f + + +def compressor_7z(file_path): + p = subprocess.Popen( + ['7z', 'a', '-si', file_path], + stdin=subprocess.PIPE, + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + ) + utf8writer = codecs.getwriter('utf-8') + + return utf8writer(p.stdin) + + +def output_writer(path, compression): + if compression == '7z': + return compressor_7z(path + '.7z') + else: + return open(path, 'wt', encoding='utf-8') + + +def create_path(path): + path = pathlib.Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + + +def get_args(): + parser = argparse.ArgumentParser( + prog='wikidump', + description='Wikidump features extractor.', + ) + parser.add_argument('files', + metavar='FILE', + type=pathlib.Path, + nargs='+', + help='XML Wikidump file to parse. It accepts only 7z.' + ) + parser.add_argument('output_dir_path', + metavar='OUTPUT_DIR', + type=pathlib.Path, + help='XML output directory.', + ) + parser.add_argument('--output-compression', + choices={None, '7z'}, + required=False, + default=None, + help='Output compression format.', + ) + parser.add_argument('--dry-run', '-n', + action='store_true', + help="Don't write any file", + ) + + subparsers = parser.add_subparsers(help='sub-commands help') + processors.bibliography_extractor.configure_subparsers(subparsers) + processors.identifiers_extractor.configure_subparsers(subparsers) + processors.sections_counter.configure_subparsers(subparsers) + + parsed_args = parser.parse_args() + if 'func' not in parsed_args: + parser.print_usage() + parser.exit(1) + + return parsed_args + + +def main(): + args = get_args() + + args.output_dir_path.mkdir(parents=True, exist_ok=True) + + for input_file_path in args.files: + utils.log("Analyzing {}...".format(input_file_path)) + + dump = mwxml.Dump.from_file(open_xml_file(str(input_file_path))) + + basename = input_file_path.name + + if args.dry_run: + pages_output = open(os.devnull, 'wt') + stats_output = open(os.devnull, 'wt') + else: + pages_output = output_writer( + path=str(args.output_dir_path/(basename + '.features.xml')), + compression=args.output_compression, + ) + stats_output = output_writer( + path=str(args.output_dir_path/(basename + '.stats.xml')), + compression=args.output_compression, + ) + args.func(dump, + pages_output, + stats_output, + args, + ) + + +if __name__ == '__main__': + main() diff --git a/mwrefs/bibs/dumper.py b/mwrefs/bibs/dumper.py new file mode 100644 index 0000000..7e78a8b --- /dev/null +++ b/mwrefs/bibs/dumper.py @@ -0,0 +1,99 @@ +import mako.runtime +import mako.template + +pages_revisions_template = ''' +<%! 
+ from itertools import groupby + def groupby_action(diff): + return groupby(diff, lambda d: d.action) +%> + + % for page in pages: + + ${page.title} + ${page.id} + + % for revision in page.revisions: + + ${revision.id} + + ${revision.timestamp} + + % for key, group in groupby_action(revision.references_diff): + + % for _, text in group: + ${text} + % endfor + + % endfor + + + % for key, group in groupby_action(revision.publication_identifiers_diff): + + % for _, identifier in group: + + % endfor + + % endfor + + + % for section in revision.sections: +
${section.name}
+ % endfor +
+ ${revision.bibliography} +
+ %endfor +
+
+ % endfor +
+''' + +stats_template = ''' + + + ${stats['performance']['start_time']} + ${stats['performance']['end_time']} + ${stats['performance']['revisions_analyzed']} + ${stats['performance']['pages_analyzed']} + + + % for key in ['global', 'last_revision']: + <${key}> + % for where, count in stats['identifiers'][key].items(): + + % endfor + + % endfor + + +''' + + +def render_template(template, output_handler, default_filters=None, **kwargs): + ctx = mako.runtime.Context(output_handler, **kwargs) + + xml_template = mako.template.Template( + template, + default_filters=default_filters, + ) + xml_template.render_context(ctx) + + +def serialize_page_revisions(pages, output_handler): + render_template( + pages_revisions_template, + output_handler, + default_filters=['x'], # XML escaping + pages=pages, + ) + + +def serialize_stats(stats, output_handler): + render_template( + stats_template, + output_handler, + default_filters=['x'], # XML escaping + stats=stats, + ) diff --git a/mwrefs/bibs/languages.py b/mwrefs/bibs/languages.py new file mode 100644 index 0000000..655e06f --- /dev/null +++ b/mwrefs/bibs/languages.py @@ -0,0 +1,82 @@ +supported = {'en', 'it'} + +bibliography = { + 'en': { + 'bibliography', + 'references', + 'reference', + 'further reading', + 'notes', + 'sources', + 'footnotes', + 'citations', + 'publications', + 'publication history', + 'literature', + }, + 'it': {'bibliografia'}, +} + +citation = { + 'en': {'Citation', 'cite', 'vcite'}, +} + +""" +What I mean by: +* References: a section containing footnotes for works cited in the text. +* Bibliography: a section containing articles and journals. +* Further reading: like `Bibliography`, but contains references not used in the text. +* Footnotes: a section containing explanations of concepts. + +From now on, words in backquotes (`) are to be interpreted as concepts using the above definitions, while words in double quotes (") are to be interpreted as terms found in the text of the articles. + +"References" (term) is commonly used as `Bibliography` (concept), i.e. articles and journals without back-references to the text. +And, of course, "Bibliography" (term) is sometimes used as `References` (concept).
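The synonym sets above are what the bibliography extractor fuzzily matches section titles against (see `is_secion_bibliography` in processors/bibliography_extractor.py, which uses a score cutoff of 91). A minimal sketch of that check; the helper name below is illustrative and not part of this patch:

```python
import fuzzywuzzy.process

# Canonical bibliography-like section titles (mirrors languages.bibliography['en']).
BIBLIOGRAPHY_SYNONYMS = {
    'bibliography', 'references', 'reference', 'further reading', 'notes',
    'sources', 'footnotes', 'citations', 'publications', 'literature',
}

def looks_like_bibliography(section_name, score_cutoff=91):
    # extractOne returns the best (synonym, score) pair, or None when nothing
    # reaches score_cutoff; its default processor lower-cases the title and
    # strips punctuation, so case variants and small misspellings still match.
    match = fuzzywuzzy.process.extractOne(
        section_name,
        BIBLIOGRAPHY_SYNONYMS,
        score_cutoff=score_cutoff,
    )
    return match is not None

print(looks_like_bibliography('References'))  # True
print(looks_like_bibliography('Early life'))  # False
```

Fuzzy matching with a cutoff, rather than exact string equality, is what lets the extractor cope with the noisy section naming documented in the examples below: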
+* https://en.wikipedia.org/w/index.php?title=Anabaptists&oldid=49953891 "References" interpreted as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=Alcopop&oldid=296736852 "References" interpreted as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=Amu%20Darya&oldid=66374611 "References" interpreted as `Bibliography` + +"Citations" (term) is sometimes used as a synonym for "References" or "Bibliography" (terms): +* https://en.wikipedia.org/w/index.php?title=Augustine_of_Canterbury&oldid=676642624 "Citations" used as `References`, "References" used as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=Anemometer&oldid=674186492#Citations "Citations" used as `References` + +"Notes and References" and "References and Notes" (terms) are used as synonyms for "References" (term): +* https://en.wikipedia.org/w/index.php?title=Ackermann%20function&oldid=335603599#Notes_and_references "Notes and References" converted to "References" (term) and interpreted as `References` +* https://en.wikipedia.org/w/index.php?title=albanians&oldid=391045161#Notes_and_references "Notes and References" is a wrapper around "Notes" (interpreted as `footnotes`) and "References" (interpreted as `References`) +* https://en.wikipedia.org/w/index.php?title=assassination&oldid=678057527#Notes_and_references interpreted as `References` + +"Sources" seems to be interpreted as `Bibliography` or `References`, and sometimes then converted by users to "References" or "Bibliography": +* https://en.wikipedia.org/w/index.php?title=artemis&diff=next&oldid=565871969 "Sources" has been converted to "References and sources" +* https://en.wikipedia.org/w/index.php?title=Amakusa&direction=next&oldid=667294099 "Sources" used as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=A%20Doll's%20House&oldid=676505492#Sources "Sources" used as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=A.%20E.%20Housman&diff=next&oldid=678259900#Sources "Sources" used as `Bibliography` + +"Footnotes" is commonly interpreted as `References`, with the following terms: "References" and "Citations" +* https://en.wikipedia.org/w/index.php?title=Augustine%20of%20Canterbury&oldid=459457206#Footnotes "Footnotes" is used as `References`; "Footnotes" is then converted to "Citations", used as `References` +* https://en.wikipedia.org/w/index.php?title=Amoxicillin&diff=next&oldid=423375138 "Footnotes" used as and converted to `References` +* https://en.wikipedia.org/w/index.php?title=Anabaptists&oldid=49953891#Footnotes_and_references "Footnotes" interpreted as `References`. The next revision converts "Footnotes" to "Footnotes and References". +* https://en.wikipedia.org/w/index.php?title=Alcopop&oldid=296736852#Footnotes "Footnotes" used as `References` +* https://en.wikipedia.org/w/index.php?title=Archaeopteryx&diff=next&oldid=326796096 "Footnotes" interpreted as and then converted to `References` (term and concept) +* https://en.wikipedia.org/w/index.php?title=Al%20Capp&oldid=590148186#Footnotes "Footnotes" interpreted as `References`. It is then converted to "Notes" +* https://en.wikipedia.org/w/index.php?title=Amu%20Darya&oldid=66374611#Footnotes "Footnotes" interpreted as `References`.
Later converted to "Notes" +* https://en.wikipedia.org/w/index.php?title=Albert%20Brooks&oldid=150996845#Footnotes "Footnotes" used as and then converted to `References` (term and concept) + +"Literature" is used most of the times as a subsection for things like "Culture", and in some cases is a replacement for "bibliography": +* https://en.wikipedia.org/w/index.php?title=Alexandria&oldid=678355005 "Literature" used as subsection of "Culture" +* https://en.wikipedia.org/w/index.php?title=Bible&oldid=23508742#Literature "Literature" used as `Bibliography` +* https://en.wikipedia.org/w/index.php?title=Board_game&oldid=7131437#Literature "Literature" used as "Bibliography", then converted to "References" (used as "Bibliography") +* https://en.wikipedia.org/w/index.php?title=Ahuitzotl&oldid=118183827 "Literature" interpreted as `Bibliography` + +"Publications" and "Publication history" are used as a subsection for the "Biography" with the works of the person described. + +"Reference" is almost always converted to "References" in a successive revision. + + +"Notes" is sometimes interpreted as `References` or `Footnotes` +* https://en.wikipedia.org/w/index.php?title=Ahuitzotl&oldid=118183827 "Notes" used as `Footnotes` +* https://en.wikipedia.org/w/index.php?title=Archaeoastronomy&oldid=678777218#Notes "Notes" used as `References` +* https://en.wikipedia.org/w/index.php?title=Alexander_of_Hales&oldid=661215939#Other_historical_works "Notes" interpreted as `References` + +"See also" and "Related pages" usually contain links to other wikipedia pages. +""" + diff --git a/mwrefs/bibs/processors/__init__.py b/mwrefs/bibs/processors/__init__.py new file mode 100644 index 0000000..d613340 --- /dev/null +++ b/mwrefs/bibs/processors/__init__.py @@ -0,0 +1 @@ +from . import bibliography_extractor, identifiers_extractor, sections_counter diff --git a/mwrefs/bibs/processors/bibliography_extractor.py b/mwrefs/bibs/processors/bibliography_extractor.py new file mode 100644 index 0000000..8285a65 --- /dev/null +++ b/mwrefs/bibs/processors/bibliography_extractor.py @@ -0,0 +1,202 @@ +import collections +import functools +import datetime + +import more_itertools +import fuzzywuzzy.process + +from .. import utils, extractors, dumper, languages + +FUZZY_MATCH_CUTOFF = 91 # between 0, 100 + +features_template = ''' +<%! + from itertools import groupby + def groupby_action(diff): + return groupby(diff, lambda d: d.action) +%> +<%def name="attribute_if_exists(name, text)" filter="trim"> + % if text is not None: + ${name}="${text | x}" + % endif + +<%def name="tag_user_if_exists(user)" filter="trim"> + % if user: + + % endif + + + % for page in pages: + + ${page.title | x} + ${page.id | x} + + % for revision in page.revisions: + + ${revision.id | x} + ${tag_user_if_exists(revision.user)} + ${revision.timestamp | x} + + % for section in revision.sections: +
${section.body | x}
+ % endfor +
+
+ % endfor +
+
+ % endfor +
+''' + +stats_template = ''' + + + ${stats['performance']['start_time'] | x} + ${stats['performance']['end_time'] | x} + ${stats['performance']['revisions_analyzed'] | x} + ${stats['performance']['pages_analyzed'] | x} + + + % for key in ['global', 'last_revision']: + <${key}> + % for section_name, count in stats['section_names'][key].most_common(): +
+ % endfor + + % endfor + + +''' + +Page = collections.namedtuple('Page', [ + 'id', + 'title', + 'revisions', +]) +Revision = collections.namedtuple('Revision', [ + 'id', + 'user', + 'timestamp', + 'sections', +]) + + +# TODO: instead of comparing section_name to a bib synonym, +# search all the possible bib synonyms in the section name +@functools.lru_cache(maxsize=500) +def is_secion_bibliography(section_name, language, score_cutoff=FUZZY_MATCH_CUTOFF): + bibliography_synonyms = languages.bibliography[language] + match = fuzzywuzzy.process.extractOne( + section_name, + bibliography_synonyms, + score_cutoff=score_cutoff, + ) + return bool(match) + + +def extract_revisions(mw_page, language, stats, only_last_revision): + section_names_stats = stats['section_names'] + revisions = more_itertools.peekable(mw_page) + for mw_revision in revisions: + utils.dot() + + is_last_revision = not utils.has_next(revisions) + if only_last_revision and not is_last_revision: + continue + + text = utils.remove_comments(mw_revision.text or '') + + sections = (section for section, _ in extractors.sections(text)) + + bibliography_sections = list(section + for section in sections + if is_secion_bibliography(section.name, language)) + + for section in bibliography_sections: + section_names_stats['global'][section.name] += 1 + if is_last_revision: + section_names_stats['last_revision'][section.name] += 1 + + yield Revision( + id=mw_revision.id, + user=mw_revision.user, + timestamp=mw_revision.timestamp.to_json(), + sections=bibliography_sections, + ) + + stats['performance']['revisions_analyzed'] += 1 + + +def extract_pages(dump, language, stats, only_last_revision): + for mw_page in dump: + utils.log("Processing", mw_page.title) + + # Skip non-articles + if mw_page.namespace != 0: + utils.log('Skipped (namespace != 0)') + continue + + revisions_generator = extract_revisions( + mw_page, + language=language, + stats=stats, + only_last_revision=only_last_revision, + ) + + yield Page( + id=mw_page.id, + title=mw_page.title, + revisions=revisions_generator, + ) + stats['performance']['pages_analyzed'] += 1 + + +def configure_subparsers(subparsers): + parser = subparsers.add_parser('extract-bibliography', + help='Extract only sections may be a bibliography') + parser.add_argument('-l', '--language', + choices=languages.supported, + required=True, + help='The language of the dump.', + ) + parser.add_argument('--only-last-revision', + action='store_true', + help='Consider only the last revision for each page.', + ) + parser.set_defaults(func=main) + + +def main(dump, features_output_h, stats_output_h, args): + stats = { + 'performance': { + 'start_time': None, + 'end_time': None, + 'revisions_analyzed': 0, + 'pages_analyzed': 0, + }, + 'section_names': { + 'global': collections.Counter(), + 'last_revision': collections.Counter(), + }, + } + pages_generator = extract_pages(dump, + language=args.language, + stats=stats, + only_last_revision=args.only_last_revision, + ) + with features_output_h: + stats['performance']['start_time'] = datetime.datetime.utcnow() + dumper.render_template( + features_template, + output_handler=features_output_h, + pages=pages_generator, + ) + stats['performance']['end_time'] = datetime.datetime.utcnow() + + with stats_output_h: + dumper.render_template( + stats_template, + stats_output_h, + stats=stats, + ) diff --git a/mwrefs/bibs/processors/identifiers_extractor.py b/mwrefs/bibs/processors/identifiers_extractor.py new file mode 100644 index 0000000..9ba5961 --- /dev/null +++ 
b/mwrefs/bibs/processors/identifiers_extractor.py @@ -0,0 +1,220 @@ +import collections +import datetime +import more_itertools + +from .. import utils, extractors, dumper + +features_template = ''' +<%! + from itertools import groupby + def groupby_action(diff): + return groupby(diff, lambda d: d.action) +%> +<%def name="attribute_if_exists(name, text)" filter="trim"> + % if text is not None: + ${name}="${text | x}" + % endif + +<%def name="tag_user_if_exists(user)" filter="trim"> + % if user: + + % endif + + + % for page in pages: + + ${page.title | x} + ${page.id | x} + + % for revision in page.revisions: + + ${revision.id | x} + ${tag_user_if_exists(revision.user)} + ${revision.timestamp | x} + + % for key, group in groupby_action(revision.publication_identifiers_diff): + + % for _, identifier in group: + + % endfor + + % endfor + + + %endfor + + + % endfor + +''' + +stats_template = ''' + + + ${stats['performance']['start_time']} + ${stats['performance']['end_time']} + ${stats['performance']['revisions_analyzed']} + ${stats['performance']['pages_analyzed']} + + + % for key in ['global', 'last_revision']: + <${key}> + % for where, count in stats['identifiers'][key].items(): + + % endfor + + % endfor + + +''' + +Page = collections.namedtuple('Page', [ + 'id', + 'title', + 'revisions', +]) +Revision = collections.namedtuple('Revision', [ + 'id', + 'user', + 'timestamp', + 'publication_identifiers_diff', +]) + +def IdentifierStatsDict(): + return { + 'only_in_raw_text': 0, + 'only_in_tag_ref': 0, + 'only_in_template': 0, + 'in_tag_ref_and_template': 0, + } + + +@utils.listify(wrapper=set) +def where_appears(span, **spans): + span_le = extractors.Span.__le__ + for key, span_list in spans.items(): + # if any(span <= other_span) for other_span in span_list): + # HACK: the following is more efficient. 
Sorry :( + if any(span_le(span, other_span) for other_span in span_list): + yield key + + +def identifier_appearance_stat_key(appearances): + if {'templates', 'references'} <= appearances: + return 'in_tag_ref_and_template' + elif 'templates' in appearances: + return 'only_in_template' + elif 'references' in appearances: + return 'only_in_tag_ref' + else: + return 'only_in_raw_text' + + +def extract_revisions(page, stats, only_last_revision): + revisions = more_itertools.peekable(page) + + prev_identifiers = set() + for mw_revision in revisions: + utils.dot() + + is_last_revision = not utils.has_next(revisions) + if only_last_revision and not is_last_revision: + continue + + text = utils.remove_comments(mw_revision.text or '') + + references_captures = list(extractors.references(text)) + + templates_captures = list(extractors.templates(text)) + + identifiers_captures = list(extractors.pub_identifiers(text)) + identifiers = [identifier for identifier, _ in identifiers_captures] + + for identifier, span in identifiers_captures: + appearances = where_appears(span, + references=(span for _, span in references_captures), + templates=(span for _, span in templates_captures), + ) + key_to_increment = identifier_appearance_stat_key(appearances) + + stats['identifiers']['global'][key_to_increment] += 1 + if is_last_revision: + stats['identifiers']['last_revision'][key_to_increment] += 1 + + yield Revision( + id=mw_revision.id, + user=mw_revision.user, + timestamp=mw_revision.timestamp.to_json(), + publication_identifiers_diff=utils.diff(prev_identifiers, + identifiers), + ) + + stats['performance']['revisions_analyzed'] += 1 + prev_identifiers = identifiers + + +def extract_pages(dump, stats, only_last_revision): + for mw_page in dump: + utils.log("Processing", mw_page.title) + + # Skip non-articles + if mw_page.namespace != 0: + utils.log('Skipped (namespace != 0)') + continue + + revisions_generator = extract_revisions( + mw_page, + stats=stats, + only_last_revision=only_last_revision, + ) + + yield Page( + id=mw_page.id, + title=mw_page.title, + revisions=revisions_generator, + ) + stats['performance']['pages_analyzed'] += 1 + + +def configure_subparsers(subparsers): + parser = subparsers.add_parser('extract-identifiers', + help='Extract the identifiers from the text (doi, isbn, arxiv and pubmed.') + parser.add_argument('--only-last-revision', + action='store_true', + help='Consider only the last revision for each page.', + ) + parser.set_defaults(func=main) + + +def main(dump, features_output_h, stats_output_h, args): + stats = { + 'performance': { + 'start_time': None, + 'end_time': None, + 'revisions_analyzed': 0, + 'pages_analyzed': 0, + }, + 'identifiers': { + 'global': IdentifierStatsDict(), + 'last_revision': IdentifierStatsDict(), + }, + } + pages_generator = extract_pages(dump, + stats=stats, + only_last_revision=args.only_last_revision, + ) + with features_output_h: + stats['performance']['start_time'] = datetime.datetime.utcnow() + dumper.render_template( + features_template, + output_handler=features_output_h, + pages=pages_generator, + ) + stats['performance']['end_time'] = datetime.datetime.utcnow() + + with stats_output_h: + dumper.render_template( + stats_template, + stats_output_h, + stats=stats, + ) diff --git a/mwrefs/bibs/processors/sections_counter.py b/mwrefs/bibs/processors/sections_counter.py new file mode 100644 index 0000000..1ae3141 --- /dev/null +++ b/mwrefs/bibs/processors/sections_counter.py @@ -0,0 +1,136 @@ +import collections +import datetime + +import 
more_itertools + +from .. import utils, extractors, dumper + + +stats_template = ''' + + + ${stats['performance']['start_time']} + ${stats['performance']['end_time']} + ${stats['performance']['revisions_analyzed']} + ${stats['performance']['pages_analyzed']} + + + % for key in ['global', 'last_revision']: + <${key}> + % for section_name, count in stats['section_names_per_revision'][key].most_common(): +
+ % endfor + + % endfor + + + % for key in ['global', 'last_revision']: + <${key}> + % for sections_in_revision, count in stats['sections_per_revision'][key].most_common(): + + % endfor + + % endfor + + + + + + +''' + + +def analyze_revisions(page, stats, only_last_revision): + revisions = more_itertools.peekable(page) + + section_names_stats = stats['section_names_per_revision'] + sections_stats = stats['sections_per_revision'] + + for mw_revision in revisions: + utils.dot() + + is_last_revision = not utils.has_next(revisions) + if only_last_revision and not is_last_revision: + continue + + text = utils.remove_comments(mw_revision.text or '') + + section_names = [section.name.strip().lower() + for section, _ in extractors.sections(text)] + sections_count = len(section_names) + + for section_name in section_names: + section_names_stats['global'][section_name] += 1 + if is_last_revision: + section_names_stats['last_revision'][section_name] += 1 + + sections_stats['global'][sections_count] += 1 + if is_last_revision: + sections_stats['last_revision'][sections_count] += 1 + + stats['revisions']['global'] += 1 + if is_last_revision: + stats['revisions']['last_revision'] += 1 + + stats['performance']['revisions_analyzed'] += 1 + + +def analyze_pages(dump, stats, only_last_revision): + for mw_page in dump: + utils.log("Processing", mw_page.title) + + # Skip non-articles + if mw_page.namespace != 0: + utils.log('Skipped (namespace != 0)') + continue + + analyze_revisions( + mw_page, + stats=stats, + only_last_revision=only_last_revision, + ) + + stats['performance']['pages_analyzed'] += 1 + + +def configure_subparsers(subparsers): + parser = subparsers.add_parser('count-sections', + help='Count the number of sections and the section names of the dump.') + parser.add_argument('--only-last-revision', + action='store_true', + help='Consider only the last revision for each page.', + ) + parser.set_defaults(func=main) + + +def main(dump, features_output_h, stats_output_h, args): + stats = { + 'sections_per_revision': { + 'global': collections.Counter(), + 'last_revision': collections.Counter(), + }, + 'section_names_per_revision': { + 'global': collections.Counter(), + 'last_revision': collections.Counter(), + }, + 'revisions': collections.Counter(), + 'performance': { + 'start_time': None, + 'end_time': None, + 'revisions_analyzed': 0, + 'pages_analyzed': 0, + } + } + stats['performance']['start_time'] = datetime.datetime.utcnow() + analyze_pages(dump, + stats=stats, + only_last_revision=args.only_last_revision, + ) + stats['performance']['end_time'] = datetime.datetime.utcnow() + + with stats_output_h: + dumper.render_template( + stats_template, + stats_output_h, + stats=stats, + ) diff --git a/mwrefs/bibs/utils.py b/mwrefs/bibs/utils.py new file mode 100644 index 0000000..3651180 --- /dev/null +++ b/mwrefs/bibs/utils.py @@ -0,0 +1,91 @@ +import functools +import collections +import sys +import regex as re + + +Diff = collections.namedtuple("Diff", "action data") + + +def diff(previous, current): + # previous = [ref.text for ref in previous] + # current = [ref.text for ref in current] + + added = set(current) - set(previous) + removed = set(previous) - set(current) + + diff = ( + [Diff('added', el) for el in added] + + [Diff('removed', el) for el in removed] + ) + + return diff + + +# https://github.com/shazow/unstdlib.py/blob/master/unstdlib/standard/list_.py#L149 +def listify(fn=None, wrapper=list): + """ + A decorator which wraps a function's return value in ``list(...)``. 
+ + Useful when an algorithm can be expressed more cleanly as a generator but + the function should return a list. + + Example:: + + >>> @listify + ... def get_lengths(iterable): + ... for i in iterable: + ... yield len(i) + >>> get_lengths(["spam", "eggs"]) + [4, 4] + >>> + >>> @listify(wrapper=tuple) + ... def get_lengths_tuple(iterable): + ... for i in iterable: + ... yield len(i) + >>> get_lengths_tuple(["foo", "bar"]) + (3, 3) + """ + def listify_return(fn): + @functools.wraps(fn) + def listify_helper(*args, **kw): + return wrapper(fn(*args, **kw)) + return listify_helper + if fn is None: + return listify_return + return listify_return(fn) + + +def iter_with_prev(iterable): + last = None + for el in iterable: + yield last, el + last = el + + +def dot(num=None): + if not num: + what = '.' + elif num < 10: + what = str(num) + else: + what = '>' + print(what, end='', file=sys.stderr, flush=True) + + +def log(*args): + first, *rest = args + print('\n' + str(first), *rest, end='', file=sys.stderr, flush=True) + + +def remove_comments(source): + pattern = re.compile(r'<!--.*?-->', re.MULTILINE | re.DOTALL) + return pattern.sub('', source) + + +def has_next(peekable): + try: + peekable.peek() + return True + except StopIteration: + return False diff --git a/mwrefs/ids/__init__.py b/mwrefs/ids/__init__.py new file mode 100644 index 0000000..e5d4c7d --- /dev/null +++ b/mwrefs/ids/__init__.py @@ -0,0 +1,3 @@ +from .identifier import Identifier + +__version__ = "0.2.0" diff --git a/mwrefs/ids/extractors/__init__.py b/mwrefs/ids/extractors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/ids/extractors/arxiv.py b/mwrefs/ids/extractors/arxiv.py new file mode 100644 index 0000000..c7c52c6 --- /dev/null +++ b/mwrefs/ids/extractors/arxiv.py @@ -0,0 +1,17 @@ +import re + +from ..identifier import Identifier + +# From http://arxiv.org/help/arxiv_identifier +old_id = r"-?(?P<old_id>([a-z]+(.[a-z]+)/)?[0-9]{4}[0-9]+)" +new_id = r"(?P<new_id>[0-9]{4}.[0-9]+)(v[0-9]+)?"
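# Illustrative usage note, not part of the original patch: the two patterns
# above rely on the named groups old_id / new_id; extract() below combines
# them with the URL and template prefixes that follow and yields lower-cased
# Identifier tuples, roughly:
#
#   >>> list(extract("see arxiv:0706.0001v1 or http://arxiv.org/abs/math.GT/0309001"))
#   [Identifier(type='arxiv', id='0706.0001'),
#    Identifier(type='arxiv', id='math.gt/0309001')]
#
# The optional version suffix ("v1") sits outside the named groups, so it is
# dropped from the extracted id.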
+ +prefixes=["arxiv\s*=\s*", "//arxiv\.org/(abs/)?", "arxiv:\s?"] + +ARXIV_RE = re.compile(r"({0})".format("|".join(prefixes)) + + r"({0}|{1})".format(old_id, new_id), re.I|re.U) + +def extract(text): + for match in ARXIV_RE.finditer(text): + id = match.group('new_id') or match.group("old_id") + yield Identifier("arxiv", id.lower()) diff --git a/mwrefs/ids/extractors/doi.py b/mwrefs/ids/extractors/doi.py new file mode 100644 index 0000000..c76bb3d --- /dev/null +++ b/mwrefs/ids/extractors/doi.py @@ -0,0 +1,150 @@ +import re +from collections import defaultdict + +from more_itertools import peekable + +from ..identifier import Identifier + +DOI_START_RE = re.compile(r'10\.[0-9]{4,}/') + +HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + 'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote', + 'pre'] + +TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + ')(\s[^>\n\r]+)?>', re.I) + +''' +DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)') + +def extract_regex(text): + for match in DOI_RE.finditer(text): + id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".") + yield Identifier("doi", id) + +import mwparserfromhell as mwp +def extract_mwp(text): + no_tags = mwp.parse(text).strip_code() + for match in DOI_RE.finditer(no_tags): + id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".") + yield Identifier("doi", id) +''' + +LEXICON = [ + (DOI_START_RE.pattern, 'doi_start'), + (r'\(', 'open_paren'), + (r'\)', 'close_paren'), + (r'\[', 'open_bracket'), + (r'\]', 'close_bracket'), + (r'', 'comment_end'), + (TAGS_RE.pattern, 'tag'), + (r'<', 'open_angle'), + (r'>', 'close_angle'), + (r'\{', 'open_curly'), + (r'\}', 'close_curly'), + (r'\|', 'pipe'), + (r'[,\.;!]', 'punct'), + (r'[\?#]', 'url_end'), + (r'[\n\r]+', 'break'), + (r'\s+', 'whitespace'), + (r'\w+', 'word'), + (r'.', 'etc') +] + +def extract_island(text): + tokens = tokenize_finditer(text, LEXICON) + tokens = peekable(tokens) + + while tokens.peek(None) is not None: + + if tokens.peek()[0] == 'doi_start': + yield ('doi', read_doi(tokens)) + + next(tokens) + + +def tokenize_finditer(text, lexicon=LEXICON): + pattern = '|'.join("(?P<{0}>{1})".format(name, pattern) + for pattern, name in lexicon) + + group_regex = re.compile(pattern, re.I|re.U|re.M) + + for match in group_regex.finditer(text): + yield match.lastgroup, match.group(0) + + +""" +def tokenize_scanner(text, lexicon=LEXICON): + scanner = re.Scanner(lexicon) + tokens, remainder = scanner.scan(text) + return tokens +""" + +#from mwcites.extractors.doi import tokenize_scan +#list(tokenize_scan("foo bar baz.{}")) + +def read_doi(tokens): + assert tokens.peek()[0] == 'doi_start' + + depth = defaultdict(lambda: 0) + + doi_buffer = [next(tokens)[1]] + + while tokens.peek(None) is not None: + name, match = tokens.peek() + + if name in ('url_end', 'break', 'whitespace', 'tag', 'pipe', + 'comment_start', 'comment_end'): + break + elif name == 'open_bracket': + depth['bracket'] += 1 + doi_buffer.append(next(tokens)[1]) + elif name == 'open_curly': + depth['curly'] += 1 + doi_buffer.append(next(tokens)[1]) + elif name == 'close_bracket': + if depth['bracket'] > 0: + depth['bracket'] -= 1 + doi_buffer.append(next(tokens)[1]) + else: + break + elif name == 'close_curly': + if depth['curly'] > 0: + depth['curly'] -= 1 + doi_buffer.append(next(tokens)[1]) + else: + break + else: + doi_buffer.append(next(tokens)[1]) + + + # Do not return a doi with punctuation at the end + return re.sub(r'[\.,!]+$', '', ''.join(doi_buffer)) + + + +def tokenize_search(text, 
start, lexicon=LEXICON): + pattern = '|'.join("(?P<{0}>{1})".format(name, pattern) + for pattern, name in lexicon) + + group_regex = re.compile(pattern, re.I|re.U) + + match = group_regex.search(text, start) + while match is not None: + yield match.lastgroup, match.group(0) + match = group_regex.search(text, match.span()[1]) + +def extract_search(text, lexicon=LEXICON): + + last_end = 0 + for match in DOI_START_RE.finditer(text): + if match.span()[0] > last_end: + tokens = tokenize_search(text, match.span()[0], lexicon=lexicon) + tokens = peekable(tokens) + doi = read_doi(tokens) + last_end = match.span()[0] + len(doi) + yield Identifier('doi', doi) + else: + last_end = max(match.span()[1], last_end) + +extract = extract_search # Setting the default to the best method diff --git a/mwrefs/ids/extractors/isbn.py b/mwrefs/ids/extractors/isbn.py new file mode 100644 index 0000000..12883f3 --- /dev/null +++ b/mwrefs/ids/extractors/isbn.py @@ -0,0 +1,8 @@ +import re +from ..identifier import Identifier + +ISBN_RE = re.compile('isbn\s?=?\s?([0-9\-Xx]+)', re.I) + +def extract(text): + for match in ISBN_RE.finditer(text): + yield Identifier('isbn', match.group(1).replace('-', '')) diff --git a/mwrefs/ids/extractors/pubmed.py b/mwrefs/ids/extractors/pubmed.py new file mode 100644 index 0000000..5fbaf67 --- /dev/null +++ b/mwrefs/ids/extractors/pubmed.py @@ -0,0 +1,22 @@ +import re + +from ..identifier import Identifier + +TEMPLATE_RE = re.compile(r"\b(pmid|pmc)\s*=\s*(pmc)?([0-9]+)\b", re.I) + +PMURL_RE = re.compile(r"//www\.ncbi\.nlm\.nih\.gov" + + r"/pubmed/([0-9]+)\b", re.I) +PMCURL_RE = re.compile(r"//www\.ncbi\.nlm\.nih\.gov" + + r"/pmc/articles/PMC([0-9]+)\b", re.I) + +def extract(text): + text = str(text or "") + + for match in TEMPLATE_RE.finditer(text): + yield Identifier(match.group(1).lower(), match.group(3)) + + for match in PMURL_RE.finditer(text): + yield Identifier("pmid", match.group(1)) + + for match in PMCURL_RE.finditer(text): + yield Identifier("pmc", match.group(1)) diff --git a/mwrefs/ids/extractors/tests/__init__.py b/mwrefs/ids/extractors/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/ids/extractors/tests/test_arxiv.py b/mwrefs/ids/extractors/tests/test_arxiv.py new file mode 100644 index 0000000..ee0e7a6 --- /dev/null +++ b/mwrefs/ids/extractors/tests/test_arxiv.py @@ -0,0 +1,43 @@ +import pprint + +from nose.tools import eq_ + +from .. import arxiv +from ...identifier import Identifier + +INPUT_TEXT = """ +This is a doi randomly placed in the text 10.0000/m1 +Here's a typo that might be construed as a doi 10.60 people were there. +{{cite|...|arxiv=0706.0001v1|pmid=10559875}} +Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). +The rise and decline of an open collaboration system: How Wikipedia’s +reaction to popularity is causing its decline. +American Behavioral Scientist, +0002764212469365 arxiv:0706.0002v1. 
Hats pants and banana +[http://arxiv.org/0706.0003] +[http://arxiv.org/abs/0706.0004v1] +[https://arxiv.org/abs/0706.0005v1] +[https://arxiv.org/abs/math.GT/0309001] +[https://arxiv.org/abs/-math.gs/0309002] +{{cite|...|arxiv=foobar.hats/0101003|issue=1656}} +http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= +10.2387/234310.2347/39423 + +""" +EXPECTED = [ + Identifier('arxiv', "0706.0001"), + Identifier('arxiv', "0706.0002"), + Identifier('arxiv', "0706.0003"), + Identifier('arxiv', "0706.0004"), + Identifier('arxiv', "0706.0005"), + Identifier('arxiv', "math.gt/0309001"), + Identifier('arxiv', "math.gs/0309002"), + Identifier('arxiv', "foobar.hats/0101003") +] + +def test_extract(): + ids = list(arxiv.extract(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) diff --git a/mwrefs/ids/extractors/tests/test_doi.py b/mwrefs/ids/extractors/tests/test_doi.py new file mode 100644 index 0000000..05b85fa --- /dev/null +++ b/mwrefs/ids/extractors/tests/test_doi.py @@ -0,0 +1,67 @@ +import pprint + +from nose.tools import eq_ + +from .. import doi +from ...identifier import Identifier + +INPUT_TEXT = """ +This is a doi randomly placed in the text 10.0000/m1 +Here's a typo that might be construed as a doi 10.60 people were there. +{{cite|...|doi=10.0000/m2|pmid=10559875}} +Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012). +The rise and decline of an open collaboration system: How Wikipedia’s +reaction to popularity is causing its decline. +American Behavioral Scientist, +0002764212469365 doi: 10.1177/0002764212469365. Hats pants and banana +[http://dx.doi.org/10.1170/foo(herp)derp] +[http://dx.doi.org/10.1170/foo(herp)derp[waffles]] +{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}} +http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom= +10.2387/234310.2347/39423 + +""" +EXPECTED = [ + Identifier('doi', "10.0000/m1"), + Identifier('doi', "10.0000/m2"), + Identifier('doi', "10.1177/0002764212469365"), + Identifier('doi', "10.1170/foo(herp)derp"), + Identifier('doi', "10.1170/foo(herp)derp[waffles]"), + Identifier('doi', "10.1098/rspb.2008.1131"), + Identifier('doi', "10.2387/234310.2347/39423"), + Identifier('doi', "10.2387/234310.2347/39423") +] + +""" +def test_extract_regex(): + ids = list(doi.extract_regex(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) + +def test_extract_mwp(): + ids = list(doi.extract_mwp(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) +""" + +def test_extract(): + ids = list(doi.extract(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) + +def test_extract_island(): + ids = list(doi.extract_island(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) + +def test_extract_search(): + ids = list(doi.extract_search(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + #pprint.pprint(list(doi.tokenize_finditer(INPUT_TEXT))) + eq_(ids, EXPECTED) diff --git a/mwrefs/ids/extractors/tests/test_isbn.py b/mwrefs/ids/extractors/tests/test_isbn.py new file mode 100644 index 0000000..cd41776 --- /dev/null +++ b/mwrefs/ids/extractors/tests/test_isbn.py @@ -0,0 +1,44 @@ +import pprint +from nose.tools import eq_ + +from .. 
import isbn +from ...identifier import Identifier + +INPUT_TEXT = """ + | publisher=Academic Press | isbn=0124366031 + | isbn=3540206310 + | accessdate=2008-02-05 | isbn=0-618-34342-3 + | isbn=978-0-140-27666-4 + | isbn = 0-13-054091-9 + | isbn=0195305736 }}</ref> schlug [[Irving Langmuir]] 1919 vor, dass das Elektronen in einem Atom verbunden oder verklumpt seien. Elektronengruppen beset + | ISBN=978-3-7046-5112-9 + * Peter L. Bergen: ''Heiliger Krieg, Inc.: Osama bin Ladens Terrornetz''. Siedler, Berlin 2001, ISBN 3-88680-752-5. + * Marwan Abou-Taam, Ruth Bigalke (Hgg) ''Die Reden des Osama bin Laden''. Diederichs, München 2006, ISBN 3-72052-773-5. (Reden und Ansprachen des b.L. im Original - ''Rezensionen: '' [http://www.sicherheit-heute.de/index.php?cccpage=readpolitik&set_z_artikel=221 ]und [http://www.fr-online.de/in_und_ausland/kultur_und_medien/buecher/?em_cnt=868715&sid=f55727] Frankf. Rundschau 26. April 2006) + * Michael Pekler, Andreas Ungerböck: ''Ang Lee und seine Filme''. Schüren, Marburg 2009, ISBN 978-3-89472-665-2. + <ref name="flos1">{{Literatur | Autor = René Flosdorff, Günther Hilgarth | Titel = Elektrische Energieverteilung | Verlag = Teubner | Auflage = 8. | Jahr = 2003 | Kapitel = Kapitel 1.2.2.4 | ISBN = 3-519-26424-2 }}</ref> + Bei einer [[Sprungtemperatur]] von 1,2&nbsp;K wird reines Aluminium [[Supraleiter|supraleitend]].<ref>{{Literatur | Autor = Ilschner | first = Bernhard | Titel = Werkstoffwissenschaften und Fertigungstechnik Eigenschaften, Vorgänge, Technologien | Verlag = Springer | Ort = Berlin | Jahr = 2010 | ISBN = 978-3-642-01734-6 | Seiten = 277}}</ref> + * {{Literatur | Autor=Michael J. Padilla, Ioannis Miaoulis, Martha Cyr | Jahr = 2002 | Titel = Prentice Hall Science Explorer: Chemical Building Blocks | Verlag = Prentice-Hall, Inc. | Ort = Upper Saddle River, New Jersey USA | ISBN = 0-13-054091-9 | |Originalsprache=en}} + """ + + +EXPECTED = [ + Identifier('isbn', '0124366031'), + Identifier('isbn', '3540206310'), + Identifier('isbn', '0618343423'), + Identifier('isbn', '9780140276664'), + Identifier('isbn', '0130540919'), + Identifier('isbn', '0195305736'), + Identifier('isbn', '9783704651129'), + Identifier('isbn', '3886807525'), + Identifier('isbn', '3720527735'), + Identifier('isbn', '9783894726652'), + Identifier('isbn', '3519264242'), + Identifier('isbn', '9783642017346'), + Identifier('isbn', '0130540919'), +] + +def test_extract(): + ids = list(isbn.extract(INPUT_TEXT)) + pprint.pprint(ids) + pprint.pprint(EXPECTED) + eq_(ids, EXPECTED) diff --git a/mwrefs/ids/extractors/tests/test_pubmed.py b/mwrefs/ids/extractors/tests/test_pubmed.py new file mode 100644 index 0000000..48f98d9 --- /dev/null +++ b/mwrefs/ids/extractors/tests/test_pubmed.py @@ -0,0 +1,27 @@ +from nose.tools import eq_ + +from .. import pubmed +from ...identifier import Identifier + +def test_extract(): + + text = """ + This is some text with a template cite. {{cite|...|...|pmid=1}}. + This is some text with a template cite. {{cite|...|...|pmid = 2|...}}. + This is some text with a template cite. {{cite|...|...|pmc = 3|...}}. + This is some text with a template cite. {{cite|...|...|pmc = pmc4|...}}. 
+ This is some text with a link [http://www.ncbi.nlm.nih.gov/pubmed/5 ID] + Another link [https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6 ID] + """ + ids = list(pubmed.extract(text)) + expected = [ + Identifier('pmid', "1"), + Identifier('pmid', "2"), + Identifier('pmc', "3"), + Identifier('pmc', "4"), + Identifier('pmid', "5"), + Identifier('pmc', "6") + ] + print(ids) + print(expected) + eq_(ids, expected) diff --git a/mwrefs/ids/fetchers/doi.py b/mwrefs/ids/fetchers/doi.py new file mode 100644 index 0000000..eb963d3 --- /dev/null +++ b/mwrefs/ids/fetchers/doi.py @@ -0,0 +1,48 @@ +import requests + +CITOID_HOST = 'https://citoid.wikimedia.org' + + +def lookup_via_citoid(doi): + url = CITOID_HOST + "/api" + params = { + 'format': "mediawiki", + 'search': doi + } + response = requests.get(url, params=params) + doc = response.json() + if 'Error' in doc: + raise RuntimeError(doc['Error']) + else: + return doc + + +def lookup_via_doidotorg(doi): + url = "http://doi.org" + data = { + "hdl": doi + } + response = requests.post( + url, data=data, headers={'Accept': "application/json"}) + if response.status_code == 404: + raise RuntimeError("DOI not found") + elif response.status_code == 200: + return response.json() + else: + raise RuntimeError("Unknown error") + +METHODS = { + 'doi.org': lookup_via_doidotorg, + 'citoid.wikimedia.org': lookup_via_citoid +} + + +def lookup(doi, methods=['doi.org']): + for i, method in enumerate(methods): + try: + return METHODS[method](doi) + except RuntimeError as e: + if i+1 == len(methods): + raise e + else: + continue diff --git a/mwrefs/ids/identifier.py b/mwrefs/ids/identifier.py new file mode 100644 index 0000000..44f9b03 --- /dev/null +++ b/mwrefs/ids/identifier.py @@ -0,0 +1,3 @@ +from collections import namedtuple + +Identifier = namedtuple("Identifier", ['type', 'id']) diff --git a/mwrefs/ids/utilities/__init__.py b/mwrefs/ids/utilities/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/ids/utilities/extract.py b/mwrefs/ids/utilities/extract.py new file mode 100644 index 0000000..1dfa0ca --- /dev/null +++ b/mwrefs/ids/utilities/extract.py @@ -0,0 +1,160 @@ +""" +Extracts academic citations from articles from the history of Wikipedia +articles by processing a pages-meta-history XML dump and matching regular +expressions to revision content. + +Currently supported identifies include: + + * PubMed + * DOI + * ISBN + * arXiv + +Outputs a TSV file with the following fields: + + * page_id: The identifier of the Wikipedia article (int), e.g. 1325125 + * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell + * rev_id: The Wikipedia revision where the citation was first added (int), + e.g. 282470030 + * timestamp: The timestamp of the revision where the citation was first added. + (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z + * type: The type of identifier, e.g. pmid, pmcid, doi, arxiv or isbn + * id: The id of the cited scholarly article (utf-8), + e.g 10.1183/09031936.00213411 + +Usage: + extract -h | --help + extract ... [--extractor=...] + +Options: + -h --help Shows this documentation + The path to a set of dump files to process. If no + files are specified, will be read. 
+ --extractor= The class path to set of extractors to apply + [default: ] +""" +import sys +from itertools import chain + +import docopt +import mwxml + +import mysqltsv + +from ..extractors import arxiv, doi, isbn, pubmed + +ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv] + +HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id") + +def main(argv=None): + args = docopt.docopt(__doc__, argv=argv) + dump_files = args[''] + + if args['--extractor'] == ['']: + extractors = ALL_EXTRACTORS + else: + extractors = [import_from_path(path.lower) + for path in args['--extractor']] + + run(dump_files, extractors) + +def run(dump_files, extractors): + writer = mysqltsv.Writer(sts.stdout, headers=HEADERS) + + cites = extract(dump_files, extractors=extractors) + for page_id, title, rev_id, timestamp, type, id in cites: + writer.write(page_id, title, rev_id, timestamp.long_format(), type, id) + +def extract(dump_files, extractors=ALL_EXTRACTORS): + """ + Extracts cites from a set of `dump_files`. + + :Parameters: + dump_files : str | `file` + A set of files MediaWiki XML dump files + (expects: pages-meta-history) + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted cites + + """ + # Dump processor function + def process_dump(dump, path): + for page in dump: + if page.namespace != 0: continue + else: + for cite in extract_cite_history(page, extractors): + yield cite + + # Map call + return mwxml.map(process_dump, dump_files) + +def extract_cite_history(page, extractors): + """ + Extracts cites from the history of a `page` (`mwxml.Page`). + + :Parameters: + page : `iterable`(`mwxml.Revision`) + The page to extract cites from + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted cites + + """ + appearances = {} # For tracking the first appearance of an ID + ids = set() # For holding onto the ids in the last revision. + for revision in page: + ids = set(extract_ids(revision.text, extractors)) + + # For each ID, check to see if we have seen it before + for id in ids: + if id not in appearances: + appearances[id] = (revision.id, revision.timestamp) + + for id in ids: #For the ids in the last version of the page + rev_id, timestamp = appearances[id] + yield (page.id, page.title, rev_id, timestamp, id.type, id.id) + +def extract_ids(text, extractors): + """ + Uses `extractors` to extract citation identifiers from a text. + + :Parameters: + text : str + The text to process + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted identifiers + """ + for extractor in extractors: + for id in extractor.extract(text): + yield id + +def import_from_path(path): + """ + Imports a specific attribute from a module based on a class path. + + :Parameters: + path : str + A dot delimited string representing the import path of the desired + object. 
+ + :Returns: + object -- An imported object + """ + parts = path.split(".") + module_path = ".".join(parts[:-1]) + attribute_name = parts[-1] + + module = import_module(module_path) + + attribute = getattr(module, attribute_name) + + return attribute diff --git a/mwrefs/ids/utilities/tests/__init__.py b/mwrefs/ids/utilities/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/ids/utilities/tests/test_extract.py b/mwrefs/ids/utilities/tests/test_extract.py new file mode 100644 index 0000000..e4353ca --- /dev/null +++ b/mwrefs/ids/utilities/tests/test_extract.py @@ -0,0 +1,41 @@ +from collections import namedtuple + +from mw import Timestamp +from nose.tools import eq_ + +from ..extract import extract_cite_history +from ...identifier import Identifier + + +def test_extract_cite_history(): + FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text']) + + FakeExtractor = namedtuple("Extractor", ['extract']) + + class FakePage: + def __init__(self, id, title): + self.id = id + self.title = title + def __iter__(self): + return iter([ + FakeRevision(1, Timestamp(1), "id1 id2"), + FakeRevision(2, Timestamp(2), "id1 id3"), + FakeRevision(3, Timestamp(3), "id1 id2 id3"), + FakeRevision(4, Timestamp(4), "id1 id2 id4"), + FakeRevision(5, Timestamp(5), "id1 id2 id4"), + ]) + + fake_page = FakePage(1, "Title") + + def extract(text): + return (Identifier('fake', id) for id in text.split(" ")) + extractor = FakeExtractor(extract) + + expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"), + (1, "Title", 1, Timestamp(1), "fake", "id2"), + (1, "Title", 4, Timestamp(4), "fake", "id4")] + + citations = list(extract_cite_history(fake_page, [extractor])) + eq_(len(citations), len(expected)) + for cite in extract_cite_history(fake_page, [extractor]): + assert cite in expected diff --git a/mwrefs/extract.py b/mwrefs/refs/extract.py similarity index 100% rename from mwrefs/extract.py rename to mwrefs/refs/extract.py diff --git a/mwrefs/tests/test_extract.py b/mwrefs/tests/test_extract.py index ca2a660..7dd8600 100644 --- a/mwrefs/tests/test_extract.py +++ b/mwrefs/tests/test_extract.py @@ -41,15 +41,15 @@ def test_extract(): eq_(refs, ['{{cite web\n |url=http://topics.info.com/Who-coined-the-' + - 'term-biology_716 |title=Who coined\n the term biology? |work=' + - 'Info.com|accessdate=2012-06-03}}', + 'term-biology_716 |title=Who coined\n the term biology? ' + + '|work=Info.com|accessdate=2012-06-03}}', '{{cite web|title=biology\n |url=http://' + - 'www.etymonline.com/index.php?term=biology&allowed_in_frame=0\n ' + - ' |publisher=[[Online Etymology Dictionary]]}}', + 'www.etymonline.com/index.php?term=biology&allowed_in_frame=0\n ' + + ' |publisher=[[Online Etymology Dictionary]]}}', '', '', '\n {{cite book|last=Richards|first=Robert J.' 
+ - '|title=The Romantic Conception of\n Life: Science and ' + - 'Philosophy in the Age of Goethe|year=2002\n |publisher=' + - 'University of Chicago Press|isbn=0-226-71210-9\n ' + - '|url=http://books.google.cocover#v=onepage&q&f=false}}', + '|title=The Romantic Conception of\n Life: Science and ' + + 'Philosophy in the Age of Goethe|year=2002\n |publisher=' + + 'University of Chicago Press|isbn=0-226-71210-9\n ' + + '|url=http://books.google.cocover#v=onepage&q&f=false}}', 'foobar']) diff --git a/mwrefs/utilities/diffs.py b/mwrefs/utilities/diff_ref_tags.py similarity index 100% rename from mwrefs/utilities/diffs.py rename to mwrefs/utilities/diff_ref_tags.py diff --git a/mwrefs/utilities/extract_ids.py b/mwrefs/utilities/extract_ids.py new file mode 100644 index 0000000..8b58b24 --- /dev/null +++ b/mwrefs/utilities/extract_ids.py @@ -0,0 +1,166 @@ +""" +Extracts academic citations from articles from the history of Wikipedia +articles by processing a pages-meta-history XML dump and matching regular +expressions to revision content. + +Currently supported identifies include: + + * PubMed + * DOI + * ISBN + * arXiv + +Outputs a TSV file with the following fields: + + * page_id: The identifier of the Wikipedia article (int), e.g. 1325125 + * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell + * rev_id: The Wikipedia revision where the citation was first added (int), + e.g. 282470030 + * timestamp: The timestamp of the revision where the citation was first added. + (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z + * type: The type of identifier, e.g. pmid, pmcid, doi, arxiv or isbn + * id: The id of the cited scholarly article (utf-8), + e.g 10.1183/09031936.00213411 + +Usage: + extract -h | --help + extract ... [--extractor=...] + +Options: + -h --help Shows this documentation + The path to a set of dump files to process. If no + files are specified, will be read. + --extractor= The class path to set of extractors to apply + [default: ] +""" +import sys +from importlib import import_module +from itertools import chain + +import docopt +import mwxml + +import mysqltsv + +from ..extractors import arxiv, doi, isbn, pubmed + +ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv] + +HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id") + + +def main(argv=None): + args = docopt.docopt(__doc__, argv=argv) + dump_files = args[''] + + if args['--extractor'] == ['']: + extractors = ALL_EXTRACTORS + else: + extractors = [import_from_path(path.lower) + for path in args['--extractor']] + + run(dump_files, extractors) + + +def run(dump_files, extractors): + writer = mysqltsv.Writer(sys.stdout, headers=HEADERS) + + cites = extract(dump_files, extractors=extractors) + for page_id, title, rev_id, timestamp, type, id in cites: + writer.write(page_id, title, rev_id, timestamp.long_format(), type, id) + + +def extract(dump_files, extractors=ALL_EXTRACTORS): + """ + Extracts cites from a set of `dump_files`. 
+ + :Parameters: + dump_files : str | `file` + A set of files MediaWiki XML dump files + (expects: pages-meta-history) + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted cites + + """ + # Dump processor function + def process_dump(dump, path): + for page in dump: + if page.namespace != 0: continue + else: + for cite in extract_cite_history(page, extractors): + yield cite + + # Map call + return mwxml.map(process_dump, dump_files) + + +def extract_cite_history(page, extractors): + """ + Extracts cites from the history of a `page` (`mwxml.Page`). + + :Parameters: + page : `iterable`(`mwxml.Revision`) + The page to extract cites from + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted cites + + """ + appearances = {} # For tracking the first appearance of an ID + ids = set() # For holding onto the ids in the last revision. + for revision in page: + ids = set(extract_ids(revision.text, extractors)) + + # For each ID, check to see if we have seen it before + for id in ids: + if id not in appearances: + appearances[id] = (revision.id, revision.timestamp) + + for id in ids: #For the ids in the last version of the page + rev_id, timestamp = appearances[id] + yield (page.id, page.title, rev_id, timestamp, id.type, id.id) + +def extract_ids(text, extractors): + """ + Uses `extractors` to extract citation identifiers from a text. + + :Parameters: + text : str + The text to process + extractors : `list`(`extractor`) + A list of extractors to apply to the text + + :Returns: + `iterable` -- a generator of extracted identifiers + """ + for extractor in extractors: + for id in extractor.extract(text): + yield id + + +def import_from_path(path): + """ + Imports a specific attribute from a module based on a class path. + + :Parameters: + path : str + A dot delimited string representing the import path of the desired + object. 
+ + :Returns: + object -- An imported object + """ + parts = path.split(".") + module_path = ".".join(parts[:-1]) + attribute_name = parts[-1] + + module = import_module(module_path) + + attribute = getattr(module, attribute_name) + + return attribute diff --git a/mwrefs/utilities/extract.py b/mwrefs/utilities/extract_ref_tags.py similarity index 100% rename from mwrefs/utilities/extract.py rename to mwrefs/utilities/extract_ref_tags.py diff --git a/mwrefs/utilities/fetch_metadata.py b/mwrefs/utilities/fetch_metadata.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/utilities/tests/__init__.py b/mwrefs/utilities/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mwrefs/utilities/tests/test_extract_ids.py b/mwrefs/utilities/tests/test_extract_ids.py new file mode 100644 index 0000000..e4353ca --- /dev/null +++ b/mwrefs/utilities/tests/test_extract_ids.py @@ -0,0 +1,41 @@ +from collections import namedtuple + +from mw import Timestamp +from nose.tools import eq_ + +from ..extract import extract_cite_history +from ...identifier import Identifier + + +def test_extract_cite_history(): + FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text']) + + FakeExtractor = namedtuple("Extractor", ['extract']) + + class FakePage: + def __init__(self, id, title): + self.id = id + self.title = title + def __iter__(self): + return iter([ + FakeRevision(1, Timestamp(1), "id1 id2"), + FakeRevision(2, Timestamp(2), "id1 id3"), + FakeRevision(3, Timestamp(3), "id1 id2 id3"), + FakeRevision(4, Timestamp(4), "id1 id2 id4"), + FakeRevision(5, Timestamp(5), "id1 id2 id4"), + ]) + + fake_page = FakePage(1, "Title") + + def extract(text): + return (Identifier('fake', id) for id in text.split(" ")) + extractor = FakeExtractor(extract) + + expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"), + (1, "Title", 1, Timestamp(1), "fake", "id2"), + (1, "Title", 4, Timestamp(4), "fake", "id4")] + + citations = list(extract_cite_history(fake_page, [extractor])) + eq_(len(citations), len(expected)) + for cite in extract_cite_history(fake_page, [extractor]): + assert cite in expected
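The key piece of logic in extract_cite_history() above (both here and in mwrefs/ids/utilities/extract.py) is the first-appearance bookkeeping: every identifier is stamped with the revision that first introduced it, but only identifiers still present in the last revision are reported, which is exactly what test_extract_cite_history asserts. A self-contained sketch of that behaviour, using hypothetical revision and identifier names:

```python
def first_appearances(revisions):
    """revisions: iterable of (rev_id, set_of_ids) pairs, oldest first."""
    appearances = {}   # id -> rev_id of the revision that first added it
    ids = set()        # ids seen in the most recent revision
    for rev_id, ids in revisions:
        for id_ in ids:
            # setdefault keeps the earliest revision, even if the id was
            # removed and later re-added.
            appearances.setdefault(id_, rev_id)
    # Report only the ids that survived into the last revision.
    return {id_: appearances[id_] for id_ in sorted(ids)}


history = [
    (1, {'id1', 'id2'}),
    (2, {'id1', 'id3'}),          # id2 temporarily removed
    (3, {'id1', 'id2', 'id3'}),   # id2 re-added, keeps revision 1
    (4, {'id1', 'id2', 'id4'}),   # id3 removed for good
]
print(first_appearances(history))  # {'id1': 1, 'id2': 1, 'id4': 4}
```

The real implementation records (revision.id, revision.timestamp) pairs instead of bare revision numbers and emits one TSV row per surviving identifier.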