Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(WIP) Merger with mwcites #6

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions mwrefs/bibs/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import argparse
import subprocess
import codecs
import os

import mw.xml_dump
import mwxml
import pathlib

from . import utils, processors


def open_xml_file(path):
    """Open the dump located at *path* and return the resulting handle.

    The path is first wrapped with ``mw.xml_dump.functions.file`` and the
    wrapped value is then passed to ``mw.xml_dump.functions.open_file``.
    """
    dump_functions = mw.xml_dump.functions
    return dump_functions.open_file(dump_functions.file(path))


def compressor_7z(file_path):
    """Return a UTF-8 writer connected to the stdin of a ``7z`` process.

    A ``7z a -si <file_path>`` child process is started with its stdout and
    stderr discarded. Text written to the returned writer is encoded as
    UTF-8 and sent to the child's standard input pipe.
    """
    command = ['7z', 'a', '-si', file_path]
    process = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    make_utf8_writer = codecs.getwriter('utf-8')
    return make_utf8_writer(process.stdin)


def output_writer(path, compression):
    """Return a writable text handle for *path*.

    When *compression* equals ``'7z'`` the handle comes from
    ``compressor_7z``, which is called with ``path + '.7z'``. For any other
    value a regular UTF-8 text file is opened at *path* for writing.
    """
    if compression != '7z':
        return open(path, 'wt', encoding='utf-8')
    return compressor_7z(path + '.7z')


def create_path(path):
    """Make sure the parent directory of *path* exists.

    Missing intermediate directories are created too, and a directory that
    already exists is not treated as an error. Nothing is returned and
    *path* itself is not created.
    """
    parent_dir = pathlib.Path(path).parent
    parent_dir.mkdir(parents=True, exist_ok=True)


def get_args():
    """Parse the command-line arguments of the ``wikidump`` tool.

    Builds the top-level parser (input files, output directory, output
    compression, dry-run flag) and lets each processor module register its
    own sub-command through ``configure_subparsers``.

    Returns:
        The parsed ``argparse.Namespace``. It is only returned when it
        contains a ``func`` entry (presumably set by the selected
        sub-command -- confirm in the processors); otherwise the usage line
        is printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        prog='wikidump',
        description='Wikidump features extractor.',
    )
    parser.add_argument(
        'files',
        metavar='FILE',
        type=pathlib.Path,
        nargs='+',
        help='XML Wikidump file to parse. It accepts only 7z.'
    )
    parser.add_argument(
        'output_dir_path',
        metavar='OUTPUT_DIR',
        type=pathlib.Path,
        help='XML output directory.',
    )
    parser.add_argument(
        '--output-compression',
        # A list keeps the help/usage text deterministic. ``None`` is not a
        # selectable choice (no command-line string equals it); omitting the
        # option is what yields the ``None`` default.
        choices=['7z'],
        required=False,
        default=None,
        help='Output compression format.',
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help="Don't write any file",
    )

    subparsers = parser.add_subparsers(help='sub-commands help')
    processors.bibliography_extractor.configure_subparsers(subparsers)
    processors.identifiers_extractor.configure_subparsers(subparsers)
    processors.sections_counter.configure_subparsers(subparsers)

    parsed_args = parser.parse_args()
    if 'func' not in parsed_args:
        # No sub-command was selected: there is nothing to run.
        parser.print_usage()
        parser.exit(1)

    return parsed_args


def main():
    """Run the selected processor over every input dump file.

    For each input file the dump is opened, two output handles are prepared
    (``<name>.features.xml`` and ``<name>.stats.xml`` inside the output
    directory, or the null device when ``--dry-run`` is set) and
    ``args.func`` is invoked with the dump, both handles and the parsed
    arguments. Both output handles are closed once the processor returns or
    raises.
    """
    args = get_args()

    args.output_dir_path.mkdir(parents=True, exist_ok=True)

    for input_file_path in args.files:
        utils.log("Analyzing {}...".format(input_file_path))

        dump = mwxml.Dump.from_file(open_xml_file(str(input_file_path)))

        basename = input_file_path.name

        if args.dry_run:
            pages_output = open(os.devnull, 'wt')
            stats_output = open(os.devnull, 'wt')
        else:
            pages_output = output_writer(
                path=str(args.output_dir_path/(basename + '.features.xml')),
                compression=args.output_compression,
            )
            stats_output = output_writer(
                path=str(args.output_dir_path/(basename + '.stats.xml')),
                compression=args.output_compression,
            )

        try:
            args.func(
                dump,
                pages_output,
                stats_output,
                args,
            )
        finally:
            # Close the handles for this input before moving to the next
            # one: it flushes buffered data and, for the 7z writer, closes
            # the pipe feeding the compressor.
            try:
                pages_output.close()
            finally:
                stats_output.close()


# Run the command-line entry point only when this module is executed directly.
if __name__ == '__main__':
main()
99 changes: 99 additions & 0 deletions mwrefs/bibs/dumper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import mako.runtime
import mako.template

# Mako template for the per-page output. It reads a ``pages`` iterable; for
# every page it emits the title, the id and one <revision> element per entry
# of ``page.revisions``. Inside a revision, ``references_diff`` and
# ``publication_identifiers_diff`` are grouped with ``itertools.groupby`` on
# each item's ``action``, so only consecutive items sharing an action end up
# in the same <diff> element.
pages_revisions_template = '''
<%!
from itertools import groupby
def groupby_action(diff):
    return groupby(diff, lambda d: d.action)
%>
<root>
% for page in pages:
<page>
<title>${page.title}</title>
<id>${page.id}</id>
<revisions>
% for revision in page.revisions:
<revision>
<id>${revision.id}</id>
<user id="${revision.user.id}" name="${revision.user.text}" />
<timestamp>${revision.timestamp}</timestamp>
<references_diff>
% for key, group in groupby_action(revision.references_diff):
<diff action="${key}">
% for _, text in group:
<reference>${text}</reference>
% endfor
</diff>
% endfor
</references_diff>
<publication_identifiers_diff>
% for key, group in groupby_action(revision.publication_identifiers_diff):
<diff action="${key}">
% for _, identifier in group:
<identifier type="${identifier.type}" id="${identifier.id}" />
% endfor
</diff>
% endfor
</publication_identifiers_diff>
<sections>
% for section in revision.sections:
<section level="${section.level}">${section.name}</section>
% endfor
</sections>
<bibliography>${revision.bibliography}</bibliography>
</revision>
%endfor
</revisions>
</page>
% endfor
</root>
'''

# Mako template for the statistics report. It reads a single ``stats``
# mapping: ``stats['performance']`` supplies the start/end times and the
# revision/page counters, while ``stats['identifiers']`` supplies, under the
# ``global`` and ``last_revision`` keys, mappings whose items are written as
# <appearance where="..." count="..."/> elements.
stats_template = '''
<stats>
<performance>
<start_time>${stats['performance']['start_time']}</start_time>
<end_time>${stats['performance']['end_time']}</end_time>
<revisions_analyzed>${stats['performance']['revisions_analyzed']}</revisions_analyzed>
<pages_analyzed>${stats['performance']['pages_analyzed']}</pages_analyzed>
</performance>
<identifiers>
% for key in ['global', 'last_revision']:
<${key}>
% for where, count in stats['identifiers'][key].items():
<appearance where="${where}" count="${count}" />
% endfor
</${key}>
% endfor
</identifiers>
</stats>
'''


def render_template(template, output_handler, default_filters=None, **kwargs):
    """Render a Mako template source string into *output_handler*.

    Args:
        template: Mako template source text.
        output_handler: Object handed to ``mako.runtime.Context`` as the
            output buffer.
        default_filters: Forwarded unchanged to ``mako.template.Template``.
        **kwargs: Made available to the template as context data.
    """
    rendering_context = mako.runtime.Context(output_handler, **kwargs)
    compiled_template = mako.template.Template(
        template, default_filters=default_filters,
    )
    compiled_template.render_context(rendering_context)


def serialize_page_revisions(pages, output_handler):
    """Write *pages* to *output_handler* using ``pages_revisions_template``.

    Args:
        pages: Iterable exposed to the template under the name ``pages``.
        output_handler: Destination passed on to ``render_template``.
    """
    render_template(
        pages_revisions_template,
        output_handler,
        # The keyword must be spelled ``default_filters``: any other name is
        # swallowed by ``render_template``'s ``**kwargs`` as template data
        # and the XML escaping is silently lost.
        default_filters=['x'],  # XML escaping
        pages=pages,
    )


def serialize_stats(stats, output_handler):
    """Write *stats* to *output_handler* using ``stats_template``.

    The template is rendered through ``render_template`` with the ``x``
    (XML escaping) default filter and receives the data under the name
    ``stats``.
    """
    xml_escaping = ['x']
    render_template(
        stats_template, output_handler,
        default_filters=xml_escaping, stats=stats,
    )
82 changes: 82 additions & 0 deletions mwrefs/bibs/languages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Language codes this module provides data for.
supported = {'it', 'en'}

# Per-language sets of lower-case names; presumably matched against section
# titles to recognise bibliography-like sections (confirm against the users
# of this mapping).
bibliography = {
    'en': {
        'bibliography',
        'citations',
        'footnotes',
        'further reading',
        'literature',
        'notes',
        'publication history',
        'publications',
        'reference',
        'references',
        'sources',
    },
    'it': {
        'bibliografia',
    },
}

# Per-language sets of names; presumably citation template names (confirm
# against the users of this mapping). Only 'en' has an entry.
citation = {
    'en': {
        'Citation',
        'cite',
        'vcite',
    },
}

"""
What I mean by:
* References: a section containing footnotes for works cited in the text.
* Bibliography: a section containing articles and journals.
* Further reading: like `Bibliography`, but contains references not used in the text.
* Footnotes: a section containing explanations of concepts.

From now on, words in backquotes (`) are to be interpreted as concepts using the above definitions, while words in double quotes (") are to be interpreted as terms found in the text of the articles.

"References" (term) is commonly used as `Bibliography` (concept), i.e. articles and journals without backref to the text.
And, of course, "Bibliography" (term) is sometimes used as `References` (concept).
* https://en.wikipedia.org/w/index.php?title=Anabaptists&oldid=49953891 "References" interpreted as `Bibliography`
* https://en.wikipedia.org/w/index.php?title=Alcopop&oldid=296736852 "References" interpreted as `Bibliography`
* https://en.wikipedia.org/w/index.php?title=Amu%20Darya&oldid=66374611 "References" interpreted as `Bibliography`

"Citations" (term) sometimes used as synonym for "References" or "Bibliography" (terms):
* https://en.wikipedia.org/w/index.php?title=Augustine_of_Canterbury&oldid=676642624 "Citations" used as `References`, "References" used as `Bibliography`
* https://en.wikipedia.org/w/index.php?title=Anemometer&oldid=674186492#Citations "Citations" used as `References`

"Notes and References" and "References and Notes" (terms) are used as synonyms for "References" (term):
* https://en.wikipedia.org/w/index.php?title=Ackermann%20function&oldid=335603599#Notes_and_references "Notes and References" converted to "References" (term) and interpreted as `References`
* https://en.wikipedia.org/w/index.php?title=albanians&oldid=391045161#Notes_and_references "Notes and References" is a wrapper around "Notes" (interpreted as `footnotes`) and "References" (interpreted as `References`)
* https://en.wikipedia.org/w/index.php?title=assassination&oldid=678057527#Notes_and_references interpreted as `References`

"Sources" seems to be interpreted as `Bibliography` or `References`, and sometimes then converted by users to "References" or "Bibliography"
* https://en.wikipedia.org/w/index.php?title=artemis&diff=next&oldid=565871969 "Sources" has been converted to "References and sources"
* https://en.wikipedia.org/w/index.php?title=Amakusa&direction=next&oldid=667294099 "Sources" used as `Bibliography`
* https://en.wikipedia.org/w/index.php?title=A%20Doll's%20House&oldid=676505492#Sources "Sources" used as `Bibliography`
* https://en.wikipedia.org/w/index.php?title=A.%20E.%20Housman&diff=next&oldid=678259900#Sources "Sources" used as `Bibliography`

"Footnotes" is commonly interpreted as `References`, with the following terms: "References" and "Citations"
* https://en.wikipedia.org/w/index.php?title=Augustine%20of%20Canterbury&oldid=459457206#Footnotes "Footnotes" is used as `References`; "Footnotes" is then converted to "Citations", used as `References`
* https://en.wikipedia.org/w/index.php?title=Amoxicillin&diff=next&oldid=423375138 "Footnotes" used as and converted to `References`
* https://en.wikipedia.org/w/index.php?title=Anabaptists&oldid=49953891#Footnotes_and_references "Footnotes" interpreted as `References`. The next revision converts "Footnotes" to "Footnotes and References".
* https://en.wikipedia.org/w/index.php?title=Alcopop&oldid=296736852#Footnotes "Footnotes" used as `References`
* https://en.wikipedia.org/w/index.php?title=Archaeopteryx&diff=next&oldid=326796096 "Footnotes" interpreted as and then converted to `References` (term and concept)
* https://en.wikipedia.org/w/index.php?title=Al%20Capp&oldid=590148186#Footnotes "Footnotes" interpreted as `References`. It is then converted to "Notes"
* https://en.wikipedia.org/w/index.php?title=Amu%20Darya&oldid=66374611#Footnotes "Footnotes" interpreted as `References`. Later converted to "Notes"
* https://en.wikipedia.org/w/index.php?title=Albert%20Brooks&oldid=150996845#Footnotes "Footnotes" used as and then converted to `References` (term and concept)

"Literature" is used most of the time as a subsection for things like "Culture", and in some cases is a replacement for "bibliography":
* https://en.wikipedia.org/w/index.php?title=Alexandria&oldid=678355005 "Literature" used as subsection of "Culture"
* https://en.wikipedia.org/w/index.php?title=Bible&oldid=23508742#Literature "Literature" used as `Bibliography`
* https://en.wikipedia.org/w/index.php?title=Board_game&oldid=7131437#Literature "Literature" used as "Bibliography", then converted to "References" (used as "Bibliography")
* https://en.wikipedia.org/w/index.php?title=Ahuitzotl&oldid=118183827 "Literature" interpreted as `Bibliography`

"Publications" and "Publication history" are used as a subsection for the "Biography" with the works of the person described.

"Reference" is almost always converted to "References" in a successive revision.


"Notes" is sometimes interpreted as `References` or `Footnotes`
* https://en.wikipedia.org/w/index.php?title=Ahuitzotl&oldid=118183827 "Notes" used as `Footnotes`
* https://en.wikipedia.org/w/index.php?title=Archaeoastronomy&oldid=678777218#Notes "Notes" used as `References`
* https://en.wikipedia.org/w/index.php?title=Alexander_of_Hales&oldid=661215939#Other_historical_works "Notes" interpreted as `References`

"See also" and "Related pages" usually contain links to other wikipedia pages.
"""

1 change: 1 addition & 0 deletions mwrefs/bibs/processors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from . import bibliography_extractor, identifiers_extractor, sections_counter
Loading