Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion examples/xml_dump.iteration.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

# Iterate through pages
for page in dump:

# Iterate through a page's revisions
for revision in page:
print(revision.id)

# dump has a language attribute
assert dump.lang == 'en'
3 changes: 1 addition & 2 deletions mw/xml_dump/element_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __init__(self, element, pointer):
self.pointer = pointer
self.element = element
self.depth = pointer.depth() - 1

self.element.attrib = {trim_ns(k): v for k, v in self.element.attrib.items()}
self.done = False

def __iter__(self):
Expand Down Expand Up @@ -93,7 +93,6 @@ def __getattr__(self, attr):

@classmethod
def from_file(cls, f):

try:
pointer = EventPointer.from_file(f)
event, element = next(pointer)
Expand Down
25 changes: 16 additions & 9 deletions mw/xml_dump/iteration/iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class Iterator(serializable.Type):
'__pages')

def __init__(self, site_name=None, dbname=None, base=None, generator=None,
case=None, namespaces=None, pages=None):
case=None, namespaces=None, pages=None, lang=None):

self.site_name = none_or(site_name, str)
"""
Expand Down Expand Up @@ -109,6 +109,11 @@ def __init__(self, site_name=None, dbname=None, base=None, generator=None,
# Should be a lazy generator of page info
self.__pages = pages

self.lang = none_or(lang, str)
"""
A 2 character language code.
"""

def __iter__(self):
    """
    Returns the `pages` object that was handed to the constructor.

    The object is returned as-is (no copy, no wrapping); the constructor
    notes that it should be a lazy generator of page info.
    """
    return self.__pages

Expand Down Expand Up @@ -140,6 +145,9 @@ def load_site_info(cls, element):
namespaces = {}

for sub_element in element:

if sub_element.tag == 'siteinfo':
return(cls.load_site_info(sub_element))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`return` isn't a function; drop the parentheses: `return cls.load_site_info(sub_element)`.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, what's going on here? Is there a `<siteinfo>` inside of a `<siteinfo>` tag?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean: is the language data inside a tag?
Unfortunately not. The language is in the XML header, not in a tag.

if sub_element.tag == 'sitename':
site_name = sub_element.text
if sub_element.tag == 'dbname':
Expand All @@ -152,24 +160,21 @@ def load_site_info(cls, element):
case = sub_element.text
elif sub_element.tag == 'namespaces':
namespaces = cls.load_namespaces(sub_element)

return site_name, dbname, base, generator, case, namespaces

@classmethod
def load_pages(cls, element):
    """
    Lazily yields a `Page` for every 'page' sub-element of `element`.

    This is a generator: nothing is consumed from `element` (and no error
    is raised) until the result is iterated.

    :Parameters:
        element : iterable
            Yields sub-elements that expose a `tag` attribute.

    :Returns:
        A generator of the objects built by `Page.from_element`.

    :Raises:
        MalformedXML : when a sub-element's tag is anything other than
        'page'.
    """
    for sub_element in element:
        tag = sub_element.tag

        if tag != "page":
            # This used to be `assert MalformedXML(...)`.  Asserting on a
            # freshly built exception instance always passes (the instance
            # is truthy), and asserts vanish under `python -O`, so
            # malformed input was silently skipped.  Raise it instead.
            raise MalformedXML("Expected to see 'page'. " +
                               "Instead saw '{0}'".format(tag))

        yield Page.from_element(sub_element)

@classmethod
def from_element(cls, element):

def from_element(cls, element, lang=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not extract the lang inside of this method?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea.

site_name = None
base = None
generator = None
Expand All @@ -187,20 +192,22 @@ def from_element(cls, element):
# Consume all <page>
pages = cls.load_pages(element)

return cls(site_name, dbname, base, generator, case, namespaces, pages)
return cls(site_name, dbname, base, generator, case, namespaces, pages, lang)

@classmethod
def from_file(cls, f):
element = ElementIterator.from_file(f)
assert element.tag == "mediawiki"
return cls.from_element(element)
lang = element.attr("lang")
return cls.from_element(element, lang=lang)

@classmethod
def from_string(cls, string):
f = io.StringIO(string)
element = ElementIterator.from_file(f)
assert element.tag == "mediawiki"
return cls.from_element(element)
lang = element.attr("xml:lang")
return cls.from_element(element, lang=lang)

@classmethod
def from_page_xml(cls, page_xml):
Expand Down
3 changes: 2 additions & 1 deletion mw/xml_dump/iteration/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from .redirect import Redirect
from .revision import Revision


class Page(serializable.Type):
"""
Page meta data and a :class:`~mw.xml_dump.Revision` iterator. Instances of
Expand Down Expand Up @@ -99,6 +98,8 @@ def from_element(cls, element):
restrictions.append(sub_element.text)
elif tag == "DiscussionThreading":
continue
elif tag == "sha1":
continue
elif tag == "revision":
first_revision = sub_element
break
Expand Down