Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion examples/xml_dump.iteration.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

# Iterate through pages
for page in dump:

# Iterate through a page's revisions
for revision in page:
print(revision.id)

# dump has a language attribute
assert dump.lang == 'en'
3 changes: 1 addition & 2 deletions mw/xml_dump/element_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __init__(self, element, pointer):
self.pointer = pointer
self.element = element
self.depth = pointer.depth() - 1

self.element.attrib = {trim_ns(k): v for k, v in self.element.attrib.items()}
self.done = False

def __iter__(self):
Expand Down Expand Up @@ -93,7 +93,6 @@ def __getattr__(self, attr):

@classmethod
def from_file(cls, f):

try:
pointer = EventPointer.from_file(f)
event, element = next(pointer)
Expand Down
22 changes: 13 additions & 9 deletions mw/xml_dump/iteration/iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@ def _read(self, size):
yield item.read()




def concat(*stream_items):
    """
    Build a single reader over the given stream items.

    All positional arguments are forwarded, in order, to
    :class:`ConcatinatingTextReader`, and the new reader is returned.
    """
    reader = ConcatinatingTextReader(*stream_items)
    return reader

Expand Down Expand Up @@ -73,7 +71,7 @@ class Iterator(serializable.Type):
'__pages')

def __init__(self, site_name=None, dbname=None, base=None, generator=None,
case=None, namespaces=None, pages=None):
case=None, namespaces=None, pages=None, lang=None):

self.site_name = none_or(site_name, str)
"""
Expand Down Expand Up @@ -109,6 +107,11 @@ def __init__(self, site_name=None, dbname=None, base=None, generator=None,
# Should be a lazy generator of page info
self.__pages = pages

self.lang = none_or(lang, str)
"""
A 2 character language code.
"""

def __iter__(self):
    """
    Return the page iterable that was handed to the constructor.

    The stored object is returned as-is, not copied or wrapped.
    """
    pages = self.__pages
    return pages

Expand Down Expand Up @@ -140,6 +143,9 @@ def load_site_info(cls, element):
namespaces = {}

for sub_element in element:

if sub_element.tag == 'siteinfo':
return cls.load_site_info(sub_element)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused about this line because it looks like you're expecting to find something like this:

<siteinfo>
  <siteinfo> ... </siteinfo>
</siteinfo>

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I see now why this looks weird. I'll take another look. It's been a while since I did this, so I'll see if it's a mistake or just something strange going on with Wikia dumps.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks a lot for taking a look at this.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😉 👍

if sub_element.tag == 'sitename':
site_name = sub_element.text
if sub_element.tag == 'dbname':
Expand All @@ -152,30 +158,28 @@ def load_site_info(cls, element):
case = sub_element.text
elif sub_element.tag == 'namespaces':
namespaces = cls.load_namespaces(sub_element)

return site_name, dbname, base, generator, case, namespaces

@classmethod
def load_pages(cls, element):
    """
    Lazily yield a page for every ``page`` sub-element of `element`.

    :Parameters:
        element : iterable of elements
            Iterated once; each item must expose a ``tag`` attribute.

    :Returns:
        A generator of the values returned by ``Page.from_element``.

    :Raises:
        MalformedXML : if a sub-element has a tag other than ``page``.
    """
    for sub_element in element:
        tag = sub_element.tag

        if tag == "page":
            yield Page.from_element(sub_element)
        else:
            # This must be ``raise``, not ``assert``: asserting a freshly
            # built exception instance is always truthy, so the error was
            # never reported (and asserts are stripped under ``-O``).
            raise MalformedXML(
                "Expected to see 'page'. Instead saw '{0}'".format(tag)
            )

@classmethod
def from_element(cls, element):

def from_element(cls, element, lang=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not extract the lang inside of this method?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea.

site_name = None
base = None
generator = None
case = None
namespaces = None

lang = element.attr("lang")
# Consume <siteinfo>
for sub_element in element:
tag = sub_element.tag
Expand All @@ -187,7 +191,7 @@ def from_element(cls, element):
# Consume all <page>
pages = cls.load_pages(element)

return cls(site_name, dbname, base, generator, case, namespaces, pages)
return cls(site_name, dbname, base, generator, case, namespaces, pages, lang)

@classmethod
def from_file(cls, f):
Expand Down
3 changes: 2 additions & 1 deletion mw/xml_dump/iteration/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from .redirect import Redirect
from .revision import Revision


class Page(serializable.Type):
"""
Page meta data and a :class:`~mw.xml_dump.Revision` iterator. Instances of
Expand Down Expand Up @@ -99,6 +98,8 @@ def from_element(cls, element):
restrictions.append(sub_element.text)
elif tag == "DiscussionThreading":
continue
elif tag == "sha1":
continue
elif tag == "revision":
first_revision = sub_element
break
Expand Down