Source code for mwxml.iteration.dump

import logging

import mwtypes.files

from ..element_iterator import ElementIterator
from ..errors import MalformedXML
from .log_item import LogItem
from .page import Page
from .site_info import SiteInfo

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


class Dump:
    """
    XML Dump Iterator.  Dump file meta data and a
    :class:`~mwxml.iteration.page.Page` iterator.  Instances of this class can
    be called as an iterator directly.  Usually, you'll want to construct this
    class using :func:`~mwxml.iteration.dump.Dump.from_file`.

    :Parameters:
        site_info : :class:`~mwxml.iteration.site_info.SiteInfo`
            The data from the <siteinfo> block
        items : `iterable`
            An `iterable` of :class:`~mwxml.iteration.page.Page` and/or
            :class:`~mwxml.iteration.log_item.LogItem` in the order they
            appear in the XML.  `None` is treated as an empty dump.

    :Example:
        .. code-block:: python

            from mwxml import Dump, Page

            # Construct dump file iterator
            dump = Dump.from_file(open("example/dump.xml"))

            # Iterate through pages
            for page in dump.pages:

                # Iterate through a page's revisions
                for revision in page:

                    print(revision.id)

    :Attributes:
        .. autoattribute:: mwxml.Dump.site_info
            :annotation: = Information from the <siteinfo> block : mwxml.SiteInfo

        .. autoattribute:: mwxml.Dump.pages
            :annotation: = The mwxml.Page that appear in the dump : iterator

        .. autoattribute:: mwxml.Dump.items
            :annotation: = The mwxml.Page and/or mwxml.LogItem that appear in the dump : iterator

        .. autoattribute:: mwxml.Dump.log_items
            :annotation: = The mwxml.LogItem that appear in the dump : iterator
    """
    __slots__ = ('site_info', 'items', 'pages', 'log_items')

    def __init__(self, site_info, items):
        self.site_info = SiteInfo(site_info)
        """
        Metadata from the <siteinfo> tag :
        :class:`~mwxml.iteration.site_info.SiteInfo`
        """

        # Should be a lazy generator of page info.  A falsy `items` (e.g.
        # None) falls back to an empty range so the dump is simply empty.
        self.items = items or range(0)
        """
        An iterator of :class:`mwxml.Page` and/or :class:`mwxml.LogItem`
        elements
        """

        # NOTE: both filtered views below read from `self.items` (not the raw
        # argument) so that the empty fallback applies to them as well.  When
        # `items` is a one-shot iterator, `items`, `pages` and `log_items`
        # all draw from the same underlying stream.
        self.pages = (item for item in self.items if isinstance(item, Page))
        "An iterator of :class:`mwxml.Page` elements"

        self.log_items = (item for item in self.items
                          if isinstance(item, LogItem))
        "An iterator of :class:`mwxml.LogItem` elements"

    def __iter__(self):
        """Return an iterator over all items (pages and log items)."""
        # iter() is a no-op for generators but makes plain iterables (lists,
        # the empty-range fallback) usable too.
        return iter(self.items)

    def __next__(self):
        """
        Return the next item.  Requires `self.items` to be an iterator (as it
        is when the dump was built by :func:`from_element`).
        """
        return next(self.items)

    @classmethod
    def load_items(cls, first_item_element, element, namespace_map):
        """
        Lazily yield processed items: first `first_item_element` (if it is
        not None), then every remaining sub-element of `element`.

        :Parameters:
            first_item_element : element | `None`
                An item element that was already pulled from `element`
            element : `iterable`
                The remaining item elements
            namespace_map : `dict` | `None`
                Passed through to :func:`process_item`
        """
        if first_item_element is not None:
            yield cls.process_item(first_item_element, namespace_map)
            # Clear the first element once it has been processed (original
            # author's note: ensure the current tag block is completely
            # consumed).
            first_item_element.clear()

        for item_element in element:
            yield cls.process_item(item_element, namespace_map)

    @classmethod
    def process_item(cls, item_element, namespace_map):
        """
        Convert a <page> or <logitem> element into a
        :class:`~mwxml.iteration.page.Page` or
        :class:`~mwxml.iteration.log_item.LogItem`.

        :Raises:
            MalformedXML : if the element has any other tag
        """
        if item_element.tag == "page":
            return Page.from_element(item_element, namespace_map)
        elif item_element.tag == "logitem":
            return LogItem.from_element(item_element, namespace_map)
        else:
            raise MalformedXML("Expected to see <page> or <logitem>.  " +
                               "Instead saw <{0}>".format(item_element.tag))

    @classmethod
    def from_element(cls, element):
        """
        Construct a :class:`~mwxml.iteration.dump.Dump` from a <mediawiki>
        element whose sub-elements can be iterated.

        :Raises:
            MalformedXML : if a sub-element other than <siteinfo>, <page> or
                <logitem> is seen before the first item
        """
        site_info = None
        first_item_element = None

        # Consume <siteinfo>
        for sub_element in element:
            if sub_element.tag == "siteinfo":
                site_info = SiteInfo.from_element(sub_element)
            elif sub_element.tag in ("page", "logitem"):
                # Assuming that the first <page> seen marks the end of dump
                # metadata.  I'm not too keen on this assumption, so I'm
                # leaving this long comment to warn whoever ends up
                # maintaining this.
                first_item_element = sub_element
                break
            else:
                raise MalformedXML("Unexpected tag found when processing " +
                                   "a <mediawiki>: '{0}'"
                                   .format(sub_element.tag))

        # `site_info` is still None when no <siteinfo> block was seen, so
        # guard before touching its namespaces.
        namespace_map = None
        if site_info is not None and site_info.namespaces is not None:
            namespace_map = {namespace.name: namespace
                             for namespace in site_info.namespaces}

        # Consume all <page> and <logitem>
        items = cls.load_items(first_item_element, element, namespace_map)

        return cls(site_info, items)

    @classmethod
    def from_file(cls, f):
        """
        Constructs a :class:`~mwxml.iteration.dump.Dump` from a `file` pointer.

        :Parameters:
            f : `file`
                A plain text file pointer containing XML to process
        """
        element = ElementIterator.from_file(f)
        # NOTE: assert statements are skipped when Python runs with -O.
        assert element.tag == "mediawiki"
        return cls.from_element(element)

    @classmethod
    def from_page_xml(cls, page_xml):
        """
        Constructs a :class:`~mwxml.iteration.dump.Dump` from a <page> block.

        :Parameters:
            page_xml : `str` | `file`
                Either a plain string or a file containing <page> block XML to
                process
        """
        # Wrap the bare <page> block in a minimal <mediawiki> document so
        # that it can go through the regular from_file() path.
        header = """
        <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/"
                   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                   xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/
                     http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5"
                   xml:lang="en">
        <siteinfo>
        </siteinfo>
        """

        footer = "</mediawiki>"

        return cls.from_file(mwtypes.files.concat(header, page_xml, footer))