Source code for mwxml.iteration.page

import logging

import mwtypes

from ..errors import MalformedXML
from .revision import Revision

logger = logging.getLogger(__name__)


[docs]class Page(mwtypes.Page): """ Page meta data and a :class:`~mwxml.Revision` iterator. Instances of this class can be called as iterators directly. See :class:`mwtypes.Page` for a description of fields. :Example: .. code-block:: python page = mwxml.Page( ... ) for revision in page: print("{0} {1}".format(revision.id, page.id)) """ def initialize(self, *args, revisions=None, **kwargs): super().initialize(*args, **kwargs) # Should be a lazy generator self.__revisions = revisions def __iter__(self): for revision in self.__revisions: revision.page = self yield revision def __next__(self): revision = next(self.__revisions) revision.page = self return revision @classmethod def load_revisions(cls, first_revision, element): if first_revision is not None: yield Revision.from_element(first_revision) for sub_element in element: tag = sub_element.tag if tag == "revision": yield Revision.from_element(sub_element) else: raise MalformedXML("Expected to see <revision>. " + "Instead saw <{0}>".format(tag)) @classmethod def from_element(cls, element, namespace_map=None): title = None namespace = None id = None redirect = None restrictions = [] first_revision = None # Consume each of the elements until we see <revision> which should # signal the start of revision data for sub_element in element: tag = sub_element.tag if tag == "title": page_name = sub_element.text elif tag == "ns": namespace = int(sub_element.text) elif tag == "id": id = int(sub_element.text) elif tag == "redirect": redirect = sub_element.attr('title') elif tag == "restrictions": restrictions.append(sub_element.text) elif tag == "revision": first_revision = sub_element break # Assuming that the first revision seen marks the end of page # metadata. I'm not too keen on this assumption, so I'm leaving # this long comment to warn whoever ends up maintaining this. else: raise MalformedXML("Unexpected tag found when processing " + "a <page>: '{0}'".format(tag)) # Assuming that I got here by seeing a <revision> tag. See verbose # comment above. revisions = cls.load_revisions(first_revision, element) # Normalize title and extract namespace mapped_namespace, title = extract_namespace(page_name, namespace_map) if namespace is not None and mapped_namespace != namespace: logger.warn("Namespace id conflict detected. " + "<title>={0}, ".format(page_name) + "<namespace>={0}, ".format(namespace) + "mapped_namespace={0}".format(mapped_namespace)) namespace = namespace or mapped_namespace # Construct class return cls(id, title, namespace, redirect=redirect, restrictions=restrictions, revisions=revisions)
def normalize_title(title): return title.replace("_", " ") def extract_namespace(page_name, namespace_map): title_parts = page_name.split(":", 1) if len(title_parts) == 1: return 0, normalize_title(page_name) else: ns_name, split_title = title_parts if ns_name in namespace_map: return namespace_map[ns_name].id, normalize_title(split_title) else: return 0, normalize_title(page_name)