# Source code for mwpersistence.state

import logging
from hashlib import sha1

import mwreverts

from .token import Token

logger = logging.getLogger(__name__)


class Version:
    """
    Minimal record holding the token sequence associated with one revision.

    Instances carry a single attribute, ``tokens``; ``__slots__`` prevents
    any other attribute from being attached.

    :Parameters:
        tokens : `mixed`
            The token sequence to store.  Defaults to `None`.
    """
    __slots__ = ('tokens',)

    def __init__(self, tokens=None):
        self.tokens = tokens


class State:
    """
    Interface for a revision state object that tracks the persistence of
    tokens through a history of revisions.

    This class is commonly used to process the revisions of a page in
    chronological order.  It provides no behaviour of its own:
    :meth:`update` always raises :class:`NotImplementedError`.
    """

    def update(self, text, revision=None):
        """
        Process the next revision.

        :Parameters:
            text : str
                The text content of a revision
            revision : `mixed`
                Revision metadata

        :Raises:
            NotImplementedError
                Always; this class defines the interface only.
        """
        raise NotImplementedError()
class DiffState:
    """
    Constructs a state object with a diff-based transition function.

    Either `revert_radius` or `revert_detector` must be provided; a
    :class:`TypeError` is raised when both are omitted.

    :Parameters:
        diff_engine : :class:`deltas.DiffEngine`
            A "diff engine" processor for sequentially diffing text.  May be
            omitted when the state is only updated via
            :meth:`update_opdocs`.
        revert_radius : int
            a positive integer indicating the maximum revision distance
            that a revert can span.
        revert_detector : :class:`mwreverts.Detector`
            A revert detector.

    :Example:
        >>> import mwpersistence
        >>> import deltas
        >>>
        >>> state = mwpersistence.DiffState(deltas.SegmentMatcher(),
        ...                                 revert_radius=15)
        >>>
        >>> print(state.update("Apples are red.", revision=1))
        ([Token(text='Apples', revisions=[1]), Token(text=' ', revisions=[1]), Token(text='are', revisions=[1]), Token(text=' ', revisions=[1]), Token(text='red', revisions=[1]), Token(text='.', revisions=[1])], [Token(text='Apples', revisions=[1]), Token(text=' ', revisions=[1]), Token(text='are', revisions=[1]), Token(text=' ', revisions=[1]), Token(text='red', revisions=[1]), Token(text='.', revisions=[1])], [])
        >>> print(state.update("Apples are blue.", revision=2))
        ([Token(text='Apples', revisions=[1, 2]), Token(text=' ', revisions=[1, 2]), Token(text='are', revisions=[1, 2]), Token(text=' ', revisions=[1, 2]), Token(text='blue', revisions=[2]), Token(text='.', revisions=[1, 2])], [Token(text='blue', revisions=[2])], [Token(text='red', revisions=[1])])
        >>> print(state.update("Apples are red.", revision=3))  # A revert!
        ([Token(text='Apples', revisions=[1, 2, 3]), Token(text=' ', revisions=[1, 2, 3]), Token(text='are', revisions=[1, 2, 3]), Token(text=' ', revisions=[1, 2, 3]), Token(text='red', revisions=[1, 3]), Token(text='.', revisions=[1, 2, 3])], [], [])
    """

    # Kept as a public attribute (``DiffState.Version``) for backward
    # compatibility.  Names bound in a class body are not visible from the
    # class's methods, so the bare ``Version()`` calls below resolve to the
    # module-level ``Version``, not to this nested class.
    class Version:
        __slots__ = ('tokens',)

        def __init__(self):
            self.tokens = None

    def __init__(self, diff_engine=None, revert_radius=None,
                 revert_detector=None):
        if diff_engine is not None:
            # The engine is only ever used through ``processor()``, so that
            # is the capability that must be checked.
            if not hasattr(diff_engine, 'processor'):
                # Adjacent literals are joined *before* ``.format`` is
                # applied, so the ``{0}`` placeholder is actually filled in.
                raise TypeError(
                    "'diff_engine' of type {0} does not have a "
                    "processor() method.".format(type(diff_engine)))
            self.diff_engine = diff_engine
            self.diff_processor = self.diff_engine.processor()
        else:
            self.diff_engine, self.diff_processor = None, None

        # Either pass a detector or the revert radius so I can make one
        if revert_detector is None and revert_radius is None:
            raise TypeError("Either a 'revert_detector' or a " +
                            "'revert_radius' must be provided.")

        if revert_detector is None:
            self.revert_detector = mwreverts.Detector(int(revert_radius))
        else:
            self.revert_detector = revert_detector

        # Stores the last tokens
        self.last = Version()

    def update(self, text, revision=None):
        """
        Modifies the internal state based on a change to the content and
        returns the sets of words added and removed.

        :Parameters:
            text : str
                The text content of a revision
            revision : `mixed`
                Revision metadata

        :Returns:
            A triple of lists:

            current_tokens : `list` ( :class:`~mwpersistence.Token` )
                A sequence of Tokens representing the revision that was just
                processed.
            tokens_added : `list` ( :class:`~mwpersistence.Token` )
                Tokens that were added while updating state.
            tokens_removed : `list` ( :class:`~mwpersistence.Token` )
                Tokens that were removed while updating state.
        """
        return self._update(text=text, revision=revision)

    def update_opdocs(self, checksum, opdocs, revision=None):
        """
        Modifies the internal state based on a change to the content and
        returns the sets of words added and removed.

        :Parameters:
            checksum : `hashable`
                A checksum generated from the text of a revision
            opdocs : `iterable` ( `dict` )
                A sequence of operations that represent the diff of this new
                revision
            revision : `mixed`
                Revision metadata

        :Returns:
            A triple of lists:

            current_tokens : `list` ( :class:`~mwpersistence.Token` )
                A sequence of Tokens representing the revision that was just
                processed.
            tokens_added : `list` ( :class:`~mwpersistence.Token` )
                Tokens that were added while updating state.
            tokens_removed : `list` ( :class:`~mwpersistence.Token` )
                Tokens that were removed while updating state.
        """
        return self._update(checksum=checksum, opdocs=opdocs,
                            revision=revision)

    def _update(self, text=None, checksum=None, opdocs=None, revision=None):
        """
        Shared implementation behind :meth:`update` and
        :meth:`update_opdocs`.

        When no `checksum` is given, a SHA1 hex digest of the UTF-8 encoded
        `text` is used.  If the revert detector reports a revert, the token
        list of the reverted-to version is reused and nothing is reported as
        added or removed.  Otherwise the transition is built from `opdocs`
        when they are provided, or from a diff of `text` when they are not.

        :Returns:
            A ``(current_tokens, tokens_added, tokens_removed)`` triple.

        :Raises:
            TypeError
                If neither `text` nor `checksum` is given.
            RuntimeError
                If raw text has to be diffed but no `diff_engine` was
                configured.
        """
        if checksum is None:
            if text is None:
                raise TypeError("Either 'text' or 'checksum' must be " +
                                "specified.")
            checksum = sha1(bytes(text, 'utf8')).hexdigest()

        current_version = Version()

        revert = self.revert_detector.process(checksum, current_version)
        if revert is not None:  # Revert
            # Arguments are passed lazily so the message is only built when
            # debug logging is enabled.
            logger.debug("Revert detected between %s and %s",
                         revert.reverting, revert.reverted_to)

            # Extract reverted_to revision
            current_version.tokens = revert.reverted_to.tokens

            # Update diff_processor state
            if self.diff_processor is not None:
                self.diff_processor.update(last_tokens=current_version.tokens)

            transition = current_version.tokens, [], []
        elif opdocs is not None:
            transition = apply_opdocs(opdocs, self.last.tokens or [])
            current_version.tokens, _, _ = transition
        else:
            # NOTICE: HEAVY COMPUTATION HERE!!!
            #
            # Diffs usually run in O(n^2) -- O(n^3) time and most
            # tokenizers produce a lot of tokens.
            if self.diff_processor is None:
                raise RuntimeError("DiffState cannot process raw text " +
                                   "without a diff_engine specified.")
            operations, _, current_tokens = \
                self.diff_processor.process(text, token_class=Token)

            transition = apply_operations(operations,
                                          self.last.tokens or [],
                                          current_tokens)
            current_version.tokens, _, _ = transition

        # Record persistence
        persist_revision_once(current_version.tokens, revision)

        # Update last version
        self.last = current_version

        # Return the transitioned state
        return transition
def persist_revision_once(tokens, revision):
    """
    This function makes sure that a revision is only marked as persisting
    for a token once.  This is important since some diff algorithms allow
    tokens to be copied more than once in a revision.  The id(token) should
    be unique to the in-memory representation of any object, so we use that
    as a unique token instance identifier.

    :Parameters:
        tokens : `iterable`
            Token objects exposing a ``persist(revision)`` method.  The same
            object may appear more than once.
        revision : `mixed`
            Revision metadata handed to each token's ``persist()``.
    """
    # Keying on id() collapses repeated references to the same token object.
    token_map = {id(token): token for token in tokens}
    for token in token_map.values():
        token.persist(revision)


def apply_operations(operations, a, b):
    """
    Applies a sequence of diff operation objects to build the token lists
    for a new revision.

    :Parameters:
        operations : `iterable`
            Operation objects with a ``name`` attribute ("insert",
            "replace", "delete" or "equal") and ``a1``/``a2``/``b1``/``b2``
            slice bounds.  Operations with any other name are ignored.
        a : `list`
            Tokens of the previous revision; indexed by ``a1:a2``.
        b : `list`
            Tokens of the new revision; indexed by ``b1:b2``.

    :Returns:
        A ``(tokens, tokens_added, tokens_removed)`` triple of lists.
    """
    tokens = []
    tokens_added = []
    tokens_removed = []

    for op in operations:
        if op.name in ("replace", "insert"):
            new_tokens = b[op.b1:op.b2]
            tokens.extend(new_tokens)
            tokens_added.extend(new_tokens)

        # Deliberately a separate ``if``: a "replace" both adds new tokens
        # (above) and removes old ones (here).
        if op.name in ("replace", "delete"):
            tokens_removed.extend(a[op.a1:op.a2])
        elif op.name == "equal":
            tokens.extend(a[op.a1:op.a2])

    return (tokens, tokens_added, tokens_removed)


def apply_opdocs(op_docs, a, token_class=Token):
    """
    Applies a sequence of operation documents (dicts) to build the token
    lists for a new revision.

    :Parameters:
        op_docs : `iterable` ( `dict` )
            Operation documents with a ``'name'`` key ("insert", "replace",
            "delete" or "equal").  "insert" and "replace" documents are read
            for ``'tokens'``; "replace", "delete" and "equal" documents are
            read for ``'a1'`` and ``'a2'``.  Documents with any other name
            are ignored.
        a : `list`
            Tokens of the previous revision; indexed by ``a1:a2``.
        token_class : `callable`
            Called once with each entry of an operation's ``'tokens'`` to
            construct the new token objects.

    :Returns:
        A ``(tokens, tokens_added, tokens_removed)`` triple of lists.
    """
    tokens = []
    tokens_added = []
    tokens_removed = []

    for op_doc in op_docs:
        if op_doc['name'] in ("replace", "insert"):
            new_tokens = [token_class(s) for s in op_doc['tokens']]
            tokens.extend(new_tokens)
            tokens_added.extend(new_tokens)

        # Deliberately a separate ``if``: a "replace" both adds new tokens
        # (above) and removes old ones (here).
        if op_doc['name'] in ("replace", "delete"):
            tokens_removed.extend(a[op_doc['a1']:op_doc['a2']])
        elif op_doc['name'] == "equal":
            tokens.extend(a[op_doc['a1']:op_doc['a2']])

    return (tokens, tokens_added, tokens_removed)