import logging
from hashlib import sha1
import mwreverts
from .token import Token
logger = logging.getLogger(__name__)
class Version:
    """
    A lightweight container holding the token list for a single revision.

    ``tokens`` is ``None`` until a revision has been processed.
    """
    __slots__ = ('tokens', )

    def __init__(self, tokens=None):
        self.tokens = tokens
class State:
    """
    Abstract base for revision state objects that track the persistence of
    tokens through a history of revisions.  Subclasses are commonly used to
    process the revisions of a page in chronological order.
    """

    def update(self, text, revision=None):
        # Subclasses must implement the transition function.
        raise NotImplementedError()
class DiffState(State):
    """
    Constructs a state object with a diff-based transition function.

    :Parameters:
        diff_engine : :class:`deltas.DiffEngine`
            A "diff engine" processor for sequentially diffing text
        revert_radius : int
            a positive integer indicating the maximum revision distance
            that a revert can span.
        revert_detector : :class:`mwreverts.Detector`
            A revert detector.

    :Example:
        >>> import mwpersistence
        >>> import deltas
        >>>
        >>> state = mwpersistence.DiffState(deltas.SegmentMatcher())
        >>>
        >>> print(state.update("Apples are red.", revision=1))
        ([Token(text='Apples', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='are', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='red', revisions=[1]),
          Token(text='.', revisions=[1])],
         [Token(text='Apples', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='are', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='red', revisions=[1]),
          Token(text='.', revisions=[1])],
         [])
        >>> print(state.update("Apples are blue.", revision=2))
        ([Token(text='Apples', revisions=[1, 2]),
          Token(text=' ', revisions=[1, 2]),
          Token(text='are', revisions=[1, 2]),
          Token(text=' ', revisions=[1, 2]),
          Token(text='blue', revisions=[2]),
          Token(text='.', revisions=[1, 2])],
         [Token(text='blue', revisions=[2])],
         [Token(text='red', revisions=[1])])
        >>> print(state.update("Apples are red.", revision=3))  # A revert!
        ([Token(text='Apples', revisions=[1, 2, 3]),
          Token(text=' ', revisions=[1, 2, 3]),
          Token(text='are', revisions=[1, 2, 3]),
          Token(text=' ', revisions=[1, 2, 3]),
          Token(text='red', revisions=[1, 3]),
          Token(text='.', revisions=[1, 2, 3])],
         [],
         [])
    """

    class Version:
        # NOTE(review): this nested class shadows the module-level Version
        # in name only -- methods below resolve `Version()` to the
        # module-level class (class scope is not visible inside method
        # bodies), so this definition appears unused.  Kept as
        # `DiffState.Version` for backwards compatibility.
        __slots__ = ('tokens',)

        def __init__(self):
            self.tokens = None

    def __init__(self, diff_engine=None, revert_radius=None,
                 revert_detector=None):
        if diff_engine is not None:
            if not hasattr(diff_engine, 'process'):
                # BUG FIX: previously `.format()` bound only to the second
                # string literal of the `+` concatenation, so "{0}" was
                # never substituted and the message contained a literal
                # "{0}".  Implicit literal concatenation fixes precedence.
                raise TypeError(
                    "'diff_engine' of type {0} does not have a "
                    "process() method.".format(type(diff_engine)))
            else:
                self.diff_engine = diff_engine
                self.diff_processor = self.diff_engine.processor()
        else:
            self.diff_engine, self.diff_processor = None, None

        # Either pass a detector or the revert radius so I can make one
        if revert_detector is None and revert_radius is None:
            raise TypeError("Either a 'revert_detector' or a " +
                            "'revert_radius' must be provided.")

        if revert_detector is None:
            self.revert_detector = mwreverts.Detector(int(revert_radius))
        else:
            self.revert_detector = revert_detector

        # Stores the last version's tokens for diffing against the next one
        self.last = Version()

    def update(self, text, revision=None):
        """
        Modifies the internal state based a change to the content and returns
        the sets of words added and removed.

        :Parameters:
            text : str
                The text content of a revision
            revision : `mixed`
                Revision metadata

        :Returns:
            A triple of lists:

            current_tokens : `list` ( :class:`~mwpersistence.Token` )
                A sequence of Tokens representing the revision that was just
                processed.
            tokens_added : `list` ( :class:`~mwpersistence.Token` )
                Tokens that were added while updating state.
            tokens_removed : `list` ( :class:`~mwpersistence.Token` )
                Tokens that were removed while updating state.
        """
        return self._update(text=text, revision=revision)

    def update_opdocs(self, checksum, opdocs, revision=None):
        """
        Modifies the internal state based a change to the content and returns
        the sets of words added and removed.

        :Parameters:
            checksum : `hashable`
                A checksum generated from the text of a revision
            opdocs : `iterable` ( `dict` )
                A sequence of operations that represent the diff of this new
                revision
            revision : `mixed`
                Revision metadata

        :Returns:
            A triple of lists:

            current_tokens : `list` ( :class:`~mwpersistence.Token` )
                A sequence of Tokens representing the revision that was just
                processed.
            tokens_added : `list` ( :class:`~mwpersistence.Token` )
                Tokens that were added while updating state.
            tokens_removed : `list` ( :class:`~mwpersistence.Token` )
                Tokens that were removed while updating state.
        """
        return self._update(checksum=checksum, opdocs=opdocs,
                            revision=revision)

    def _update(self, text=None, checksum=None, opdocs=None, revision=None):
        # Shared transition function behind update() and update_opdocs().
        if checksum is None:
            if text is None:
                raise TypeError("Either 'text' or 'checksum' must be " +
                                "specified.")
            else:
                checksum = sha1(bytes(text, 'utf8')).hexdigest()

        current_version = Version()

        # Let the revert detector see this checksum; a non-None result means
        # this revision is identical to a recent prior state.
        revert = self.revert_detector.process(checksum, current_version)
        if revert is not None:  # Revert
            logger.debug("Revert detected between {0} and {1}"
                         .format(revert.reverting, revert.reverted_to))
            # Extract reverted_to revision -- reuse its token objects so
            # their persistence history carries over.
            current_version.tokens = revert.reverted_to.tokens

            # Update diff_processor state so subsequent diffs are computed
            # against the reverted-to text.
            if self.diff_processor is not None:
                self.diff_processor.update(last_tokens=current_version.tokens)

            # A revert adds and removes nothing.
            transition = current_version.tokens, [], []
        else:
            if opdocs is not None:
                # Pre-computed diff operations: no diffing needed.
                transition = apply_opdocs(opdocs, self.last.tokens or [])
                current_version.tokens, _, _ = transition
            else:
                # NOTICE: HEAVY COMPUTATION HERE!!!
                #
                # Diffs usually run in O(n^2) -- O(n^3) time and most
                # tokenizers produce a lot of tokens.
                if self.diff_processor is None:
                    raise RuntimeError("DiffState cannot process raw text " +
                                       "without a diff_engine specified.")
                operations, _, current_tokens = \
                    self.diff_processor.process(text, token_class=Token)
                transition = apply_operations(operations,
                                              self.last.tokens or [],
                                              current_tokens)
                current_version.tokens, _, _ = transition

        # Record persistence (each distinct token at most once per revision)
        persist_revision_once(current_version.tokens, revision)

        # Update last version
        self.last = current_version

        # Return the transitioned state
        return transition
def persist_revision_once(tokens, revision):
    """
    Mark `revision` as persisting for each distinct token instance in
    `tokens` exactly once.

    Some diff algorithms allow the same token object to be copied into a
    revision more than once, so we deduplicate by ``id(token)`` -- unique to
    the in-memory representation of any object -- before calling
    ``token.persist(revision)``.
    """
    seen = set()
    for token in tokens:
        identity = id(token)
        if identity not in seen:
            seen.add(identity)
            token.persist(revision)
def apply_operations(operations, a, b):
    """
    Apply diff `operations` describing the transition from token sequence
    `a` to token sequence `b`.

    Returns a triple ``(tokens, tokens_added, tokens_removed)`` where
    `tokens` is the resulting token sequence, `tokens_added` are the tokens
    newly drawn from `b`, and `tokens_removed` are the tokens dropped
    from `a`.
    """
    result, added, removed = [], [], []
    for operation in operations:
        kind = operation.name
        if kind == "equal":
            # Unchanged span: carry the existing token objects forward so
            # their persistence history is preserved.
            result.extend(a[operation.a1:operation.a2])
        else:
            if kind in ("replace", "insert"):
                fresh = b[operation.b1:operation.b2]
                result.extend(fresh)
                added.extend(fresh)
            if kind in ("replace", "delete"):
                removed.extend(a[operation.a1:operation.a2])
    return (result, added, removed)
def apply_opdocs(op_docs, a, token_class=Token):
    """
    Apply a sequence of operation documents (dicts) describing the diff
    against token sequence `a`.

    Unlike :func:`apply_operations`, inserted token text comes from the
    op doc itself (``op_doc['tokens']``) and is wrapped in `token_class`.

    Returns a triple ``(tokens, tokens_added, tokens_removed)``.
    """
    result, added, removed = [], [], []
    for doc in op_docs:
        kind = doc['name']
        if kind == "equal":
            # Unchanged span: reuse the existing token objects from `a`.
            result.extend(a[doc['a1']:doc['a2']])
        else:
            if kind in ("replace", "insert"):
                fresh = [token_class(text) for text in doc['tokens']]
                result.extend(fresh)
                added.extend(fresh)
            if kind in ("replace", "delete"):
                removed.extend(a[doc['a1']:doc['a2']])
    return (result, added, removed)