Source code for mwpersistence.utilities.persistence2stats
r"""
``$ mwpersistence persistence2stats -h``
::
Generates revision-level statistics from a sequence of revision documents
that have been infused with token persistence data.
Usage:
persistence2stats (-h | --help)
persistence2stats [<input-file>...] [--min-persisted=<num>]
[--min-visible=<days>] [--include=<regex>]
[--exclude=<regex>] [--keep-tokens] [--threads=<num>]
[--output=<path>] [--compress=<type>] [--verbose]
[--debug]
Options:
-h --help Print this documentation
<input-file> The path to a file containing persistence data.
[default: <stdin>]
--min-persisted=<num> The minimum number of revisions a token must
survive before being considered "persisted"
[default: 5]
--min-visible=<days> The minimum amount of time a token must survive
before being considered "persisted" (in days)
[default: 14]
--include=<regex> A regex matching tokens to include (case
insensitive) [default: <all>]
--exclude=<regex> A regex matching tokens to exclude (case
insensitive) [default: <none>]
--keep-tokens Do not drop 'tokens' field data from the JSON
document.
--threads=<num> If a collection of files is provided, how many
processor threads should be prepared?
[default: <cpu_count>]
--output=<path> Write output to a directory with one output file
per input path. [default: <stdout>]
--compress=<type> If set, output written to the output-dir will be
compressed in this format. [default: bz2]
--verbose Print out progress information
--debug Print debug logging to stderr.
"""
import logging
import re
import sys
from math import log
import mwcli
import mwxml.utilities
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
def process_args(args):
    """
    Translates parsed command-line arguments into the keyword arguments
    used by the statistics generator.

    :Parameters:
        args : `dict`
            Parsed options.  The keys '--include', '--exclude',
            '--min-persisted', '--min-visible' and '--keep-tokens' are read.

    :Returns:
        A `dict` with the keys 'min_persisted' (`int`), 'min_visible'
        (`float`, the day count converted to seconds), 'keep_tokens'
        (`bool`), 'include' and 'exclude'.  The latter two are `None` when
        the option holds its placeholder value ("<all>" / "<none>");
        otherwise each is a function that takes a token's text and returns
        `True` when the case-insensitive regex matches anywhere in it.
    """
    def _matcher_for(option, placeholder):
        # The placeholder value means "no filter was requested".
        raw_pattern = args[option]
        if raw_pattern == placeholder:
            return None

        compiled = re.compile(raw_pattern, re.UNICODE | re.I)

        def matches(token_text):
            return bool(compiled.search(token_text))

        return matches

    include = _matcher_for('--include', "<all>")
    exclude = _matcher_for('--exclude', "<none>")

    seconds_per_day = 60 * 60 * 24

    kwargs = {}
    kwargs['min_persisted'] = int(args['--min-persisted'])
    kwargs['min_visible'] = float(args['--min-visible']) * seconds_per_day
    kwargs['keep_tokens'] = bool(args['--keep-tokens'])
    kwargs['include'] = include
    kwargs['exclude'] = exclude
    return kwargs
def _persistence2stats(*args, keep_tokens, **kwargs):
    """
    Generates statistics documents via `persistence2stats`, removing the
    'tokens' data from each document's 'persistence' field unless
    `keep_tokens` is set.

    All positional and remaining keyword arguments are forwarded to
    `persistence2stats` unchanged.
    """
    stats_docs = persistence2stats(*args, **kwargs)
    # Only wrap the stream in the token-dropping generator when asked to.
    yield from (stats_docs if keep_tokens else drop_tokens(stats_docs))
def drop_tokens(rev_docs):
    """
    Yields each revision document after removing the 'tokens' entry (when
    present) from its 'persistence' field.  Documents are modified in place
    and yielded in their original order.
    """
    for doc in rev_docs:
        persistence = doc['persistence']
        if 'tokens' in persistence:
            del persistence['tokens']
        yield doc
def persistence2stats(rev_docs, min_persisted=5, min_visible=1209600,
                      include=None, exclude=None, verbose=False):
    """
    Adds revision-level persistence statistics to the 'persistence' field
    of each revision document, summarizing how the tokens recorded for that
    revision persisted.

    :Parameters:
        rev_docs : `iterable` ( `dict` )
            Revision documents.  Each must carry a 'persistence' field with
            a 'tokens' list; every token document is read for 'text',
            'persisted', 'non_self_persisted' and 'seconds_visible'.  The
            'persistence' field's 'seconds_possible', 'revisions_processed'
            and 'non_self_processed' values are read for the censoring
            check.  The sequence is passed through
            ``mwxml.utilities.normalize`` before processing.
        min_persisted : `int`
            The minimum number of revisions that a token must persist
            through in order to be considered "persistent".
        min_visible : `int`
            The minimum number of seconds that a token must be visible in
            order to be considered "persistent".  Coerced with `int`.
        include : `func`
            A function that takes a token's text and returns `True` when
            the token should be included in statistical processing.
            `None` includes every token.
        exclude : `func`
            A function that takes a token's text and returns `True` when
            the token should *not* be included in statistical processing
            (takes precedence over 'include').  `None` excludes nothing.
        verbose : `bool`
            Writes one "." per processed token and a newline per revision
            to stderr.

    :Returns:
        A generator of the rev_docs whose 'persistence' field has been
        updated in place with:

        * 'tokens_added' -- the number of tokens that passed the filters
        * 'persistent_tokens' / 'non_self_persistent_tokens' -- the number
          of those tokens that met the visibility threshold or the
          respective revision-count threshold
        * 'sum_log_persisted', 'sum_log_non_self_persisted',
          'sum_log_seconds_visible' -- sums of ``log(value + 1)``
        * 'censored' / 'non_self_censored' -- flags set when the observed
          time or revision counts fall below the thresholds
    """
    rev_docs = mwxml.utilities.normalize(rev_docs)

    min_persisted = int(min_persisted)
    min_visible = int(min_visible)
    # Missing filters default to "include everything, exclude nothing".
    include = include if include is not None else lambda t: True
    exclude = exclude if exclude is not None else lambda t: False

    for rev_doc in rev_docs:
        persistence_doc = rev_doc['persistence']
        stats_doc = {
            'tokens_added': 0,
            'persistent_tokens': 0,
            'non_self_persistent_tokens': 0,
            'sum_log_persisted': 0,
            'sum_log_non_self_persisted': 0,
            'sum_log_seconds_visible': 0,
            'censored': False,
            'non_self_censored': False
        }

        # Lazily select the tokens that pass both filters.
        filtered_docs = (t for t in persistence_doc['tokens']
                         if include(t['text']) and not exclude(t['text']))
        for token_doc in filtered_docs:
            if verbose:
                sys.stderr.write(".")
                sys.stderr.flush()

            stats_doc['tokens_added'] += 1
            # The +1 keeps the logarithm defined when a value is zero.
            stats_doc['sum_log_persisted'] += log(token_doc['persisted'] + 1)
            stats_doc['sum_log_non_self_persisted'] += \
                log(token_doc['non_self_persisted'] + 1)
            stats_doc['sum_log_seconds_visible'] += \
                log(token_doc['seconds_visible'] + 1)

            # Look for time threshold
            if token_doc['seconds_visible'] >= min_visible:
                stats_doc['persistent_tokens'] += 1
                stats_doc['non_self_persistent_tokens'] += 1
            else:
                # Look for review threshold.  Adding the comparison result
                # increments the counter by one only when it is True.
                stats_doc['persistent_tokens'] += \
                    token_doc['persisted'] >= min_persisted

                stats_doc['non_self_persistent_tokens'] += \
                    token_doc['non_self_persisted'] >= min_persisted

                # Check for censoring
                # NOTE(review): this check is placed inside the
                # review-threshold branch, so the flags are only raised when
                # at least one token misses the time threshold -- confirm
                # this nesting against the upstream file.
                if persistence_doc['seconds_possible'] < min_visible:
                    stats_doc['censored'] = True
                    stats_doc['non_self_censored'] = True

                else:
                    if persistence_doc['revisions_processed'] < min_persisted:
                        stats_doc['censored'] = True

                    if persistence_doc['non_self_processed'] < min_persisted:
                        stats_doc['non_self_censored'] = True

        if verbose:
            sys.stderr.write("\n")
            sys.stderr.flush()

        rev_doc['persistence'].update(stats_doc)

        yield rev_doc
# Build the mwcli streamer from this module's usage text, its name, the
# document-processing generator and the argument processor, then expose the
# streamer's `main` as the module-level `main`.
streamer = mwcli.Streamer(__doc__, __name__, _persistence2stats, process_args)
main = streamer.main