# Source code for mwpersistence.utilities.persistence2stats

r"""
``$ mwpersistence persistence2stats -h``
::

    Generates revision-level statistics from a sequence of revision documents
    that have been infused with token persistence data.

    Usage:
        persistence2stats (-h | --help)
        persistence2stats [<input-file>...] [--min-persisted=<num>]
                          [--min-visible=<days>] [--include=<regex>]
                          [--exclude=<regex>] [--keep-tokens] [--threads=<num>]
                          [--output=<path>] [--compress=<type>] [--verbose]
                          [--debug]

    Options:
        -h --help              Print this documentation
        <input-file>           The path to a file containing persistence data.
                               [default: <stdin>]
        --min-persisted=<num>  The minimum number of revisions a token must
                               survive before being considered "persisted"
                               [default: 5]
        --min-visible=<days>   The minimum amount of time a token must survive
                               before being considered "persisted" (in days)
                               [default: 14]
        --include=<regex>      A regex matching tokens to include (case
                               insensitive) [default: <all>]
        --exclude=<regex>      A regex matching tokens to exclude (case
                               insensitive) [default: <none>]
        --keep-tokens          Do not drop 'tokens' field data from the JSON
                               document.
        --threads=<num>        If a collection of files is provided, how many
                               processor threads should be prepared?
                               [default: <cpu_count>]
        --output=<path>        Write output to a directory with one output file
                               per input path.  [default: <stdout>]
        --compress=<type>      If set, output written to the output-dir will be
                               compressed in this format. [default: bz2]
        --verbose              Print out progress information
        --debug                Print debug logging to stderr.
"""
import logging
import re
import sys
from math import log

import mwcli
import mwxml.utilities

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


def process_args(args):
    """
    Converts parsed command-line arguments into keyword arguments for
    the statistics generator.

    :Parameters:
        args : `dict`
            Parsed options.  Reads '--include', '--exclude',
            '--min-persisted', '--min-visible' and '--keep-tokens'.

    :Returns:
        A `dict` with 'min_persisted' (`int`), 'min_visible' (`float`,
        days converted to seconds), 'keep_tokens' (`bool`) and 'include' /
        'exclude' (each either `None` or a function that maps a token's text
        to `True`/`False`).
    """
    def build_matcher(option, placeholder):
        # The placeholder ("<all>" / "<none>") means "no filter requested".
        raw_pattern = args[option]
        if raw_pattern == placeholder:
            return None

        compiled = re.compile(raw_pattern, re.UNICODE | re.I)

        def matches(text):
            return bool(compiled.search(text))

        return matches

    # Patterns are compiled before the numeric options are converted, so the
    # order in which malformed input is reported stays the same.
    include_filter = build_matcher('--include', "<all>")
    exclude_filter = build_matcher('--exclude', "<none>")

    seconds_per_day = 60 * 60 * 24

    processed = {}
    processed['min_persisted'] = int(args['--min-persisted'])
    processed['min_visible'] = float(args['--min-visible']) * seconds_per_day
    processed['keep_tokens'] = bool(args['--keep-tokens'])
    processed['include'] = include_filter
    processed['exclude'] = exclude_filter
    return processed


def _persistence2stats(*args, keep_tokens, **kwargs):
    """
    Streams the documents produced by `persistence2stats`, removing the
    'tokens' data from each one unless `keep_tokens` is set.

    All positional and remaining keyword arguments are forwarded to
    `persistence2stats` unchanged.
    """
    stats_docs = persistence2stats(*args, **kwargs)
    yield from (stats_docs if keep_tokens else drop_tokens(stats_docs))


def drop_tokens(rev_docs):
    """
    Lazily removes the 'tokens' entry (when present) from the 'persistence'
    field of each revision document and yields the same document object.

    :Parameters:
        rev_docs : `iterable` ( `dict` )
            Revision documents, each with a 'persistence' field.
    """
    for doc in rev_docs:
        persistence_field = doc['persistence']
        # A default is supplied so documents without 'tokens' pass through.
        persistence_field.pop('tokens', None)
        yield doc


def persistence2stats(rev_docs, min_persisted=5, min_visible=1209600,
                      include=None, exclude=None, verbose=False):
    """
    Processes a sorted and page-partitioned sequence of revision documents
    and adds summary statistics to each document's 'persistence' field
    describing how the tokens "added" in that revision persisted through
    future revisions.

    :Parameters:
        rev_docs : `iterable` ( `dict` )
            JSON documents of revision data containing a 'persistence' field.
            That field must hold a 'tokens' list whose items carry 'text',
            'persisted', 'non_self_persisted' and 'seconds_visible'.  The
            fields 'seconds_possible', 'revisions_processed' and
            'non_self_processed' are read only when a token falls short of
            `min_visible`.  It's assumed that rev_docs are partitioned by
            page and otherwise in chronological order.
        min_persisted : `int`
            The minimum future revisions that a token must persist in order
            to be considered "persistent".
        min_visible : `int`
            The minimum number of seconds that a token must be visible in order
            to be considered "persistent".  The value is coerced with `int()`.
        include : `func`
            A function that returns `True` when a token's text should be
            included in statistical processing.  `None` includes every token.
        exclude : `func`
            A function that returns `True` when a token's text should *not* be
            included in statistical processing (Takes precedence over
            'include').  `None` excludes nothing.
        verbose : `bool`
            Writes one dot per processed token and a newline per revision to
            stderr.

    :Returns:
        A generator of the same rev_docs, each updated in place so that its
        'persistence' field additionally contains 'tokens_added',
        'persistent_tokens', 'non_self_persistent_tokens',
        'sum_log_persisted', 'sum_log_non_self_persisted',
        'sum_log_seconds_visible', 'censored' and 'non_self_censored'.
    """
    rev_docs = mwxml.utilities.normalize(rev_docs)

    min_persisted = int(min_persisted)
    min_visible = int(min_visible)
    # Missing filters default to "include everything" / "exclude nothing".
    include = include if include is not None else lambda t: True
    exclude = exclude if exclude is not None else lambda t: False

    for rev_doc in rev_docs:
        persistence_doc = rev_doc['persistence']
        stats_doc = {
            'tokens_added': 0,
            'persistent_tokens': 0,
            'non_self_persistent_tokens': 0,
            'sum_log_persisted': 0,
            'sum_log_non_self_persisted': 0,
            'sum_log_seconds_visible': 0,
            'censored': False,
            'non_self_censored': False
        }

        # Lazily select the tokens that pass both filters; exclusion wins
        # whenever both filters match.
        filtered_docs = (t for t in persistence_doc['tokens']
                         if include(t['text']) and not exclude(t['text']))
        for token_doc in filtered_docs:
            if verbose:
                sys.stderr.write(".")
                sys.stderr.flush()

            stats_doc['tokens_added'] += 1
            # The +1 keeps log() defined when a measurement is zero.
            stats_doc['sum_log_persisted'] += log(token_doc['persisted'] + 1)
            stats_doc['sum_log_non_self_persisted'] += \
                log(token_doc['non_self_persisted'] + 1)
            stats_doc['sum_log_seconds_visible'] += \
                log(token_doc['seconds_visible'] + 1)

            # Look for time threshold
            if token_doc['seconds_visible'] >= min_visible:
                # Visible for long enough: counts as persistent on both
                # measures regardless of the revision counts.
                stats_doc['persistent_tokens'] += 1
                stats_doc['non_self_persistent_tokens'] += 1
            else:
                # Look for review threshold (the boolean result adds 0 or 1)
                stats_doc['persistent_tokens'] += \
                    token_doc['persisted'] >= min_persisted

                stats_doc['non_self_persistent_tokens'] += \
                    token_doc['non_self_persisted'] >= min_persisted

                # Check for censoring: flag the revision when the time or
                # the number of revisions processed is below the threshold
                # being applied.
                if persistence_doc['seconds_possible'] < min_visible:
                    stats_doc['censored'] = True
                    stats_doc['non_self_censored'] = True

                else:
                    if persistence_doc['revisions_processed'] < min_persisted:
                        stats_doc['censored'] = True

                    if persistence_doc['non_self_processed'] < min_persisted:
                        stats_doc['non_self_censored'] = True

        if verbose:
            sys.stderr.write("\n")
            sys.stderr.flush()

        # Merge the statistics into the existing 'persistence' field.
        rev_doc['persistence'].update(stats_doc)

        yield rev_doc


# Command-line wiring: the module docstring (the usage text), the module name,
# the token-dropping wrapper and the argument processor are handed to
# mwcli.Streamer, and the resulting object's `main` is exposed as `main`.
# NOTE(review): presumably Streamer parses the usage text and feeds the
# output of process_args to _persistence2stats as keyword arguments --
# confirm against the mwcli documentation.
streamer = mwcli.Streamer(
    __doc__,
    __name__,
    _persistence2stats,
    process_args
)
main = streamer.main