Source code for mwpersistence.utilities.revdocs2stats

r"""
``$ mwpersistence revdocs2stats -h``
::

    Full pipeline from JSON revision documents to content persistence
    statistics.

    Usage:
        revdocs2stats (-h|--help)
        revdocs2stats [<input-file>...] --config=<path> --sunset=<date>
                      [--namespaces=<ids>] [--timeout=<secs>]
                      [--window=<revs>] [--revert-radius=<revs>]
                      [--min-persisted=<num>] [--min-visible=<days>]
                      [--include=<regex>] [--exclude=<regex>]
                      [--keep-text] [--keep-diff] [--keep-tokens]
                      [--threads=<num>] [--output=<path>] [--compress=<type>]
                      [--verbose] [--debug]

    Options:
        -h|--help               Print this documentation
        <input-file>            The path to a file of page-partitioned JSON
                                revision documents. [default: <stdin>]
        --config=<path>         The path to a deltas DiffEngine configuration
        --namespaces=<ids>      A comma separated list of namespace IDs to be
                                considered [default: <all>]
        --timeout=<secs>        The maximum number of seconds that a diff will
                                be allowed to run before being stopped
                                [default: 10]
        --sunset=<date>         The date of the database dump we are generating
                                from.  This is used to apply a 'time visible'
                                statistic.  Expects %Y-%m-%dT%H:%M:%SZ.
                                [default: <now>]
        --window=<revs>         The size of the window of revisions from which
                                persistence data will be generated.
                                [default: 50]
        --revert-radius=<revs>  The number of revisions back that a revert can
                                reference. [default: 15]
        --min-persisted=<num>   The minimum number of revisions a token must
                                survive before being considered "persisted"
                                [default: 5]
        --min-visible=<days>    The minimum amount of time a token must survive
                                before being considered "persisted" (in days)
                                [default: 14]
        --include=<regex>       A regex matching tokens to include
                                [default: <all>]
        --exclude=<regex>       A regex matching tokens to exclude
                                [default: <none>]
        --keep-text             If set, the 'text' field will be populated in
                                the output JSON.
        --keep-diff             If set, the 'diff' field will be populated in
                                the output JSON.
        --keep-tokens           If set, the 'tokens' field will be populated in
                                the output JSON.
        --threads=<num>         If a collection of files is provided, how many
                                processor threads should be prepared?
                                [default: <cpu_count>]
        --output=<path>         Write output to a directory with one output
                                file per input path.  [default: <stdout>]
        --compress=<type>       If set, output written to the output-dir will
                                be compressed in this format. [default: bz2]
        --verbose               Print progress information to stderr.
        --debug                 Print debug logging to stderr.
"""
import logging

import mwcli
import mwxml.utilities

import mwdiffs.utilities

from .diffs2persistence import process_args as diffs2persistence_args
from .diffs2persistence import diffs2persistence, drop_diff
from .persistence2stats import process_args as persistence2stats_args
from .persistence2stats import drop_tokens, persistence2stats

logger = logging.getLogger(__name__)


def process_args(args):
    """
    Assemble the keyword arguments for :func:`revdocs2stats` from parsed
    command-line arguments.

    Starts from the mapping returned by
    ``mwdiffs.utilities.dump2diffs_args`` and then layers on the results of
    the diffs2persistence and persistence2stats argument processors, in that
    order, so a later processor overrides any key it shares with an earlier
    one.

    :Parameters:
        args
            The parsed command-line arguments, handed unchanged to each of
            the three argument processors.

    :Returns:
        The merged mapping of keyword arguments.
    """
    stats_kwargs = mwdiffs.utilities.dump2diffs_args(args)
    for stage_args in (diffs2persistence_args, persistence2stats_args):
        stats_kwargs.update(stage_args(args))
    return stats_kwargs


def revdocs2stats(rev_docs, diff_engine, namespaces, timeout, window_size,
                  revert_radius, sunset, min_persisted, min_visible, include,
                  exclude, keep_text=False, keep_diff=False,
                  keep_tokens=False, verbose=False):
    """
    Run the full pipeline from revision documents to persistence statistics.

    Chains three stages: ``mwdiffs.utilities.revdocs2diffs``,
    ``diffs2persistence`` and ``persistence2stats``.  After each stage the
    corresponding ``drop_*`` filter is applied unless the matching ``keep_*``
    flag is set.  This function is a generator; the final stage's documents
    are yielded one at a time.

    :Parameters:
        rev_docs
            The revision documents to process (passed to ``revdocs2diffs``).
        diff_engine
            The diff engine passed to ``revdocs2diffs``.
        namespaces
            Namespace selection passed to ``revdocs2diffs``.
        timeout
            Diff timeout passed to ``revdocs2diffs``.
        window_size
            Passed to ``diffs2persistence``; presumably the ``--window``
            option of the command-line utility.
        revert_radius
            Passed to ``diffs2persistence``; presumably the
            ``--revert-radius`` option.
        sunset
            Passed to ``diffs2persistence``; presumably the ``--sunset``
            option.
        min_persisted
            Passed to ``persistence2stats``; presumably the
            ``--min-persisted`` option.
        min_visible
            Passed to ``persistence2stats``; presumably the
            ``--min-visible`` option.
        include
            Passed to ``persistence2stats``; presumably the ``--include``
            option.
        exclude
            Passed to ``persistence2stats``; presumably the ``--exclude``
            option.
        keep_text : `bool`
            If False (the default), ``mwdiffs.utilities.drop_text`` is
            applied to the diff documents.
        keep_diff : `bool`
            If False (the default), ``drop_diff`` is applied to the
            persistence documents.
        keep_tokens : `bool`
            If False (the default), ``drop_tokens`` is applied to the
            statistics documents.
        verbose : `bool`
            Forwarded to ``diffs2persistence`` as its ``verbose`` keyword.

    :Returns:
        A generator of the statistics documents produced by the last stage.
    """
    # Stage 1: revision documents -> diff documents.
    diff_docs = mwdiffs.utilities.revdocs2diffs(rev_docs, diff_engine,
                                                namespaces, timeout)
    if not keep_text:
        diff_docs = mwdiffs.utilities.drop_text(diff_docs)

    # Stage 2: diff documents -> persistence documents.
    persistence_docs = diffs2persistence(
        diff_docs, window_size, revert_radius, sunset, verbose=verbose)
    if not keep_diff:
        persistence_docs = drop_diff(persistence_docs)

    # Stage 3: persistence documents -> statistics documents.
    stats_docs = persistence2stats(
        persistence_docs, min_persisted, min_visible, include, exclude)
    if not keep_tokens:
        stats_docs = drop_tokens(stats_docs)

    yield from stats_docs
# Command-line wiring: the Streamer is built from this module's usage
# docstring and name, the pipeline function, and the argument processor.
streamer = mwcli.Streamer(
    __doc__,
    __name__,
    revdocs2stats,
    process_args
)
# Expose the streamer's ``main`` at module level.
main = streamer.main