# Source code for mwpersistence.utilities.dump2stats

r"""
``$ mwpersistence dump2stats -h``
::

    Full pipeline from MediaWiki XML dumps to content persistence statistics.

    Usage:
        dump2stats (-h|--help)
        dump2stats [<input-file>...] --config=<path> --sunset=<date>
                   [--namespaces=<ids>] [--timeout=<secs>]
                   [--window=<revs>] [--revert-radius=<revs>]
                   [--min-persisted=<num>] [--min-visible=<days>]
                   [--include=<regex>] [--exclude=<regex>]
                   [--keep-text] [--keep-diff] [--keep-tokens]
                   [--threads=<num>] [--output=<path>] [--compress=<type>]
                   [--verbose] [--debug]

    Options:
        -h|--help               Print this documentation
        <input-file>            The path to a MediaWiki XML Dump file
                                [default: <stdin>]
        --config=<path>         The path to a deltas DiffEngine configuration
        --namespaces=<ids>      A comma separated list of namespace IDs to be
                                considered [default: <all>]
        --timeout=<secs>        The maximum number of seconds that a diff will
                                be allowed to run before being stopped
                                [default: 10]
        --sunset=<date>         The date of the database dump we are generating
                                from.  This is used to apply a 'time visible'
                                statistic.  Expects "%Y-%m-%dT%H:%M:%SZ".
                                [default: <now>]
        --window=<revs>         The size of the window of revisions from which
                                persistence data will be generated.
                                [default: 50]
        --revert-radius=<revs>  The number of revisions back that a revert can
                                reference. [default: 15]
        --min-persisted=<num>   The minimum number of revisions a token must
                                survive before being considered "persisted"
                                [default: 5]
        --min-visible=<days>    The minimum amount of time a token must survive
                                before being considered "persisted" (in days)
                                [default: 14]
        --include=<regex>       A regex matching tokens to include
                                [default: <all>]
        --exclude=<regex>       A regex matching tokens to exclude
                                [default: <none>]
        --keep-text             If set, the 'text' field will be populated in
                                the output JSON.
        --keep-diff             If set, the 'diff' field will be populated in
                                the output JSON.
        --keep-tokens           If set, the 'tokens' field will be populated in
                                the output JSON.
        --threads=<num>         If a collection of files is provided, how many
                                processor threads should be prepared?
                                [default: <cpu_count>]
        --output=<path>         Write output to a directory with one output
                                file per input path.  [default: <stdout>]
        --compress=<type>       If set, output written to the output-dir will
                                be compressed in this format. [default: bz2]
        --verbose               Print progress information to stderr.
        --debug                 Print debug logging to stderr.
"""
import logging

import mwcli
import mwxml

from .revdocs2stats import process_args as revdocs2stats_args
from .revdocs2stats import revdocs2stats

logger = logging.getLogger(__name__)


def dump2stats(dump, *args, **kwargs):
    """
    Generate content persistence statistics documents from an XML dump.

    Converts `dump` into revision documents with
    `mwxml.utilities.dump2revdocs` and passes them through `revdocs2stats`,
    yielding every document that comes out.

    :Parameters:
        dump
            The dump to process; handed directly to
            `mwxml.utilities.dump2revdocs`.
        args, kwargs
            Forwarded unchanged to `revdocs2stats`.

    :Returns:
        A generator over the documents produced by `revdocs2stats`.  Being
        a generator, no work is done until it is iterated.
    """
    rev_docs = mwxml.utilities.dump2revdocs(dump)
    stats_docs = revdocs2stats(rev_docs, *args, **kwargs)
    yield from stats_docs
# Command-line wiring: the Streamer receives the module docstring (the usage
# text above), the module name, the processing function, the argument
# processor reused from revdocs2stats, and a reader that opens each input
# file via mwxml.Dump.from_file.
streamer = mwcli.Streamer(
    __doc__,
    __name__,
    dump2stats,
    revdocs2stats_args,
    file_reader=mwxml.Dump.from_file,
)

# Expose the streamer's main as the module-level `main`.
main = streamer.main