Source code for tacl.command.utils
"""Functions useful when writing command-line scripts that interact
with tacl."""
import logging
import colorlog
import tacl
from tacl import constants
def add_common_arguments(parser):
    """Register the options shared by every command on `parser`.

    At present this is only the repeatable ``-v``/``--verbose`` flag,
    whose occurrences are counted.

    """
    parser.add_argument(
        '-v', '--verbose', action='count', help=constants.VERBOSE_HELP)
def add_corpus_arguments(parser):
    """Register on `parser` the arguments needed by corpus-based commands.

    The tokenizer option is added first, followed by the positional
    ``CORPUS`` argument.

    """
    add_tokenizer_argument(parser)
    parser.add_argument(
        'corpus', metavar='CORPUS', help=constants.DB_CORPUS_HELP)
def add_db_arguments(parser, db_option=False):
    """Register on `parser` the arguments shared by database sub-commands.

    The ``--memory`` flag and the integer ``--ram`` option (default 3)
    are always added. The database itself is added either as a
    positional ``DATABASE`` argument or, when `db_option` is true, as
    the required option ``-d``/``--db``.

    `db_option` exists to work around
    https://bugs.python.org/issue9338, whereby a positional argument
    that follows an optional argument with nargs='+' is not
    recognised.

    """
    parser.add_argument(
        '-m', '--memory', action='store_true',
        help=constants.DB_MEMORY_HELP)
    parser.add_argument(
        '-r', '--ram', type=int, default=3, help=constants.DB_RAM_HELP)
    # Both forms of the database argument share the same help and metavar.
    db_kwargs = {'help': constants.DB_DATABASE_HELP, 'metavar': 'DATABASE'}
    if not db_option:
        parser.add_argument('db', **db_kwargs)
        return
    parser.add_argument('-d', '--db', required=True, **db_kwargs)
def add_query_arguments(parser):
    """Register on `parser` the arguments shared by query sub-commands.

    This is the positional ``CATALOGUE`` argument.

    """
    parser.add_argument(
        'catalogue', metavar='CATALOGUE',
        help=constants.CATALOGUE_CATALOGUE_HELP)
def add_supplied_query_arguments(parser):
    """Register on `parser` the arguments for supplied query sub-commands.

    Both ``-l``/``--labels`` and ``-s``/``--supplied`` are required and
    each accepts one or more values.

    """
    parser.add_argument(
        '-l', '--labels', nargs='+', required=True,
        help=constants.SUPPLIED_LABELS_HELP)
    parser.add_argument(
        '-s', '--supplied', metavar='RESULTS', nargs='+', required=True,
        help=constants.SUPPLIED_RESULTS_HELP)
def add_tokenizer_argument(parser):
    """Register the ``-t``/``--tokenizer`` option on `parser`.

    Accepted values are those in `constants.TOKENIZER_CHOICES`; when
    the option is omitted, `constants.TOKENIZER_CHOICE_CBETA` is used.

    """
    parser.add_argument(
        '-t', '--tokenizer',
        help=constants.DB_TOKENIZER_HELP,
        default=constants.TOKENIZER_CHOICE_CBETA,
        choices=constants.TOKENIZER_CHOICES)
def get_corpus(args):
    """Build a `tacl.Corpus` from the parsed arguments `args`.

    The corpus is created from `args.corpus` and the tokenizer
    selected by `args` (see `get_tokenizer`).

    """
    chosen_tokenizer = get_tokenizer(args)
    corpus_location = args.corpus
    return tacl.Corpus(corpus_location, chosen_tokenizer)
def get_catalogue(path):
    """Create a `tacl.Catalogue`, load `path` into it, and return it."""
    loaded = tacl.Catalogue()
    loaded.load(path)
    return loaded
def get_data_store(args):
    """Build a `tacl.DataStore` from the parsed arguments `args`.

    The store is constructed from `args.db`, `args.memory` and
    `args.ram`, in that order.

    """
    database = args.db
    use_memory = args.memory
    ram = args.ram
    return tacl.DataStore(database, use_memory, ram)
def get_ngrams(path):
    """Read the n-grams stored one per line in the file at `path`.

    The file is read as UTF-8. Each line is stripped of surrounding
    whitespace; blank lines are kept and yield empty strings.

    """
    ngrams = []
    with open(path, encoding='utf-8') as source:
        for line in source:
            ngrams.append(line.strip())
    return ngrams
def get_tokenizer(args):
    """Build the `tacl.Tokenizer` selected by `args.tokenizer`.

    The entry for `args.tokenizer` in `constants.TOKENIZERS` is
    unpacked as the positional arguments of the tokenizer.

    """
    tokenizer_args = constants.TOKENIZERS[args.tokenizer]
    return tacl.Tokenizer(*tokenizer_args)