Source code for tacl.text

"""Module containing the Text and WitnessText classes."""

import collections
import hashlib
import os.path
import re


class Text:

    """Class for base text functionality (getting tokens, generating
    n-grams).

    Used for (snippets of) texts that are not witnesses.

    """

    def __init__(self, content, tokenizer):
        self._content = content
        self._tokenizer = tokenizer
    def get_content(self):
        """Returns the content of this text.

        :rtype: `str`

        """
        return self._content
    def get_ngrams(self, minimum, maximum, skip_sizes=None):
        """Returns a generator supplying the n-grams (`minimum` <= n
        <= `maximum`) for this text.

        Each iteration of the generator supplies a tuple consisting
        of the size of the n-grams and a `collections.Counter` of the
        n-grams.

        :param minimum: minimum n-gram size
        :type minimum: `int`
        :param maximum: maximum n-gram size
        :type maximum: `int`
        :param skip_sizes: n-gram sizes to skip over
        :type skip_sizes: `list` of `int`
        :rtype: `generator`

        """
        skip_sizes = skip_sizes or []
        tokens = self.get_tokens()
        for size in range(minimum, maximum + 1):
            if size not in skip_sizes:
                ngrams = collections.Counter(self._ngrams(tokens, size))
                yield (size, ngrams)
    def get_token_content(self):
        """Returns a string of the tokens in this text joined using
        the tokenizer joiner string.

        :rtype: `str`

        """
        return self._tokenizer.joiner.join(self.get_tokens())
    def get_tokens(self):
        """Returns a list of tokens in this text.

        :rtype: `list` of `str`

        """
        return self._tokenizer.tokenize(self._content)
    def _ngrams(self, sequence, degree):
        """Returns the n-grams generated from `sequence`.

        Based on the ngrams function from the Natural Language
        Toolkit.

        Each n-gram in the returned list is a string with whitespace
        removed.

        :param sequence: the source data to be converted into n-grams
        :type sequence: sequence
        :param degree: the degree of the n-grams
        :type degree: `int`
        :rtype: `list` of `str`

        """
        count = max(0, len(sequence) - degree + 1)
        # The extra split and join are due to having to handle
        # whitespace within a CBETA token (eg, [(禾*尤)\n/上/日]).
        return [self._tokenizer.joiner.join(
            self._tokenizer.joiner.join(sequence[i:i+degree]).split())
            for i in range(count)]
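
A minimal usage sketch for Text follows. The SimpleTokenizer below is a hypothetical stand-in for tacl's real tokenizer, included only so the example is self-contained: Text only requires an object exposing a tokenize() method and a joiner attribute.

# Hypothetical stand-in tokenizer: splits content on whitespace and
# joins n-gram tokens with a single space.
class SimpleTokenizer:

    joiner = ' '

    def tokenize(self, content):
        return content.split()


text = Text('the quick brown fox jumps', SimpleTokenizer())
for size, ngrams in text.get_ngrams(2, 3):
    print(size, ngrams.most_common(2))
# 2 [('the quick', 1), ('quick brown', 1)]
# 3 [('the quick brown', 1), ('quick brown fox', 1)]
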
class WitnessText(Text):

    """Class for the text of a witness.

    A witness has a work name and a siglum, and has a corresponding
    filename.

    """

    def __init__(self, name, siglum, content, tokenizer):
        super().__init__(content, tokenizer)
        self._name = name
        self._siglum = siglum
        self._filename = self.assemble_filename(name, siglum)

    @staticmethod
    def assemble_filename(name, siglum):
        # The filename is the work name directory plus "<siglum>.txt".
        return os.path.join(name, siglum + '.txt')
    def get_checksum(self):
        """Returns the checksum for the content of this text.

        :rtype: `str`

        """
        return hashlib.md5(self._content.encode('utf-8')).hexdigest()
    def get_filename(self):
        """Returns the filename of this text.

        :rtype: `str`

        """
        return self._filename
    def get_names(self):
        """Returns the name and siglum of this text.

        :rtype: `tuple`

        """
        return self._name, self._siglum
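
A brief sketch of the witness-specific methods, again using the hypothetical SimpleTokenizer defined above; the work name 'T0001' and siglum 'A' are made up for illustration.

witness = WitnessText('T0001', 'A', '如是我聞', SimpleTokenizer())
print(witness.get_filename())   # T0001/A.txt (path separator varies by OS)
print(witness.get_names())      # ('T0001', 'A')
print(witness.get_checksum())   # MD5 hex digest of the UTF-8 encoded content
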
class FilteredWitnessText(WitnessText):

    """Class for the text of a witness that supplies only those
    n-grams that contain at least one of a supplied list of
    n-grams."""

    @staticmethod
    def get_filter_ngrams_pattern(filter_ngrams):
        """Returns a compiled regular expression matching on any of
        the n-grams in `filter_ngrams`.

        :param filter_ngrams: n-grams to use in regular expression
        :type filter_ngrams: `list` of `str`
        :rtype: `_sre.SRE_Pattern`

        """
        return re.compile('|'.join([re.escape(ngram) for ngram in
                                    filter_ngrams]))
    def get_ngrams(self, minimum, maximum, filter_ngrams):
        """Returns a generator supplying the n-grams (`minimum` <= n
        <= `maximum`) for this text.

        Each iteration of the generator supplies a tuple consisting
        of the size of the n-grams and a `collections.Counter` of the
        n-grams.

        :param minimum: minimum n-gram size
        :type minimum: `int`
        :param maximum: maximum n-gram size
        :type maximum: `int`
        :param filter_ngrams: n-grams, at least one of which must be
                              contained in each generated n-gram
        :type filter_ngrams: `list` of `str`
        :rtype: `generator`

        """
        tokens = self.get_tokens()
        filter_pattern = self.get_filter_ngrams_pattern(filter_ngrams)
        for size in range(minimum, maximum + 1):
            ngrams = collections.Counter(
                self._ngrams(tokens, size, filter_pattern))
            yield (size, ngrams)
    def _ngrams(self, sequence, degree, filter_ngrams):
        # Note that `filter_ngrams` here is the compiled pattern
        # produced by get_filter_ngrams_pattern, not the list of
        # n-gram strings.
        return [ngram for ngram in super()._ngrams(sequence, degree)
                if filter_ngrams.search(ngram)]
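
A sketch of the filtering behaviour, under the same assumptions as the earlier examples: only generated n-grams containing the filter n-gram 'a b' are counted.

filtered = FilteredWitnessText('T0001', 'A', 'a b c a b d', SimpleTokenizer())
for size, ngrams in filtered.get_ngrams(2, 2, ['a b']):
    print(size, ngrams)
# 2 Counter({'a b': 2})
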