Source code for tacl.text
"""Module containing the Text and WitnessText classes."""

import collections
import hashlib
import os.path
import re


class Text:
"""Class for base text functionality (getting tokens, generating
n-grams).
Used for (snippets of) texts that are not witnesses.
"""

    def __init__(self, content, tokenizer):
self._content = content
self._tokenizer = tokenizer

    def get_content(self):
"""Returns the content of this text.
:rtype: `str`
"""
return self._content

    def get_ngrams(self, minimum, maximum, skip_sizes=None):
"""Returns a generator supplying the n-grams (`minimum` <= n
<= `maximum`) for this text.
Each iteration of the generator supplies a tuple consisting of
the size of the n-grams and a `collections.Counter` of the
n-grams.
:param minimum: minimum n-gram size
:type minimum: `int`
        :param maximum: maximum n-gram size
        :type maximum: `int`
        :param skip_sizes: n-gram sizes to skip generating
        :type skip_sizes: `list` of `int`
        :rtype: `generator`
"""
skip_sizes = skip_sizes or []
tokens = self.get_tokens()
for size in range(minimum, maximum + 1):
if size not in skip_sizes:
ngrams = collections.Counter(self._ngrams(tokens, size))
yield (size, ngrams)

    def get_token_content(self):
"""Returns a string of the tokens in this text joined using the
tokenizer joiner string.
:rtype: `str`
"""
return self._tokenizer.joiner.join(self.get_tokens())

    def get_tokens(self):
"""Returns a list of tokens in this text.
:rtype: `list` of `str`
"""
return self._tokenizer.tokenize(self._content)

    def _ngrams(self, sequence, degree):
"""Returns the n-grams generated from `sequence`.
Based on the ngrams function from the Natural Language
Toolkit.
Each n-gram in the returned list is a string with whitespace
removed.
:param sequence: the source data to be converted into n-grams
:type sequence: sequence
:param degree: the degree of the n-grams
:type degree: `int`
:rtype: `list` of `str`
"""
count = max(0, len(sequence) - degree + 1)
# The extra split and join are due to having to handle
# whitespace within a CBETA token (eg, [(禾*尤)\n/上/日]).
return [self._tokenizer.joiner.join(
self._tokenizer.joiner.join(sequence[i:i+degree]).split())
for i in range(count)]
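

# Illustrative sketch (not part of tacl): shows how ``Text.get_ngrams`` can be
# driven by any tokenizer-like object. ``_ExampleTokenizer`` and
# ``_example_text_ngrams`` are hypothetical names invented for this example;
# ``Text`` only requires the tokenizer to expose a ``tokenize`` method and a
# ``joiner`` string, as used above.
class _ExampleTokenizer:

    joiner = ' '

    def tokenize(self, content):
        return content.split()


def _example_text_ngrams():
    """Prints the 2-gram and 3-gram counts of a short sample text."""
    text = Text('the cat sat on the mat', _ExampleTokenizer())
    for size, ngrams in text.get_ngrams(2, 3):
        print(size, ngrams.most_common(2))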


class WitnessText(Text):
"""Class for the text of a witness. A witness has a work name and a
siglum, and has a corresponding filename."""

    def __init__(self, name, siglum, content, tokenizer):
super().__init__(content, tokenizer)
self._name = name
self._siglum = siglum
self._filename = self.assemble_filename(name, siglum)

    @staticmethod
    def assemble_filename(name, siglum):
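        """Returns the filename for the witness of work `name` with
        siglum `siglum`.
        :rtype: `str`
        """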
return os.path.join(name, siglum + '.txt')

    def get_checksum(self):
"""Returns the checksum for the content of this text.
:rtype: `str`
"""
return hashlib.md5(self._content.encode('utf-8')).hexdigest()

    def get_filename(self):
"""Returns the filename of this text.
:rtype: `str`
"""
return self._filename

    def get_names(self):
"""Returns the name and siglum of this text.
:rtype: `tuple`
"""
return self._name, self._siglum
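

# Illustrative sketch (not part of tacl): constructing a witness and reading
# back its derived filename, names and checksum. ``_example_witness`` and the
# work name, siglum and content used here are invented for the example;
# ``_ExampleTokenizer`` is the hypothetical tokenizer defined above.
def _example_witness():
    """Prints the filename, names and checksum of a sample witness."""
    witness = WitnessText('T0001', 'base', '如是我聞', _ExampleTokenizer())
    print(witness.get_filename())   # T0001/base.txt (joined via os.path.join)
    print(witness.get_names())      # ('T0001', 'base')
    print(witness.get_checksum())   # MD5 hex digest of the content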


class FilteredWitnessText(WitnessText):
"""Class for the text of a witness that supplies only those n-grams
that contain a supplied list of n-grams."""

    @staticmethod
    def get_filter_ngrams_pattern(filter_ngrams):
"""Returns a compiled regular expression matching on any of the
n-grams in `filter_ngrams`.
:param filter_ngrams: n-grams to use in regular expression
:type filter_ngrams: `list` of `str`
:rtype: `_sre.SRE_Pattern`
"""
return re.compile('|'.join([re.escape(ngram) for ngram in
filter_ngrams]))

    def get_ngrams(self, minimum, maximum, filter_ngrams):
"""Returns a generator supplying the n-grams (`minimum` <= n
<= `maximum`) for this text.
Each iteration of the generator supplies a tuple consisting of
the size of the n-grams and a `collections.Counter` of the
n-grams.
:param minimum: minimum n-gram size
:type minimum: `int`
:param maximum: maximum n-gram size
:type maximum: `int`
        :param filter_ngrams: n-grams, at least one of which must be contained
            in each generated n-gram
:type filter_ngrams: `list`
:rtype: `generator`
"""
tokens = self.get_tokens()
filter_pattern = self.get_filter_ngrams_pattern(filter_ngrams)
for size in range(minimum, maximum + 1):
ngrams = collections.Counter(
self._ngrams(tokens, size, filter_pattern))
yield (size, ngrams)

    def _ngrams(self, sequence, degree, filter_ngrams):
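        """Returns the n-grams of size `degree` generated from `sequence`
        that match the compiled pattern `filter_ngrams`."""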
return [ngram for ngram in super()._ngrams(sequence, degree)
if filter_ngrams.search(ngram)]
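

# Illustrative sketch (not part of tacl): only n-grams containing one of the
# filter n-grams survive. ``_example_filtered_witness`` and the sample values
# are invented for the example; ``_ExampleTokenizer`` is the hypothetical
# tokenizer defined above.
def _example_filtered_witness():
    """Prints the 2-gram counts that contain either 'cat' or 'mat'."""
    witness = FilteredWitnessText('example', 'base', 'the cat sat on the mat',
                                  _ExampleTokenizer())
    for size, ngrams in witness.get_ngrams(2, 2, ['cat', 'mat']):
        print(size, ngrams)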