Source code for tacl.corpus

"""Module containing the Corpus class."""

import glob
import logging
import os.path

from .text import WitnessText


class Corpus:

    """A Corpus represents a collection of `WitnessText` objects.

    A Corpus is built from a directory that contains the text files
    that become `WitnessText` objects. Each witness is read from the
    file ``<path>/<work>/<siglum>.txt``.

    """

    def __init__(self, path, tokenizer):
        """Initialises the corpus.

        :param path: path to the corpus directory; stored as an
                     absolute path
        :type path: `str`
        :param tokenizer: tokenizer passed to every text object
                          created by this corpus

        """
        self._logger = logging.getLogger(__name__)
        self._path = os.path.abspath(path)
        self._tokenizer = tokenizer

    def get_sigla(self, work):
        """Returns a list of all of the sigla for `work`.

        The order of the returned sigla is whatever `glob.glob`
        yields and is not guaranteed to be sorted.

        :param work: name of work
        :type work: `str`
        :rtype: `list` of `str`

        """
        # Both the corpus path and the work name are literal path
        # components, not patterns, so glob metacharacters in them must
        # be escaped; only the '*.txt' part is a wildcard.
        pattern = os.path.join(glob.escape(self._path), glob.escape(work),
                               '*.txt')
        # Skip anything that is not a regular file (e.g. a directory
        # whose name ends in '.txt'), matching get_witnesses.
        return [os.path.splitext(os.path.basename(path))[0]
                for path in glob.glob(pattern) if os.path.isfile(path)]

    def get_witness(self, work, siglum, text_class=WitnessText):
        """Returns a `WitnessText` representing the file associated with
        `work` and `siglum`.

        Combined, `work` and `siglum` form the basis of a filename for
        retrieving the text. The file is read as UTF-8.

        :param work: name of work
        :type work: `str`
        :param siglum: siglum of witness
        :type siglum: `str`
        :param text_class: class instantiated with the work, siglum,
                           file content and tokenizer
        :type text_class: subclass of `WitnessText`
        :rtype: `WitnessText`

        """
        filename = os.path.join(work, siglum + '.txt')
        self._logger.debug('Creating WitnessText object from {}'.format(
            filename))
        with open(os.path.join(self._path, filename), encoding='utf-8') \
                as fh:
            content = fh.read()
        return text_class(work, siglum, content, self._tokenizer)

    def get_witnesses(self, name='*', text_class=WitnessText):
        """Returns a generator supplying `WitnessText` objects for each
        file in the corpus.

        :param name: glob pattern matched against work directory names;
                     the default matches every work
        :type name: `str`
        :param text_class: class used to create each text object
        :type text_class: subclass of `WitnessText`
        :rtype: `generator` of `WitnessText`

        """
        # `name` is deliberately left unescaped because it is a pattern;
        # the corpus path itself is a literal and must be escaped.
        pattern = os.path.join(glob.escape(self._path), name, '*.txt')
        for filepath in glob.glob(pattern):
            if os.path.isfile(filepath):
                # The work name is the directory containing the file.
                work = os.path.basename(os.path.dirname(filepath))
                siglum = os.path.splitext(os.path.basename(filepath))[0]
                yield self.get_witness(work, siglum, text_class)