Source code for eleve.segment

""" :mod:`eleve.segment`
==========================

The segmenter is available by importing ``eleve.Segmenter``.  It is used to
segment sentences (group tokens that go together).  A minimal usage sketch is
given at the end of this module.

"""
import logging
from math import isnan

logger = logging.getLogger(__name__)

class Segmenter:
    def __init__(self, storage, max_ngram_length=None):
        """ Create a segmenter.

        :param storage: A storage object that has been trained on a corpus
            (it should have a ``query_autonomy`` method).
        :param max_ngram_length: The maximum length of n-gram that can be
            "merged". It should be strictly smaller than the storage's
            n-gram length.
        """
        assert hasattr(storage, 'query_autonomy'), \
            "The storage object should have a query_autonomy method."
        self.storage = storage
        if max_ngram_length is None:
            assert hasattr(storage, 'default_ngram_length'), \
                "The storage should have a default_ngram_length attribute."
            self.max_ngram_length = storage.default_ngram_length - 1
        else:
            assert isinstance(max_ngram_length, int) and max_ngram_length > 1, \
                "max_ngram_length should be an integer greater than one"
            if max_ngram_length >= storage.default_ngram_length:
                logger.warning("max_ngram_length is %d, but the storage backend has a "
                               "default n-gram length of only %s."
                               % (max_ngram_length, storage.default_ngram_length))
            self.max_ngram_length = max_ngram_length
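
    # Construction sketch (illustrative, assuming a storage trained on a
    # corpus, e.g. ``eleve.MemoryStorage``):
    #
    #     seg = Segmenter(storage)                      # merges n-grams of up to default_ngram_length - 1 tokens
    #     seg = Segmenter(storage, max_ngram_length=3)  # or cap the merge length explicitly (int > 1)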

    def segment(self, sentence):
        """ Segment a sentence.

        :param sentence: A list of tokens.
        :returns: A list of sentence fragments. A sentence fragment is a list
            of tokens.
        """
        if len(sentence) > 1000:
            logger.warning("The sentence you want to segment is HUGE. This will take a lot of memory.")

        sentence = [self.storage.sentence_start] + sentence + [self.storage.sentence_end]

        # Dynamic programming to segment the sentence.
        best_segmentation = [[]]*(len(sentence) + 1)
        best_score = [0] + [float('-inf')]*len(sentence)
        # best_score[1] -> autonomy of the first word
        # best_score[2] -> sum of the autonomies of the first two words, or
        #                  autonomy of the first two words kept as one fragment
        # ...

        order = self.max_ngram_length
        query_autonomy = self.storage.query_autonomy

        for i in range(1, len(sentence) + 1):
            for j in range(1, order + 1):
                if i - j < 0:
                    break
                a = query_autonomy(sentence[i-j:i])
                if isnan(a):
                    a = -100.
                score = best_score[i-j] + a * j
                if score > best_score[i]:
                    best_score[i] = score
                    best_segmentation[i] = best_segmentation[i-j] + [sentence[i-j:i]]

        # Keep the best segmentation of the whole sentence and strip the
        # sentence start/end markers, dropping any fragment left empty.
        best_segmentation = best_segmentation[len(sentence)]
        best_segmentation[0].pop(0)
        best_segmentation[-1].pop()
        best_segmentation = list(filter(None, best_segmentation))
        return best_segmentation
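

if __name__ == "__main__":
    # Minimal usage sketch.  It assumes the ``eleve.MemoryStorage`` backend and
    # its ``add_sentence(tokens)`` training method (as in the eleve README);
    # any trained storage exposing ``query_autonomy``, ``default_ngram_length``
    # and the sentence start/end markers would work the same way.
    from eleve import MemoryStorage

    storage = MemoryStorage()
    training = [
        ["I", "like", "new", "york"],
        ["new", "york", "is", "big"],
        ["I", "like", "big", "cities"],
    ]
    for tokens in training:
        storage.add_sentence(tokens)

    segmenter = Segmenter(storage)
    # Returns a list of fragments, each fragment being a list of tokens,
    # e.g. [["I"], ["like"], ["new", "york"]] if "new york" is autonomous
    # enough in the training corpus.
    print(segmenter.segment(["I", "like", "new", "york"]))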