"""
Tokenizer
---------
Classes with a .text_to_token_list method (and a bit more). Used by other
modules as a means to convert stings to lists of strings.
If you have a function that converts strings to lists of strings, you can
make a tokenizer from it by using MakeTokenizer(my_tokenizing_func).
SparseFormatter
---------------
Classes for converting text to sparse representations (e.g. VW or SVMLight).
SFileFilter
-----------
Classes for filtering words/rows from a sparse formatted file.
"""
from collections import Counter, defaultdict
import hashlib
import random
import re
import math
import nltk
import numpy as np
import pandas as pd
from ..common import smart_open, DocIDError
from ..common_abc import SaveLoad
from . import nlp
from . import streaming_filters


class BaseTokenizer(SaveLoad):
"""
Base class, don't use directly.
"""

    def text_to_counter(self, text):
"""
Return a counter associated to tokens in text.
Filter/transform words according to the scheme this Tokenizer uses.
Parameters
----------
text : String
Returns
-------
tokens : Counter
keys = the tokens
values = counts of the tokens in text
"""
return Counter(self.text_to_token_list(text))


class MakeTokenizer(BaseTokenizer):
"""
Makes a subclass of BaseTokenizer out of a function.
"""

    def __init__(self, tokenizer_func):
"""
Parameters
----------
tokenizer_func : Function
Takes in strings, spits out lists of strings.
"""
self.text_to_token_list = tokenizer_func
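
# A minimal usage sketch (illustrative, not part of the library): wrap any
# string -> list-of-strings function to get the BaseTokenizer interface,
# including the inherited text_to_counter method.
#
#     tokenizer = MakeTokenizer(lambda text: text.lower().split())
#     tokenizer.text_to_token_list('Hello hello world')
#     # ['hello', 'hello', 'world']
#     tokenizer.text_to_counter('Hello hello world')
#     # Counter({'hello': 2, 'world': 1})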


class TokenizerBasic(BaseTokenizer):
"""
A simple tokenizer. Extracts word counts from text.
Keeps only non-stopwords, converts to lowercase,
keeps words of length >=2.
"""

    def text_to_token_list(self, text):
"""
Return a list of tokens.
Filter/transform words according to the scheme this Tokenizer uses.
Parameters
----------
text : String
Returns
-------
tokens : List
Tokenized text, e.g. ['hello', 'my', 'name', 'is', 'ian']
"""
tokens = nlp.word_tokenize(text, L=2, numeric=False)
return [word.lower() for word in tokens if not nlp.is_stopword(word)]
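
# A usage sketch (illustrative; the exact output depends on nlp.word_tokenize
# and the stopword list used by nlp.is_stopword):
#
#     tokenizer = TokenizerBasic()
#     tokenizer.text_to_token_list('Hello, my name is Ian!')
#     # -> something like ['hello', 'name', 'ian']; stopwords and words
#     #    shorter than 2 characters are dropped.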


class TokenizerPOSFilter(BaseTokenizer):
"""
Tokenizes, does POS tagging, then keeps words that match particular POS.
"""

    def __init__(
self, pos_types=[], sent_tokenizer=nltk.sent_tokenize,
word_tokenizer=TokenizerBasic(), word_tokenizer_func=None,
pos_tagger=nltk.pos_tag):
"""
Parameters
----------
pos_types : List of Strings
Parts of Speech to keep
sent_tokenizer : Sentence tokenizer function.
Default: nltk.sent_tokenize
Splits text into a list of sentences (each sentence is a string)
word_tokenizer : Subclass of BaseTokenizer.
Default: TokenizerBasic
For tokenizing the words.
word_tokenizer_func : Function
Converts strings to list of strings. If given, use this in place
of word_tokenizer.
pos_tagger : POS tagging function
Default: nltk.pos_tag
Given a list of words, returns a list of tuples (word, POS)
"""
self.pos_types = set(pos_types)
self.sent_tokenizer = sent_tokenizer
self.pos_tagger = pos_tagger
        if word_tokenizer_func is not None:
            # Per the docstring, an explicit word_tokenizer_func takes
            # precedence over word_tokenizer.
            self.word_tokenizer = MakeTokenizer(word_tokenizer_func)
        else:
            self.word_tokenizer = word_tokenizer

    def text_to_token_list(self, text):
"""
        Tokenize text that (possibly) includes multiple sentences.
"""
        # sentences = ['I am Ian.', 'Who are you?']
sentences = self.sent_tokenizer(text)
        # tokenized_sentences = one list of tokens per sentence
        # (the exact tokens depend on self.word_tokenizer)
func = self.word_tokenizer.text_to_token_list
tokenized_sentences = [func(sent) for sent in sentences]
# tagged_sentences = [[('I', 'PRP'), ('am', 'VBP'), ...]]
tagged_sentences = [
self.pos_tagger(sent) for sent in tokenized_sentences]
# Returning a list of words that meet the filter criteria
token_list = sum(
[self._sent_filter(sent) for sent in tagged_sentences], [])
return token_list

    def _sent_filter(self, tokenized_sent):
return [
word for (word, pos) in tokenized_sent if pos in self.pos_types]
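
# A usage sketch (illustrative; needs the nltk tokenizer/tagger models, e.g.
# via nltk.download('punkt') and nltk.download('averaged_perceptron_tagger');
# pos_types uses Penn Treebank tags):
#
#     noun_tokenizer = TokenizerPOSFilter(pos_types=['NN', 'NNS'])
#     noun_tokenizer.text_to_token_list('The cat sat. Dogs barked loudly.')
#     # -> something like ['cat', 'dogs'] (only common nouns survive)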


class SFileFilter(SaveLoad):
"""
    Filters results stored in sfiles (sparsely formatted bag-of-words files).
"""

    def __init__(self, formatter, bit_precision=18, sfile=None, verbose=True):
"""
Parameters
----------
formatter : Subclass of SparseFormatter
        bit_precision : Integer
            Hashes are taken modulo 2**bit_precision. Currently must be
            <= 224 (see _get_hash_fun).
sfile : filepath or buffer
Load this sfile during init
verbose : Boolean
"""
assert isinstance(bit_precision, int)
self.formatter = formatter
self.bit_precision = bit_precision
self.verbose = verbose
self.precision = 2**bit_precision
self.sfile_loaded = False
self.bit_precision_required = bit_precision
if sfile is not None:
self.load_sfile(sfile)

    def _get_hash_fun(self):
"""
The fastest is the built in function hash. Quick experimentation
shows that this function maps similar words to similar values (not
cryptographic) and therefore increases collisions...no big deal.
hashlib.sha224 is up to 224 bit.
"""
if self.bit_precision <= 64:
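            # NOTE: in Python 3, the builtin hash of str is randomized per
            # process (PYTHONHASHSEED), so these ids are not reproducible
            # across runs; the sha224 branch below is stable.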
hash_fun = lambda w: hash(w) % self.precision
elif self.bit_precision <= 224:
            hash_fun = lambda w: (
                int(hashlib.sha224(w.encode('utf-8')).hexdigest(), 16)
                % self.precision)
else:
raise ValueError("Precision above 224 bit not supported")
return hash_fun

    def load_sfile(self, sfile):
"""
Load an sfile, building self.token2id
Parameters
----------
sfile : String or open file
The sparse formatted file we will load.
Returns
-------
self
"""
# TODO Allow loading of more than one sfile
assert not self.sfile_loaded
# Build token2id
token2id, token_score, doc_freq, num_docs, idf = (
self._load_sfile_fwd(sfile))
self.token2id = token2id
self.token_score = token_score
self.doc_freq = doc_freq
self.num_docs = num_docs
self.idf = idf
self.sfile_loaded = True
        self.collisions_resolved = False

        return self

    def _load_sfile_fwd(self, sfile):
"""
Builds the "forward" objects involved in loading an sfile.
"""
token2id = {}
token_score = defaultdict(float)
doc_freq = defaultdict(int)
num_docs = 0
idf = defaultdict(float)
hash_fun = self._get_hash_fun()
with smart_open(sfile) as open_file:
# Each line represents one document
for line in open_file:
num_docs += 1
record_dict = self.formatter.sstr_to_dict(line)
                for token, value in record_dict['feature_values'].items():
hash_value = hash_fun(token)
token2id[token] = hash_value
token_score[token] += value
doc_freq[token] += 1
idf[token] += 1
        for token in idf:
            idf[token] = math.log(num_docs / idf[token])
return token2id, token_score, doc_freq, num_docs, idf
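
    # Note on the assumed record format: sstr_to_dict is expected to return
    # a dict shaped roughly like
    #     {'doc_id': 'doc1', 'feature_values': {'token_a': 2.0, ...}}
    # (inferred from the attribute access in this class; see the
    # SparseFormatter subclasses for the exact fields).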

    def set_id2token(self, seed=None):
"""
Sets self.id2token, resolving collisions as needed (which alters
self.token2id)
"""
self._resolve_collisions(seed=seed)
        self.id2token = {v: k for k, v in self.token2id.items()}

    def _resolve_collisions(self, seed=None):
"""
        Alters self.token2id by finding new id values using a
        "random probe" method.
Meant to be called by self.set_id2token. If you call this by itself,
then self.token2id is altered, but self.id2token is not!!!!
"""
id_counts = Counter(self.token2id.values())
vocab_size = self.vocab_size
# Make sure we don't have too many collisions
num_collisions = vocab_size - len(id_counts)
self._print(
"collisions = %d, vocab_size = %d" % (num_collisions, vocab_size))
if num_collisions > vocab_size / 2.:
msg = (
"Too many collisions to be efficient: "
"num_collisions = %d. vocab_size = %d. Try using the "
"function collision_probability to estimate needed precision"
% (num_collisions, vocab_size))
raise CollisionError(msg)
# Seed for testing
random.seed(seed)
# Resolve the collisions in this loop
collisions = (
tok for tok in self.token2id if id_counts[self.token2id[tok]] > 1)
for token in collisions:
old_id = self.token2id[token]
new_id = old_id
# If id_counts[old_id] > 1, then the collision still must be
# resolved. In that case, change new_id and update id_counts
if id_counts[old_id] > 1:
# id_counts is the only dict (at this time) holding every
# id you have ever seen
                while new_id in id_counts:
                    new_id = random.randint(0, self.precision - 1)
id_counts[old_id] -= 1
id_counts[new_id] = 1
# Update dictionaries
self.token2id[token] = new_id
self._print("All collisions resolved")
self.collisions_resolved = True

    def compactify(self):
"""
Removes "gaps" in the id values in self.token2id. Every single id
value will (probably) be altered.
"""
# You can't compactify if self.bit_precision is too low
min_precision = int(np.ceil(np.log2(self.vocab_size)))
if self.bit_precision < min_precision:
raise CollisionError(
"Cannot compactify unless you increase self.bit_precision "
"to >= %d or remove some tokens" % min_precision)
new_token2id = {}
for i, tok in enumerate(self.token2id):
new_token2id[tok] = i
self.token2id = new_token2id
if hasattr(self, 'id2token'):
self.set_id2token()
self.set_bit_precision_required()
self._print(
"Compactification done. self.bit_precision_required = %d"
% self.bit_precision_required)

    def set_bit_precision_required(self):
"""
Sets self.bit_precision_required to the minimum bit precision b such
that all token id values are less than 2^b.
The idea is that only compactification can change this, so we only
(automatically) call this after compactification.
"""
        max_id = max(self.token2id.values())
        # We need 2**b > max_id, i.e. b = ceil(log2(max_id + 1))
        self.bit_precision_required = int(np.ceil(np.log2(max_id + 1)))

    def filter_sfile(
            self, infile, outfile, doc_id_list=None, enforce_all_doc_id=True,
            min_tf_idf=0, filters=None):
"""
Alter an sfile by converting tokens to id values, and removing tokens
not in self.token2id. Optionally filters on doc_id, tf_idf and
        user-defined filters.

Parameters
----------
infile : file path or buffer
outfile : file path or buffer
doc_id_list : Iterable over strings
Keep only rows with doc_id in this list
enforce_all_doc_id : Boolean
If True (and doc_id is not None), raise exception unless all doc_id
in doc_id_list are seen.
min_tf_idf : int or float
Keep only tokens whose term frequency-inverse document frequency
is greater than this threshold. Given a token t and a document d
in a corpus of documents D, tf_idf is given by the following
formula:
tf_idf(t, d, D) = tf(t, d) x idf(t, D),
where
(1) tf(t, d) is the number of times the term t shows up in the
document d,
(2) idf(t, D) = log (N / M), where N is the total number of
documents in D and M is the number of documents in D which
contain the token t. The logarithm is base e.
filters : iterable over functions
Each function must take a record_dict as a parameter and return a
boolean. The record_dict may (and usually should) be altered in
place. If the return value is False, the record_dict (corresponding
to a document) is filtered out of the sfile. Both the doc_id_list
and min_tf_idf parameters are implemented in this style internally.
If the doc_id_list or min_tf_idf flags are set, those filters will
            run before those found in filters. See
rosetta/text/streaming_filters.py
in the rosetta repository for the implementation details of the
record_dict and built-in filters as well as explanations of how to
define more filters.
"""
assert self.sfile_loaded, "Must load an sfile before you can filter"
        if not hasattr(self, 'id2token'):
            self._print(
                "WARNING: Filtering an sfile before setting self.id2token. "
                "The resultant outfile will have collisions and you will not "
                "be able to convert ids back to tokens.\nIt is recommended to "
                "call: self.compactify() then either self.set_id2token() or "
                "self.save() before filtering")
if filters is None:
filters = []
        # The doc_id_filter should be run before everything else to avoid
        # unnecessary computations. The min_tf_idf filter is run next. If for
        # some reason this is not the desired ordering, the user must leave
        # the doc_id_list and min_tf_idf flags unset and pass user-defined
        # filters to the filters flag explicitly.
prefilters = []
if doc_id_list is not None:
doc_id_set = set(doc_id_list)
prefilters.append(streaming_filters.get_doc_id_filter(doc_id_set))
else:
doc_id_set = set()
if min_tf_idf != 0:
prefilters.append(
streaming_filters.get_tf_idf_filter(self, min_tf_idf))
# The token_to_id_filter should be run last so that only the necessary
# conversions are made.
postfilters = [streaming_filters.get_token_to_id_filter(self)]
filters = prefilters + filters + postfilters
doc_id_seen = set()
with smart_open(infile) as f, smart_open(outfile, 'w') as g:
# Each line represents one document
for line in f:
record_dict = self.formatter.sstr_to_dict(line)
doc_id = record_dict['doc_id']
doc_id_seen.add(doc_id)
if all(func(record_dict) for func in filters):
new_sstr = self.formatter.get_sstr(**record_dict)
g.write(new_sstr + '\n')
if enforce_all_doc_id:
# Make sure we saw all the doc_id we're supposed to
assert doc_id_set.issubset(doc_id_seen), (
"Did not see every doc_id in the passed doc_id_list")

    def filter_extremes(
            self, doc_freq_min=0, doc_freq_max=np.inf, doc_fraction_min=0,
            doc_fraction_max=1, token_score_min=0, token_score_max=np.inf,
            token_score_quantile_min=0, token_score_quantile_max=1):
"""
Remove extreme tokens from self (calling self.filter_tokens).
Parameters
----------
doc_freq_min : Integer
Remove tokens that in less than this number of documents
doc_freq_max : Integer
doc_fraction_min : Float in [0, 1]
Remove tokens that are in less than this fraction of documents
doc_fraction_max : Float in [0, 1]
token_score_quantile_min : Float in [0, 1]
Minimum quantile that the token score (usually total token count)
can be in.
token_score_quantile_max : Float in [0, 1]
Maximum quantile that the token score can be in
Returns
-------
self
"""
frame = self.to_frame()
to_remove_mask = (
(frame.doc_freq < doc_freq_min)
| (frame.doc_freq > doc_freq_max)
| (frame.doc_freq < (doc_fraction_min * self.num_docs))
| (frame.doc_freq > (doc_fraction_max * self.num_docs))
| (frame.token_score < token_score_min)
| (frame.token_score > token_score_max)
| (frame.token_score
< frame.token_score.quantile(token_score_quantile_min))
| (frame.token_score
> frame.token_score.quantile(token_score_quantile_max))
)
self._print(
"Removed %d/%d tokens" % (to_remove_mask.sum(), len(frame)))
        self.filter_tokens(frame[to_remove_mask].index)

        return self

    def filter_tokens(self, tokens):
"""
Remove tokens from appropriate attributes.
Parameters
----------
tokens : String or iterable over strings
E.g. a single token or list of tokens
Returns
-------
self
"""
if isinstance(tokens, str):
tokens = [tokens]
for tok in tokens:
id_value = self.token2id[tok]
self.token2id.pop(tok)
self.token_score.pop(tok)
self.doc_freq.pop(tok)
            if hasattr(self, 'id2token'):
                self.id2token.pop(id_value)

        return self

    def _print(self, msg):
if self.verbose:
print(msg)

    def to_frame(self):
"""
Return a dataframe representation of self.
"""
token2id = self.token2id
token_score = self.token_score
doc_freq = self.doc_freq
frame = pd.DataFrame(
{'token_score': [token_score[tok] for tok in token2id],
'doc_freq': [doc_freq[tok] for tok in token2id]},
index=[tok for tok in token2id])
frame['doc_fraction'] = frame.doc_freq / float(self.num_docs)
frame.index.name = 'token'
return frame
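
    # For reference, to_frame output is one row per token, e.g.
    # (illustrative values; column order may vary by pandas version):
    #
    #            doc_freq  token_score  doc_fraction
    #     token
    #     hello        12         41.0          0.12
    #     world         3          7.0          0.03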

    @property
def vocab_size(self):
return len(self.token2id)

    def save(self, savepath, protocol=-1, set_id2token=True):
"""
Pickle self to outfile.
Parameters
----------
savefile : filepath or buffer
protocol : 0, 1, 2, -1
0 < 1 < 2 in terms of performance. -1 means use highest available.
set_id2token : Boolean
If True, set self.id2token before saving.
Used to associate tokens with the output of a VW file.
"""
if set_id2token:
self.set_id2token()
SaveLoad.save(self, savepath, protocol=protocol)


def collision_probability(vocab_size, bit_precision):
"""
Approximate probability of at least one collision
(assuming perfect hashing). See the Wikipedia article on
"The birthday problem" for details.
Parameters
----------
vocab_size : Integer
Number of unique words in vocabulary
bit_precision : Integer
Number of bits in space we are hashing to
"""
    # Birthday-problem approximation: P(collision) ~ 1 - exp(-n(n-1) / (2m)),
    # with n = vocab_size words hashed into m = 2**bit_precision bins.
    exponent = - vocab_size * (vocab_size - 1) / 2. ** (bit_precision + 1)
    return 1 - np.exp(exponent)
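
# A worked check (illustrative): with the default bit_precision=18 and a
# 1000-token vocabulary,
#
#     collision_probability(1000, 18)
#     # exponent = -1000 * 999 / 2**19 ~ -1.906, so p ~ 1 - exp(-1.906) ~ 0.85
#
# i.e. at least one collision is very likely, which is why set_id2token
# resolves collisions explicitly and compactify exists.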


class CollisionError(Exception):
pass