4 """The Tokenizer class takes an input stream and parses it into tokens.
5
6 The parsing process is controlled by the character classification sets:
7
8 - blankspace characters: characters that mark a token boundary and are not
9 part of the token.
10
11 - separator characters: characters that mark a token boundary and might be
12 considered tokens, depending on the value of a flag
13 (to be implemented).
14
15 - valid characters: any non blankspace and non separator character.
16
17 Each byte read from the input stream is regarded as a character in the range
18 '\\u0000' through '\\u00FF'.
19
20 In addition, an instance has flags that control:
21
22 - whether the characters of tokens are converted to lowercase.
23 - whether separator characters constitute tokens. (TBD)
24
25 A typical application first constructs an instance of this class, supplying
26 the input stream to be tokenized, the set of blankspaces, and the set of
27 eparators, and then repeatedly loops, while method has_more_tokens() returns
28 true, calling the next_token() method.
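
For example (the blankspace and separator sets shown here are illustrative)::

    tokenizer = ForwardTokenizer("Hello, world", " ", ",.!?")
    while tokenizer.has_more_tokens():
        print(tokenizer.next_token())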
29 """

from __future__ import absolute_import, unicode_literals
import abc
import codecs
import collections
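
# Default character-classification sets used by the concrete tokenizers
# below.  These particular values are illustrative defaults (roughly
# whitespace and ASCII punctuation); adjust them to the application's needs.
BLANKSPACES = " \f\n\r\t\v"
SEPARATORS = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"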
39 """Abstract class for all tokenizers.
40
41 G{classtree Tokenizer}"""
42
43 __metaclass__ = abc.ABCMeta
44
47 """Constructor of the Tokenizer abstract class.
48
49 @param stream:
50 The stream to tokenize. Can be a filename or any open IO stream.
51 @type stream: str or io.IOBase
52 @param blankspaces:
53 The characters that represent empty spaces.
54 @type blankspaces: str
55 @param separators:
56 The characters that separate token units (e.g. word boundaries).
57 @type separators: str
58 """
59 self.separators = separators
60 self.blankspaces = blankspaces
61 self.lowercase = False
62 self.offbeg = 0
63 self.offset = None
64 self.offend = None

    def is_blankspace(self, char):
        """Test if a character is a blankspace.

        @param char:
            The character to test.
        @type char: str

        @return:
            True if character is a blankspace, False otherwise.
        @rtype: bool
        """
        if len(char) > 1:
            raise TypeError("Expected a char.")
        return char in self.blankspaces

    def is_separator(self, char):
        """Test if a character is a separator.

        @param char:
            The character to test.
        @type char: str

        @return:
            True if character is a separator, False otherwise.
        @rtype: bool
        """
        if len(char) > 1:
            raise TypeError("Expected a char.")
        return char in self.separators

    @abc.abstractmethod
    def count_tokens(self):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def count_chars(self):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def has_more_tokens(self):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def next_token(self):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def progress(self):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def reset_stream(self):
        raise NotImplementedError("Method must be implemented")


class ForwardTokenizer(Tokenizer):
    """Tokenize a stream from the beginning to the end.
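
    Example (the blankspace and separator sets used here are illustrative)::

        tokenizer = ForwardTokenizer("Hello, world", " ", ",.!?")
        tokenizer.next_token()   # -> "Hello"
        tokenizer.next_token()   # -> "world"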

    G{classtree ForwardTokenizer}
    """

    def __init__(self, stream, blankspaces=BLANKSPACES,
                 separators=SEPARATORS, lowercase=False):
        """Constructor of the ForwardTokenizer class.

        @warning: When passing an IOBase type variable as the stream
            parameter, the read() method is used to read the stream and it
            can be time consuming. Please don't pass IOBase objects during
            the prediction process!

        @param stream:
            The stream to tokenize. Can be the text itself (str) or any open
            IO stream.
        @type stream: str or io.IOBase
        @param blankspaces:
            The characters that represent empty spaces.
        @type blankspaces: str
        @param separators:
            The characters that separate token units (e.g. word boundaries).
        @type separators: str
        @param lowercase:
            If True, tokens are converted to lowercase.
        @type lowercase: bool
        """
        Tokenizer.__init__(self, stream, blankspaces, separators)
        if isinstance(stream, str):
            self.text = stream
        else:
            if not hasattr(stream, 'read'):
                stream = codecs.open(stream, "r", "utf-8")
            self.text = stream.read()
            stream.close()
        self.lowercase = lowercase
        self.offend = self.count_chars()
        self.reset_stream()
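
    # This counterpart of ReverseTokenizer.count_tokens is a sketch: it
    # walks the whole stream and restores the current offset afterwards.
    def count_tokens(self):
        """Count the number of tokens in the stream.

        @return:
            The number of tokens in the stream.
        @rtype: int
        """
        curroff = self.offset
        self.reset_stream()
        count = 0
        while self.has_more_tokens():
            self.next_token()
            count += 1
        self.offset = curroff
        return count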

    def count_chars(self):
        """Count the number of characters in the stream.

        @note: Should return the same value as the wc Unix command.

        @return:
            The number of characters in the stream.
        @rtype: int
        """
        return len(self.text)

    def has_more_tokens(self):
        """Test if at least one token remains.

        @return:
            True if there is at least one token left in the stream, False
            otherwise.
        @rtype: bool
        """
        return self.offset < self.offend

    def next_token(self):
        """Retrieve the next token in the stream.

        @return:
            The next token, or '' if there is no next token.
        @rtype: str
        """
        if not self.has_more_tokens():
            return ''
        current = self.text[self.offset]
        token = ''
        # Skip any leading blankspace and separator characters.
        while self.is_blankspace(current) or self.is_separator(current):
            self.offset += 1
            try:
                current = self.text[self.offset]
            except IndexError:
                break
        # Accumulate valid characters until the next boundary.
        while (not self.is_blankspace(current) and
               not self.is_separator(current) and
               self.offset < self.offend):
            if self.lowercase:
                current = current.lower()
            token += current
            self.offset += 1
            try:
                current = self.text[self.offset]
            except IndexError:
                break
        return token

    def progress(self):
        """Return the progress percentage.

        @return:
            The tokenization progress percentage.
        @rtype: float
        """
        return float(self.offset) / self.offend * 100

    def reset_stream(self):
        """Reset the offset to 0."""
        self.offset = 0


class ReverseTokenizer(Tokenizer):
    """Tokenize a stream from the end to the beginning.
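
    Example (the blankspace and separator sets used here are illustrative;
    note that tokens are returned from the end of the stream first)::

        tokenizer = ReverseTokenizer("Hello, world", " ", ",.!?")
        tokenizer.next_token()   # -> "world"
        tokenizer.next_token()   # -> "Hello"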

    G{classtree ReverseTokenizer}
    """

    def __init__(self, stream, blankspaces=BLANKSPACES,
                 separators=SEPARATORS, lowercase=False):
        """Constructor of the ReverseTokenizer class.

        @param stream:
            The stream to tokenize. Can be the text itself (str) or any open
            IO stream.
        @type stream: str or io.IOBase
        @param blankspaces:
            The characters that represent empty spaces.
        @type blankspaces: str
        @param separators:
            The characters that separate token units (e.g. word boundaries).
        @type separators: str
        @param lowercase:
            If True, tokens are converted to lowercase.
        @type lowercase: bool
        """
        Tokenizer.__init__(self, stream, blankspaces, separators)
        if isinstance(stream, str):
            self.text = stream
        else:
            if not hasattr(stream, 'read'):
                stream = codecs.open(stream, "r", "utf-8")
            self.text = stream.read()
            stream.close()
        self.lowercase = lowercase
        self.offend = self.count_chars() - 1
        self.reset_stream()

    def count_tokens(self):
        """Count the number of tokens left.

        @return:
            The number of tokens left.
        @rtype: int
        """
        curroff = self.offset
        self.offset = self.offend
        count = 0
        while self.has_more_tokens():
            self.next_token()
            count += 1
        self.offset = curroff
        return count

    def count_chars(self):
        """Count the number of characters in the stream.

        @note: Should return the same value as the wc Unix command.

        @return:
            The number of characters in the stream.
        @rtype: int
        """
        return len(self.text)

    def has_more_tokens(self):
        """Test if at least one token remains.

        @return:
            True if there is at least one token left in the stream, False
            otherwise. (Keep in mind that the stream is tokenized from the
            end to the beginning.)
        @rtype: bool
        """
        return self.offbeg <= self.offset

    def next_token(self):
        """Retrieve the next token in the stream.

        @note: As this is a reverse tokenizer, the "next" token is what one
            would usually call the "previous" token, but within the
            tokenizer workflow it is more consistent to call it the "next"
            token.

        @return:
            The next token, or '' if there is no next token.
        @rtype: str
        """
        if not self.has_more_tokens():
            return ''
        token = ""
        while self.offbeg <= self.offset and len(token) == 0:
            current = self.text[self.offset]
            if self.offset == self.offend and (
                    self.is_separator(current) or
                    self.is_blankspace(current)):
                self.offset -= 1
                return token
            # Skip blankspace and separator characters backwards.
            while (self.is_blankspace(current) or
                   self.is_separator(current)) and self.offbeg < self.offset:
                self.offset -= 1
                if self.offbeg <= self.offset:
                    current = self.text[self.offset]
            # Accumulate valid characters, prepending them to the token.
            while (not self.is_blankspace(current) and
                   not self.is_separator(current) and
                   self.offbeg <= self.offset):
                if self.lowercase:
                    current = current.lower()
                token = current + token
                self.offset -= 1
                if self.offbeg <= self.offset:
                    current = self.text[self.offset]
        return token

    def progress(self):
        """Return the progress percentage.

        @return:
            The tokenization progress percentage.
        @rtype: float
        """
        return (float(self.offend - self.offset) /
                (self.offend - self.offbeg) * 100)

    def reset_stream(self):
        """Reset the offset to the end offset."""
        self.offset = self.offend


class TextTokenizer(Tokenizer):
    """Tokenizer to tokenize a text file.

    This tokenizer receives a text file and generates n-grams of a given
    size "n".  It is useful to the L{text miner<minr.TextMiner>} in order to
    generate n-grams to be inserted in a database.
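
    A short usage sketch (the file name here is only illustrative)::

        tokenizer = TextTokenizer("corpus.txt", 3, lowercase=True, cutoff=1)
        ngram_map = tokenizer.tknize_text()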

    G{classtree TextTokenizer}
    """

    def __init__(self, infile, n, lowercase=False, cutoff=0, callback=None):
        """TextTokenizer creator.

        @param infile:
            Path to the file to tokenize.
        @type infile: str
        @param n:
            The n in n-gram. Specify the maximum n-gram size to be created.
        @type n: int
        @param lowercase:
            If True: all tokens are converted to lowercase before being
            added to the dictionary.
            If False: the tokens' case remains untouched.
        @type lowercase: bool
        @param cutoff:
            The minimum number of token occurrences. If a token doesn't
            appear more than this number of times, it is removed from the
            dictionary before it is returned.
        @type cutoff: int
        @param callback:
            Optional callable invoked with the tokenization progress
            percentage while the file is being tokenized.
        @type callback: callable
        """
        self.infile = infile
        self.n = n
        self.lowercase = lowercase
        self.cutoff = cutoff
        self.callback = callback

    def tknize_text(self):
        """Tokenize the file and return a dictionary mapping its n-grams to
        their number of occurrences.

        The dictionary looks like::

            { ('in', 'the', 'second'): 4,
              ('right', 'hand', 'of'): 1,
              ('subject', 'to', 'the'): 2,
              ('serious', 'rebuff', 'in'): 1,
              ('spirit', 'is', 'the'): 1 }

        @return:
            The n-gram dictionary.
        @rtype: dict
        """
        ngramMap = collections.defaultdict(int)
        ngramList = []
        tokenizer = ForwardTokenizer(codecs.open(self.infile, "r", "utf-8"),
                                     lowercase=self.lowercase)
        # Pre-fill the sliding window with the first n - 1 tokens.
        for i in range(self.n - 1):
            if not tokenizer.has_more_tokens():
                break
            ngramList.append(tokenizer.next_token())
        while tokenizer.has_more_tokens():
            if self.callback:
                self.callback(tokenizer.progress())
            token = tokenizer.next_token()
            ngramList.append(token)
            ngramMap[tuple(ngramList)] += 1
            ngramList.pop(0)
        # Drop the n-grams that do not appear often enough.
        if self.cutoff > 0:
            for k in list(ngramMap.keys()):
                if ngramMap[k] <= self.cutoff:
                    del ngramMap[k]
        return ngramMap
437