4 """The Tokenizer class takes an input stream and parses it into tokens.
5
6 The parsing process is controlled by the character classification sets:
7
8 - blankspace characters: characters that mark a token boundary and are not
9 part of the token.
10
11 - separator characters: characters that mark a token boundary and might be
12 considered tokens, depending on the value of a flag
13 (to be implemented).
14
15 - valid characters: any non blankspace and non separator character.
16
17 Each byte read from the input stream is regarded as a character in the range
18 '\\u0000' through '\\u00FF'.
19
20 In addition, an instance has flags that control:
21
22 - whether the characters of tokens are converted to lowercase.
23 - whether separator characters constitute tokens. (TBD)
24
25 A typical application first constructs an instance of this class, supplying
26 the input stream to be tokenized, the set of blankspaces, and the set of
27 eparators, and then repeatedly loops, while method has_more_tokens() returns
28 true, calling the next_token() method.
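
A minimal usage sketch (the sample text is illustrative only; the default
blankspaces and separators come from tipy.char)::

    tokenizer = ForwardTokenizer("The quick brown fox")
    while tokenizer.has_more_tokens():
        print(tokenizer.next_token())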
29 """
30
31 from abc import ABCMeta, abstractmethod
32 from codecs import open as copen
33 from collections import defaultdict
34 from tipy.char import blankspaces, separators
38 """Abstract class for all tokenizers.
39
40 G{classtree Tokenizer}"""
41
42 __metaclass__ = ABCMeta

    def __init__(self, stream, blankspaces=blankspaces,
                 separators=separators):
        """Constructor of the Tokenizer abstract class.

        @param stream:
            The stream to tokenize. Can be a filename or any open IO stream.
        @type stream: str or io.IOBase
        @param blankspaces:
            The characters that represent empty spaces.
        @type blankspaces: str
        @param separators:
            The characters that separate token units (e.g. word boundaries).
        @type separators: str
        """
        self.separators = separators
        self.blankspaces = blankspaces
        self.lowercase = False
        self.offbeg = 0
        self.offset = None
        self.offend = None

    def is_blankspace(self, char):
        """Test if a character is a blankspace.

        @param char:
            The character to test.
        @type char: str

        @return:
            True if character is a blankspace, False otherwise.
        @rtype: bool
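
        Example (assumes the default blankspaces from tipy.char include
        the space character)::

            ForwardTokenizer("a b").is_blankspace(" ")   # True
            ForwardTokenizer("a b").is_blankspace("a")   # False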
74 """
75 if len(char) > 1:
76 raise TypeError("Expected a char.")
77 if char in self.blankspaces:
78 return True
79 else:
80 return False

    def is_separator(self, char):
        """Test if a character is a separator.

        @param char:
            The character to test.
        @type char: str

        @return:
            True if character is a separator, False otherwise.
        @rtype: bool
        """
        if len(char) > 1:
            raise TypeError("Expected a char.")
        return char in self.separators

    @abstractmethod
    def count_chars(self):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def count_tokens(self):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def has_more_tokens(self):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def next_token(self):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def progress(self):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def reset_stream(self):
        raise NotImplementedError("Method must be implemented")


class ForwardTokenizer(Tokenizer):
    """Tokenize a stream from the beginning to the end.

    G{classtree ForwardTokenizer}
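
    A minimal usage sketch (the sample text and flag value are illustrative)::

        tokenizer = ForwardTokenizer("One TWO Three", lowercase=True)
        while tokenizer.has_more_tokens():
            print(tokenizer.next_token())   # prints: one, two, three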
129 """
130
133 """Constructor of the ForwardTokenizer class.
134
135 @warning: When passing IOBase type variable as stream parameter: the
136 read() method is used to read the stream and it can be time
137 consuming. Please don't pass IOBase during the prediction
138 process!
139
140 @param stream:
141 The stream to tokenize. Can be a filename or any open IO stream.
142 @type stream: str or io.IOBase
143 @param blankspaces:
144 The characters that represent empty spaces.
145 @type blankspaces: str
146 @param separators:
147 The characters that separate token units (e.g. word boundaries).
148 @type separators: str
149 """
150 Tokenizer.__init__(self, stream, blankspaces, separators)
151 if type(stream)is str:
152 self.text = stream
153 else:
154 if not hasattr(stream, 'read'):
155 stream = copen(stream, "r", "utf-8")
156 self.text = stream.read()
157 stream.close()
158 self.lowercase = lowercase
159 self.offend = self.count_chars()
160 self.reset_stream()

    def count_tokens(self):
        """Count the tokens in the stream.

        @return:
            The number of tokens in the stream.
        @rtype: int
        """
        curroff = self.offset
        self.reset_stream()
        count = 0
        while self.has_more_tokens():
            self.next_token()
            count += 1
        self.offset = curroff
        return count

    def count_chars(self):
        """Count the number of characters in the stream.

        @note: Should return the same value as the wc Unix command.

        @return:
            The number of characters in the stream.
        @rtype: int
        """
        return len(self.text)

    def has_more_tokens(self):
        """Test if at least one token remains.

        @return:
            True if there is at least one token left in the stream, False
            otherwise.
        @rtype: bool
        """
        return self.offset < self.offend

    def next_token(self):
        """Retrieve the next token in the stream.

        @return:
            Return the next token or '' if there is no next token.
        @rtype: str
        """
        if not self.has_more_tokens():
            return ''
        current = self.text[self.offset]
        token = ''
        if self.offset < self.offend:
            # Skip any leading blankspaces and separators.
            while self.is_blankspace(current) or self.is_separator(current):
                self.offset += 1
                try:
                    current = self.text[self.offset]
                except IndexError:
                    break
            # Accumulate valid characters until the next boundary.
            while not self.is_blankspace(current) and \
                    not self.is_separator(current) and \
                    self.offset < self.offend:
                if self.lowercase:
                    current = current.lower()
                token += current
                self.offset += 1
                try:
                    current = self.text[self.offset]
                except IndexError:
                    break
        return token

    def progress(self):
        """Return the progress percentage.

        @return:
            The tokenization progress percentage.
        @rtype: float
        """
        return float(self.offset) / self.offend * 100

    def reset_stream(self):
        """Reset the offset to 0."""
        self.offset = 0


class ReverseTokenizer(Tokenizer):
    """Tokenize a stream from the end to the beginning.

    G{classtree ReverseTokenizer}
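
    A minimal usage sketch (the sample text is illustrative only)::

        tokenizer = ReverseTokenizer("one two three")
        while tokenizer.has_more_tokens():
            print(tokenizer.next_token())   # prints: three, two, one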
248 """
249
252 """Constructor of the ReverseTokenizer class.
253
254 @param stream:
255 The stream to tokenize. Can be a filename or any open IO stream.
256 @type stream: str or io.IOBase
257 @param blankspaces:
258 The characters that represent empty spaces.
259 @type blankspaces: str
260 @param separators:
261 The characters that separate token units (e.g. word boundaries).
262 @type separators: str
263 """
264 Tokenizer.__init__(self, stream, blankspaces, separators)
265 if type(stream) is str:
266 self.text = stream
267 else:
268 if not hasattr(stream, 'read'):
269 stream = copen(stream, "r", "utf-8")
270 self.text = stream.read()
271 stream.close()
272 self.lowercase = lowercase
273 self.offend = self.count_chars() - 1
274 self.reset_stream()

    def count_tokens(self):
        """Check the number of tokens left.

        @return:
            The number of tokens left.
        @rtype: int
        """
        curroff = self.offset
        self.offset = self.offend
        count = 0
        while self.has_more_tokens():
            self.next_token()
            count += 1
        self.offset = curroff
        return count

    def count_chars(self):
        """Count the number of characters in the stream.

        @note: Should return the same value as the wc Unix command.

        @return:
            The number of characters in the stream.
        @rtype: int
        """
        return len(self.text)

    def has_more_tokens(self):
        """Test if at least one token remains.

        @return:
            True if there is at least one token left in the stream, False
            otherwise. (Keep in mind that the stream is tokenized from the
            end to the beginning.)
        @rtype: bool
        """
        return self.offbeg <= self.offset

    def next_token(self):
        """Retrieve the next token in the stream.

        @note: As this is a reverse tokenizer, the "next" token is what one
            would usually call the "previous" token; in the tokenizer
            workflow it seems more logical to call it the "next" token.

        @return:
            Return the next token or '' if there is no next token.
        @rtype: str
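
        Example (the sample text is illustrative)::

            tokenizer = ReverseTokenizer("one two")
            tokenizer.next_token()   # returns 'two'
            tokenizer.next_token()   # returns 'one'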
327 """
328 if not self.has_more_tokens():
329 return ''
330 token = ""
331 while self.offbeg <= self.offset and len(token) == 0:
332 current = self.text[self.offset]
333 if (self.offset == self.offend) and (self.is_separator(current)
334 or
335 self.is_blankspace(current)):
336 self.offset -= 1
337 return token
338 while (self.is_blankspace(current) or self.is_separator(current)) \
339 and self.offbeg < self.offset:
340 self.offset -= 1
341 if (self.offbeg <= self.offset):
342 current = self.text[self.offset]
343
344 while (not self.is_blankspace(current) and
345 not self.is_separator(current) and
346 self.offbeg <= self.offset):
347 if self.lowercase:
348 current = current.lower()
349 token = current + token
350 self.offset -= 1
351 if (self.offbeg <= self.offset):
352 current = self.text[self.offset]
353 return token

    def progress(self):
        """Return the progress percentage.

        @return:
            The tokenization progress percentage.
        @rtype: float
        """
        return float(self.offend - self.offset) / \
            (self.offend - self.offbeg) * 100

    def reset_stream(self):
        """Reset the offset to the end offset."""
        self.offset = self.offend


class TextTokenizer(Tokenizer):
    """Tokenizer to tokenize a text file.

    This tokenizer receives a text file and generates n-grams of a given
    size "n". It is useful to the L{text miner<minr.TextMiner>} in order to
    generate n-grams to be inserted in a database.

    G{classtree TextTokenizer}
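
    A minimal usage sketch (the filename and n-gram size are illustrative)::

        tokenizer = TextTokenizer("corpus.txt", 3, lowercase=True)
        ngram_map = tokenizer.tknize_text()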
377 """
378
379 - def __init__(self, infile, n, lowercase=False, cutoff=0, callback=None):
380 """TextTokenizer creator.
381
382 @param infile:
383 Path to the file to tokenize.
384 @type infile: str
385 @param n:
386 The n in n-gram. Specify the maximum n-gram size to be created.
387 @type n: int
388 @param lowercase:
389 If True: all tokens are convert to lowercase before being added to
390 the dictionary.
391 If False: tokens case remains untouched.
392 @type lowercase: bool
393 @param cutoff:
394 Set the minimum number of token occurences. If a token dosen't
395 appear more than this number it is removed from the dictionary
396 before it is returned.
397 @type cutoff: int
398 """
399 self.infile = infile
400 self.n = n
401 self.lowercase = lowercase
402 self.cutoff = cutoff
403 self.callback = callback

    def tknize_text(self):
        """Tokenize a file and return a dictionary mapping its n-grams.

        The dictionary looks like::

            { ('in', 'the', 'second'): 4,
              ('right', 'hand', 'of'): 1,
              ('subject', 'to', 'the'): 2,
              ('serious', 'rebuff', 'in'): 1,
              ('spirit', 'is', 'the'): 1 }
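
        Example call (the filename is illustrative)::

            ngram_map = TextTokenizer("corpus.txt", 2, cutoff=1).tknize_text()
            # n-grams that occur only once are dropped because cutoff is 1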
414 """
415 ngramMap = defaultdict(int)
416 ngramList = []
417 tokenizer = ForwardTokenizer(open(self.infile), self.lowercase)
418 for i in range(self.n - 1):
419 if not tokenizer.has_more_tokens():
420 break
421 ngramList.append(tokenizer.next_token())
422 while tokenizer.has_more_tokens():
423 if self.callback:
424 self.callback(tokenizer.progress())
425 token = tokenizer.next_token()
426 ngramList.append(token)
427 ngramMap[tuple(ngramList)] += 1
428 ngramList.pop(0)
429 if self.cutoff > 0:
430 for k in ngramMap.keys():
431 if ngramMap[k] <= self.cutoff:
432 del(ngramMap[k])
433 return ngramMap
434