4 """The Tokenizer class takes an input stream and parses it into tokens.
5
6 The parsing process is controlled by the character classification sets:
7
8 - blankspace characters: characters that mark a token boundary and are not
9 part of the token.
10
11 - separator characters: characters that mark a token boundary and might be
12 considered tokens, depending on the value of a flag
13 (to be implemented).
14
15 - valid characters: any non blankspace and non separator character.
16
17 Each byte read from the input stream is regarded as a character in the range
18 '\\u0000' through '\\u00FF'.
19
20 In addition, an instance has flags that control:
21
22 - whether the characters of tokens are converted to lowercase.
23 - whether separator characters constitute tokens. (TBD)
24
25 A typical application first constructs an instance of this class, supplying
26 the input stream to be tokenized, the set of blankspaces, and the set of
27 eparators, and then repeatedly loops, while method has_more_tokens() returns
28 true, calling the next_token() method.
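
A minimal usage sketch (the sample text is illustrative only; the default
blankspaces and separators come from tipy.char)::

    tokenizer = ForwardTokenizer("The quick brown fox")
    while tokenizer.has_more_tokens():
        print(tokenizer.next_token())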
29 """
30
31 from abc import ABCMeta, abstractmethod
32 from codecs import open as copen
33 from collections import defaultdict
34 from tipy.char import blankspaces, separators
38 """Abstract class for all tokenizers.
39
40 G{classtree Tokenizer}"""
41
42 __metaclass__ = ABCMeta

    def __init__(self, stream, blankspaces=blankspaces,
                 separators=separators):
        """Constructor of the Tokenizer abstract class.

        @param stream:
            The stream to tokenize. Can be a filename or any open IO stream.
        @type stream: str or io.IOBase
        @param blankspaces:
            The characters that represent empty spaces.
        @type blankspaces: str
        @param separators:
            The characters that separate token units (e.g. word boundaries).
        @type separators: str
        """
        self.separators = separators
        self.blankspaces = blankspaces
        self.lowercase = False
        self.offbeg = 0
        self.offset = None
        self.offend = None

    def is_blankspace(self, char):
        """Test if a character is a blankspace.

        @param char:
            The character to test.
        @type char: str

        @return:
            True if character is a blankspace, False otherwise.
        @rtype: bool
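
        Example (assumes the default blankspaces from tipy.char include
        the space character)::

            ForwardTokenizer("a b").is_blankspace(" ")   # True
            ForwardTokenizer("a b").is_blankspace("a")   # False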
74 """
75 if len(char) > 1:
76 raise TypeError("Expected a char.")
77 if char in self.blankspaces:
78 return True
79 else:
80 return False

    def is_separator(self, char):
        """Test if a character is a separator.

        @param char:
            The character to test.
        @type char: str

        @return:
            True if character is a separator, False otherwise.
        @rtype: bool
        """
        if len(char) > 1:
            raise TypeError("Expected a char.")
        return char in self.separators

    @abstractmethod
    def count_chars(self):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def count_tokens(self):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def has_more_tokens(self):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def next_token(self):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def progress(self):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def reset_stream(self):
        raise NotImplementedError("Method must be implemented")


class ForwardTokenizer(Tokenizer):
    """Tokenize a stream from the beginning to the end.

    G{classtree ForwardTokenizer}
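
    A minimal usage sketch (the sample text and flag value are illustrative)::

        tokenizer = ForwardTokenizer("One TWO Three", lowercase=True)
        while tokenizer.has_more_tokens():
            print(tokenizer.next_token())   # prints: one, two, three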
129 """
130
133 """Constructor of the ForwardTokenizer class.
134
135 @warning: When passing IOBase type variable as stream parameter: the
136 read() method is used to read the stream and it can be time
137 consuming. Please don't pass IOBase during the prediction
138 process!
139
140 @param stream:
141 The stream to tokenize. Can be a filename or any open IO stream.
142 @type stream: str or io.IOBase
143 @param blankspaces:
144 The characters that represent empty spaces.
145 @type blankspaces: str
146 @param separators:
147 The characters that separate token units (e.g. word boundaries).
148 @type separators: str
149 """
150 Tokenizer.__init__(self, stream, blankspaces, separators)
151 if type(stream)is str:
152 self.text = stream
153 else:
154 if not hasattr(stream, 'read'):
155 stream = copen(stream, "r", "utf-8")
156 self.text = stream.read()
157 stream.close()
158 self.lowercase = lowercase
159 self.offend = self.count_chars()
160 self.reset_stream()

    def count_tokens(self):
        """Count the tokens in the stream.

        @return:
            The number of tokens in the stream.
        @rtype: int
        """
        curroff = self.offset
        self.reset_stream()
        count = 0
        while self.has_more_tokens():
            self.next_token()
            count += 1
        self.offset = curroff
        return count

    def count_chars(self):
        """Count the number of characters in the stream.

        @note: Should return the same value as the wc Unix command.

        @return:
            The number of characters in the stream.
        @rtype: int
        """
        return len(self.text)

    def has_more_tokens(self):
        """Test if at least one token remains.

        @return:
            True if there is at least one token left in the stream, False
            otherwise.
        @rtype: bool
        """
        return self.offset < self.offend

    def next_token(self):
        """Retrieve the next token in the stream.

        @return:
            Return the next token or '' if there is no next token.
        @rtype: str
        """
        if not self.has_more_tokens():
            return ''
        current = self.text[self.offset]
        token = ''
        if self.offset < self.offend:
            # Skip any leading blankspaces and separators.
            while self.is_blankspace(current) or self.is_separator(current):
                self.offset += 1
                try:
                    current = self.text[self.offset]
                except IndexError:
                    break
            # Accumulate valid characters until the next boundary.
            while not self.is_blankspace(current) and \
                    not self.is_separator(current) and \
                    self.offset < self.offend:
                if self.lowercase:
                    current = current.lower()
                token += current
                self.offset += 1
                try:
                    current = self.text[self.offset]
                except IndexError:
                    break
        return token

    def progress(self):
        """Return the progress percentage.

        @return:
            The tokenization progress percentage.
        @rtype: float
        """
        return float(self.offset) / self.offend * 100

    def reset_stream(self):
        """Reset the offset to 0."""
        self.offset = 0


class ReverseTokenizer(Tokenizer):
    """Tokenize a stream from the end to the beginning.

    G{classtree ReverseTokenizer}
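
    A minimal usage sketch (the sample text is illustrative only)::

        tokenizer = ReverseTokenizer("one two three")
        while tokenizer.has_more_tokens():
            print(tokenizer.next_token())   # prints: three, two, one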
248 """
249
252 """Constructor of the ReverseTokenizer class.
253
254 @param stream:
255 The stream to tokenize. Can be a filename or any open IO stream.
256 @type stream: str or io.IOBase
257 @param blankspaces:
258 The characters that represent empty spaces.
259 @type blankspaces: str
260 @param separators:
261 The characters that separate token units (e.g. word boundaries).
262 @type separators: str
263 """
264 Tokenizer.__init__(self, stream, blankspaces, separators)
265 if type(stream) is str:
266 self.text = stream
267 else:
268 if not hasattr(stream, 'read'):
269 stream = copen(stream, "r", "utf-8")
270 self.text = stream.read()
271 stream.close()
272 self.lowercase = lowercase
273 self.offend = self.count_chars() - 1
274 self.reset_stream()

    def count_tokens(self):
        """Check the number of tokens left.

        @return:
            The number of tokens left.
        @rtype: int
        """
        curroff = self.offset
        self.offset = self.offend
        count = 0
        while self.has_more_tokens():
            self.next_token()
            count += 1
        self.offset = curroff
        return count

    def count_chars(self):
        """Count the number of characters in the stream.

        @note: Should return the same value as the wc Unix command.

        @return:
            The number of characters in the stream.
        @rtype: int
        """
        return len(self.text)

    def has_more_tokens(self):
        """Test if at least one token remains.

        @return:
            True if there is at least one token left in the stream, False
            otherwise. (Keep in mind that the stream is tokenized from the
            end to the beginning.)
        @rtype: bool
        """
        return self.offbeg <= self.offset

    def next_token(self):
        """Retrieve the next token in the stream.

        @note: As this is a reverse tokenizer, the "next" token is what one
            would usually call the "previous" token; in the tokenizer
            workflow it seems more logical to call it the "next" token.

        @return:
            Return the next token or '' if there is no next token.
        @rtype: str
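
        Example (the sample text is illustrative)::

            tokenizer = ReverseTokenizer("one two")
            tokenizer.next_token()   # returns 'two'
            tokenizer.next_token()   # returns 'one'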
327 """
328 if not self.has_more_tokens():
329 return ''
330 token = ""
331 while self.offbeg <= self.offset and len(token) == 0:
332 current = self.text[self.offset]
333 if (self.offset == self.offend) and (self.is_separator(current)
334 or
335 self.is_blankspace(current)):
336 self.offset -= 1
337 return token
338 while (self.is_blankspace(current) or self.is_separator(current)) \
339 and self.offbeg < self.offset:
340 self.offset -= 1
341 if (self.offbeg <= self.offset):
342 current = self.text[self.offset]
343
344 while (not self.is_blankspace(current) and
345 not self.is_separator(current) and
346 self.offbeg <= self.offset):
347 if self.lowercase:
348 current = current.lower()
349 token = current + token
350 self.offset -= 1
351 if (self.offbeg <= self.offset):
352 current = self.text[self.offset]
353 return token

    def progress(self):
        """Return the progress percentage.

        @return:
            The tokenization progress percentage.
        @rtype: float
        """
        return float(self.offend - self.offset) / \
            (self.offend - self.offbeg) * 100

    def reset_stream(self):
        """Reset the offset to the end offset."""
        self.offset = self.offend


class TextTokenizer(Tokenizer):
    """Tokenizer to tokenize a text file.

    This tokenizer receives a text file and generates n-grams of a given
    size "n". It is useful to the L{text miner<minr.TextMiner>} in order to
    generate n-grams to be inserted in a database.

    G{classtree TextTokenizer}
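
    A minimal usage sketch (the filename and n-gram size are illustrative)::

        tokenizer = TextTokenizer("corpus.txt", 3, lowercase=True)
        ngram_map = tokenizer.tknize_text()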
377 """
378
379 - def __init__(self, infile, n, lowercase=False, cutoff=0, callback=None):
380 """TextTokenizer creator.
381
382 @param infile:
383 Path to the file to tokenize.
384 @type infile: str
385 @param n:
386 The n in n-gram. Specify the maximum n-gram size to be created.
387 @type n: int
388 @param lowercase:
389 If True: all tokens are convert to lowercase before being added to
390 the dictionary.
391 If False: tokens case remains untouched.
392 @type lowercase: bool
393 @param cutoff:
394 Set the minimum number of token occurences. If a token dosen't
395 appear more than this number it is removed from the dictionary
396 before it is returned.
397 @type cutoff: int
398 """
399 self.infile = infile
400 self.n = n
401 self.lowercase = lowercase
402 self.cutoff = cutoff
403 self.callback = callback

    def tknize_text(self):
        """Tokenize a file and return a dictionary mapping its n-grams.

        The dictionary looks like::

            { ('in', 'the', 'second'): 4,
              ('right', 'hand', 'of'): 1,
              ('subject', 'to', 'the'): 2,
              ('serious', 'rebuff', 'in'): 1,
              ('spirit', 'is', 'the'): 1 }
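
        Example call (the filename is illustrative)::

            ngram_map = TextTokenizer("corpus.txt", 2, cutoff=1).tknize_text()
            # n-grams that occur only once are dropped because cutoff is 1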
414 """
415 ngramMap = defaultdict(int)
416 ngramList = []
417 tokenizer = ForwardTokenizer(open(self.infile), self.lowercase)
418 for i in range(self.n - 1):
419 if not tokenizer.has_more_tokens():
420 break
421 ngramList.append(tokenizer.next_token())
422 while tokenizer.has_more_tokens():
423 if self.callback:
424 self.callback(tokenizer.progress())
425 token = tokenizer.next_token()
426 ngramList.append(token)
427 ngramMap[tuple(ngramList)] += 1
428 ngramList.pop(0)
429 if self.cutoff > 0:
430 for k in ngramMap.keys():
431 if ngramMap[k] <= self.cutoff:
432 del(ngramMap[k])
433 return ngramMap
434