4 """The Tokenizer class takes an input stream and parses it into tokens.
5
6 The parsing process is controlled by the character classification sets:
7
8 - blankspace characters: characters that mark a token boundary and are not
9 part of the token.
10
11 - separator characters: characters that mark a token boundary and might be
12 considered tokens, depending on the value of a flag
13 (to be implemented).
14
15 - valid characters: any non blankspace and non separator character.
16
17 Each byte read from the input stream is regarded as a character in the range
18 '\\u0000' through '\\u00FF'.
19
20 In addition, an instance has flags that control:
21
22 - whether the characters of tokens are converted to lowercase.
23 - whether separator characters constitute tokens. (TBD)
24
25 A typical application first constructs an instance of this class, supplying
26 the input stream to be tokenized, the set of blankspaces, and the set of
27 eparators, and then repeatedly loops, while method has_more_tokens() returns
28 true, calling the next_token() method.
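
For example (the blankspace and separator sets shown here are illustrative)::

    tokenizer = ForwardTokenizer("Hello, world", " ", ",.!?")
    while tokenizer.has_more_tokens():
        print(tokenizer.next_token())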
29 """

from __future__ import absolute_import, unicode_literals
import abc
import codecs
import collections
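
# Default character-classification sets used by the concrete tokenizers
# below.  These particular values are illustrative defaults (roughly
# whitespace and ASCII punctuation); adjust them to the application's needs.
BLANKSPACES = " \f\n\r\t\v"
SEPARATORS = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"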
39 """Abstract class for all tokenizers.
40
41 G{classtree Tokenizer}"""
42
43 __metaclass__ = abc.ABCMeta
44
47 """Constructor of the Tokenizer abstract class.
48
49 @param stream:
50 The stream to tokenize. Can be a filename or any open IO stream.
51 @type stream: str or io.IOBase
52 @param blankspaces:
53 The characters that represent empty spaces.
54 @type blankspaces: str
55 @param separators:
56 The characters that separate token units (e.g. word boundaries).
57 @type separators: str
58 """
59 self.separators = separators
60 self.blankspaces = blankspaces
61 self.lowercase = False
62 self.offbeg = 0
63 self.offset = None
64 self.offend = None

    def is_blankspace(self, char):
        """Test if a character is a blankspace.

        @param char:
            The character to test.
        @type char: str

        @return:
            True if character is a blankspace, False otherwise.
        @rtype: bool
        """
        if len(char) > 1:
            raise TypeError("Expected a char.")
        return char in self.blankspaces

    def is_separator(self, char):
        """Test if a character is a separator.

        @param char:
            The character to test.
        @type char: str

        @return:
            True if character is a separator, False otherwise.
        @rtype: bool
        """
        if len(char) > 1:
            raise TypeError("Expected a char.")
        return char in self.separators

    @abc.abstractmethod
    def count_tokens(self):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def count_chars(self):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def has_more_tokens(self):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def next_token(self):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def progress(self):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def reset_stream(self):
        raise NotImplementedError("Method must be implemented")


class ForwardTokenizer(Tokenizer):
    """Tokenize a stream from the beginning to the end.
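
    Example (the blankspace and separator sets used here are illustrative)::

        tokenizer = ForwardTokenizer("Hello, world", " ", ",.!?")
        tokenizer.next_token()   # -> "Hello"
        tokenizer.next_token()   # -> "world"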

    G{classtree ForwardTokenizer}
    """

    def __init__(self, stream, blankspaces=BLANKSPACES,
                 separators=SEPARATORS, lowercase=False):
        """Constructor of the ForwardTokenizer class.

        @warning: When passing an IOBase type variable as the stream
            parameter, the read() method is used to read the stream and it
            can be time consuming. Please don't pass IOBase objects during
            the prediction process!

        @param stream:
            The stream to tokenize. Can be the text itself (str) or any open
            IO stream.
        @type stream: str or io.IOBase
        @param blankspaces:
            The characters that represent empty spaces.
        @type blankspaces: str
        @param separators:
            The characters that separate token units (e.g. word boundaries).
        @type separators: str
        @param lowercase:
            If True, tokens are converted to lowercase.
        @type lowercase: bool
        """
        Tokenizer.__init__(self, stream, blankspaces, separators)
        if isinstance(stream, str):
            self.text = stream
        else:
            if not hasattr(stream, 'read'):
                stream = codecs.open(stream, "r", "utf-8")
            self.text = stream.read()
            stream.close()
        self.lowercase = lowercase
        self.offend = self.count_chars()
        self.reset_stream()
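
    # This counterpart of ReverseTokenizer.count_tokens is a sketch: it
    # walks the whole stream and restores the current offset afterwards.
    def count_tokens(self):
        """Count the number of tokens in the stream.

        @return:
            The number of tokens in the stream.
        @rtype: int
        """
        curroff = self.offset
        self.reset_stream()
        count = 0
        while self.has_more_tokens():
            self.next_token()
            count += 1
        self.offset = curroff
        return count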

    def count_chars(self):
        """Count the number of characters in the stream.

        @note: Should return the same value as the wc Unix command.

        @return:
            The number of characters in the stream.
        @rtype: int
        """
        return len(self.text)

    def has_more_tokens(self):
        """Test if at least one token remains.

        @return:
            True if there is at least one token left in the stream, False
            otherwise.
        @rtype: bool
        """
        return self.offset < self.offend

    def next_token(self):
        """Retrieve the next token in the stream.

        @return:
            The next token, or '' if there is no next token.
        @rtype: str
        """
        if not self.has_more_tokens():
            return ''
        current = self.text[self.offset]
        token = ''
        # Skip any leading blankspace and separator characters.
        while self.is_blankspace(current) or self.is_separator(current):
            self.offset += 1
            try:
                current = self.text[self.offset]
            except IndexError:
                break
        # Accumulate valid characters until the next boundary.
        while (not self.is_blankspace(current) and
               not self.is_separator(current) and
               self.offset < self.offend):
            if self.lowercase:
                current = current.lower()
            token += current
            self.offset += 1
            try:
                current = self.text[self.offset]
            except IndexError:
                break
        return token

    def progress(self):
        """Return the progress percentage.

        @return:
            The tokenization progress percentage.
        @rtype: float
        """
        return float(self.offset) / self.offend * 100

    def reset_stream(self):
        """Reset the offset to 0."""
        self.offset = 0


class ReverseTokenizer(Tokenizer):
    """Tokenize a stream from the end to the beginning.
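
    Example (the blankspace and separator sets used here are illustrative;
    note that tokens are returned from the end of the stream first)::

        tokenizer = ReverseTokenizer("Hello, world", " ", ",.!?")
        tokenizer.next_token()   # -> "world"
        tokenizer.next_token()   # -> "Hello"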

    G{classtree ReverseTokenizer}
    """

    def __init__(self, stream, blankspaces=BLANKSPACES,
                 separators=SEPARATORS, lowercase=False):
        """Constructor of the ReverseTokenizer class.

        @param stream:
            The stream to tokenize. Can be the text itself (str) or any open
            IO stream.
        @type stream: str or io.IOBase
        @param blankspaces:
            The characters that represent empty spaces.
        @type blankspaces: str
        @param separators:
            The characters that separate token units (e.g. word boundaries).
        @type separators: str
        @param lowercase:
            If True, tokens are converted to lowercase.
        @type lowercase: bool
        """
        Tokenizer.__init__(self, stream, blankspaces, separators)
        if isinstance(stream, str):
            self.text = stream
        else:
            if not hasattr(stream, 'read'):
                stream = codecs.open(stream, "r", "utf-8")
            self.text = stream.read()
            stream.close()
        self.lowercase = lowercase
        self.offend = self.count_chars() - 1
        self.reset_stream()

    def count_tokens(self):
        """Count the number of tokens left.

        @return:
            The number of tokens left.
        @rtype: int
        """
        curroff = self.offset
        self.offset = self.offend
        count = 0
        while self.has_more_tokens():
            self.next_token()
            count += 1
        self.offset = curroff
        return count

    def count_chars(self):
        """Count the number of characters in the stream.

        @note: Should return the same value as the wc Unix command.

        @return:
            The number of characters in the stream.
        @rtype: int
        """
        return len(self.text)

    def has_more_tokens(self):
        """Test if at least one token remains.

        @return:
            True if there is at least one token left in the stream, False
            otherwise. (Keep in mind that the stream is tokenized from the
            end to the beginning.)
        @rtype: bool
        """
        return self.offbeg <= self.offset

    def next_token(self):
        """Retrieve the next token in the stream.

        @note: As this is a reverse tokenizer, the "next" token is what one
            would usually call the "previous" token, but within the
            tokenizer workflow it is more consistent to call it the "next"
            token.

        @return:
            The next token, or '' if there is no next token.
        @rtype: str
        """
        if not self.has_more_tokens():
            return ''
        token = ""
        while self.offbeg <= self.offset and len(token) == 0:
            current = self.text[self.offset]
            if self.offset == self.offend and (
                    self.is_separator(current) or
                    self.is_blankspace(current)):
                self.offset -= 1
                return token
            # Skip blankspace and separator characters backwards.
            while (self.is_blankspace(current) or
                   self.is_separator(current)) and self.offbeg < self.offset:
                self.offset -= 1
                if self.offbeg <= self.offset:
                    current = self.text[self.offset]
            # Accumulate valid characters, prepending them to the token.
            while (not self.is_blankspace(current) and
                   not self.is_separator(current) and
                   self.offbeg <= self.offset):
                if self.lowercase:
                    current = current.lower()
                token = current + token
                self.offset -= 1
                if self.offbeg <= self.offset:
                    current = self.text[self.offset]
        return token

    def progress(self):
        """Return the progress percentage.

        @return:
            The tokenization progress percentage.
        @rtype: float
        """
        return (float(self.offend - self.offset) /
                (self.offend - self.offbeg) * 100)

    def reset_stream(self):
        """Reset the offset to the end offset."""
        self.offset = self.offend


class TextTokenizer(Tokenizer):
    """Tokenizer to tokenize a text file.

    This tokenizer receives a text file and generates n-grams of a given
    size "n".  It is useful to the L{text miner<minr.TextMiner>} in order to
    generate n-grams to be inserted in a database.
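
    A short usage sketch (the file name here is only illustrative)::

        tokenizer = TextTokenizer("corpus.txt", 3, lowercase=True, cutoff=1)
        ngram_map = tokenizer.tknize_text()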

    G{classtree TextTokenizer}
    """

    def __init__(self, infile, n, lowercase=False, cutoff=0, callback=None):
        """TextTokenizer creator.

        @param infile:
            Path to the file to tokenize.
        @type infile: str
        @param n:
            The n in n-gram. Specify the maximum n-gram size to be created.
        @type n: int
        @param lowercase:
            If True: all tokens are converted to lowercase before being
            added to the dictionary.
            If False: the tokens' case remains untouched.
        @type lowercase: bool
        @param cutoff:
            The minimum number of token occurrences. If a token doesn't
            appear more than this number of times, it is removed from the
            dictionary before it is returned.
        @type cutoff: int
        @param callback:
            Optional callable invoked with the tokenization progress
            percentage while the file is being tokenized.
        @type callback: callable
        """
        self.infile = infile
        self.n = n
        self.lowercase = lowercase
        self.cutoff = cutoff
        self.callback = callback

    def tknize_text(self):
        """Tokenize the file and return a dictionary mapping its n-grams to
        their number of occurrences.

        The dictionary looks like::

            { ('in', 'the', 'second'): 4,
              ('right', 'hand', 'of'): 1,
              ('subject', 'to', 'the'): 2,
              ('serious', 'rebuff', 'in'): 1,
              ('spirit', 'is', 'the'): 1 }

        @return:
            The n-gram dictionary.
        @rtype: dict
        """
        ngramMap = collections.defaultdict(int)
        ngramList = []
        tokenizer = ForwardTokenizer(codecs.open(self.infile, "r", "utf-8"),
                                     lowercase=self.lowercase)
        # Pre-fill the sliding window with the first n - 1 tokens.
        for i in range(self.n - 1):
            if not tokenizer.has_more_tokens():
                break
            ngramList.append(tokenizer.next_token())
        while tokenizer.has_more_tokens():
            if self.callback:
                self.callback(tokenizer.progress())
            token = tokenizer.next_token()
            ngramList.append(token)
            ngramMap[tuple(ngramList)] += 1
            ngramList.pop(0)
        # Drop the n-grams that do not appear often enough.
        if self.cutoff > 0:
            for k in list(ngramMap.keys()):
                if ngramMap[k] <= self.cutoff:
                    del ngramMap[k]
        return ngramMap
437