
Source Code for Module tipy.prdct

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Classes for predictors and for handling suggestions and predictions."""

from sys import exit
from tipy.db import SqliteDatabaseConnector
from tipy.mrgr import ProbabilisticMerger
from abc import ABCMeta, abstractmethod
from tipy.stpl import *
from math import exp
from multiprocessing import Queue, Process
class Suggestion(object):
    """A suggestion consists of a string and a probability.

    The string is in fact a token and the probability is the probability
    associated to this token by the predictor which computed it. The
    probability reflects how likely the predictor considers the word to be.

    G{classtree Suggestion}
    """

    def __init__(self, word, probability):
        """Suggestion creator.

        A suggestion is a couple formed by a suggested word and its
        probability.

        @note: the probabilities computed by the predictors should have the
               same weight. Otherwise the suggestion selection will be
               skewed.

        @param word:
            The suggested word.
        @type word: str
        @param probability:
            The suggested word probability. It is computed by the
            predictors.
        @type probability: float
        """
        self.word = word
        self.probability = probability

    def __eq__(self, other):
        """Override the == operator in order to compare instances equality.

        Two Suggestion instances are equal if their word and probability
        are equal.

        @param other:
            The Suggestion instance to compare to this one (self).
        @type other: L{Suggestion}

        @return:
            True if the two instances are equal, False otherwise.
        @rtype: bool
        """
        return (self.word == other.word and
                self.probability == other.probability)

    def __lt__(self, other):
        """Override the < operator in order to compare instances.

        A Suggestion instance is less than another if its probability is
        less than the probability of the other. If their probabilities are
        equal, the Suggestion instance is less than the other if its word
        comes alphabetically before the word of the other instance.

        @param other:
            The Suggestion instance to compare to this one (self).
        @type other: L{Suggestion}

        @return:
            True if this instance (self) is less than the other one, False
            otherwise.
        @rtype: bool
        """
        if self.probability < other.probability:
            return True
        if self.probability == other.probability:
            return self.word < other.word
        return False

    def __gt__(self, other):
        """Override the > operator in order to compare instances.

        A Suggestion instance is greater than another if its probability is
        greater than the probability of the other. If their probabilities
        are equal, the Suggestion instance is greater than the other if its
        word comes alphabetically after the word of the other instance.

        @param other:
            The Suggestion instance to compare to this one (self).
        @type other: L{Suggestion}

        @return:
            True if this instance (self) is greater than the other one,
            False otherwise.
        @rtype: bool
        """
        if self.probability > other.probability:
            return True
        if self.probability == other.probability:
            return self.word > other.word
        return False

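The comparison overrides give suggestions a total order: probability first,
alphabetical order on ties. A quick illustrative sketch (the words and
probabilities are made up)::

    Suggestion('help', 0.7) > Suggestion('hello', 0.4)     # True
    Suggestion('hellish', 0.4) < Suggestion('hello', 0.4)  # True: tie broken
                                                           # alphabetically
    Suggestion('hello', 0.4) == Suggestion('hello', 0.4)   # True
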
class Prediction(list):
    """Class for predictions from predictors.

    A Prediction instance holds multiple Suggestion instances. It is a list
    of Suggestion instances and the list is kept ordered according to the
    suggestions probabilities.
    Every predictor should return a Prediction instance.

    G{classtree Prediction}
    """

    def __init__(self):
        """Prediction creator."""
        pass

    def __eq__(self, other):
        """Override the == operator in order to compare instances.

        Two Prediction instances are equal if they contain the same items
        (and thus have the same length).
        """
        if self is other:
            return True
        if len(self) != len(other):
            return False
        for i, s in enumerate(other):
            if not s == self[i]:
                return False
        return True

    def add_suggestion(self, suggestion):
        """Add a suggestion to the Prediction list.

        The suggestion is added at the correct index so that the Prediction
        list remains ordered by descending probability.

        @note: Using insert() and a while loop seems a little faster than
               using sorted(). Also, using insort from the bisect module
               seems to produce similar benchmarks.
        """
        if len(self) == 0:
            self.append(suggestion)
        else:
            i = 0
            while i < len(self) and suggestion < self[i]:
                i += 1
            self.insert(i, suggestion)

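For instance, suggestions inserted in any order come back sorted by
descending probability (illustrative values)::

    p = Prediction()
    p.add_suggestion(Suggestion('cat', 0.2))
    p.add_suggestion(Suggestion('car', 0.5))
    p.add_suggestion(Suggestion('can', 0.3))
    [s.word for s in p]    # ['car', 'can', 'cat']
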
class PredictorActivator(object):
    """Query the predictors listed in the registry to get their suggestions.

    This class has access to a PredictorRegistry. It calls the predict()
    method of every predictor listed in the PredictorRegistry, stores the
    resulting Prediction instances, merges them into a single Prediction
    instance and returns it.

    G{classtree PredictorActivator}
    """

    def __init__(self, config, predictorRegistry):
        """PredictorActivator creator.

        @param config:
            The configuration dictionary is used in order to retrieve the
            PredictorActivator settings from the config file.
        @type config: L{drvr.Configuration}
        @param predictorRegistry:
            The class needs to access the PredictorRegistry to call the
            predictors' predict() methods.
        @type predictorRegistry: L{PredictorRegistry}
        """
        self.config = config
        self.predictorRegistry = predictorRegistry
        self.predictionList = []
        self.maxPartialPredictionSize = self.config.getas(
            'PredictorActivator', 'max_partial_prediction_size', 'int') + 1
        self.mergingMethod = self.config.getas(
            'PredictorActivator', 'merging_method')
        self.stopListFile = self.config.getas(
            'PredictorActivator', 'stoplist')
        self.stopList = StopList(self.stopListFile)
        if self.mergingMethod.lower() == "probabilistic":
            self.merger = ProbabilisticMerger()
        else:
            # The original called the undefined ``lg.error``; lg_error (used
            # everywhere else in this module, presumably re-exported by the
            # tipy.stpl star import) is assumed to be the intended call.
            lg_error('Unknown merging method')
            exit(1)

    def pred_worker(self, predictor, queue, factor):
        """Worker function for the predictor predict() methods.

        This method is used as the target of the predictor worker processes.
        It pushes the result of the predictor's
        L{prdct.Predictor.predict} method (a L{Prediction} instance) into a
        queue (which is used because it can safely be shared between
        processes).

        @param predictor:
            The Predictor based class instance.
        @type predictor: L{Predictor} based class.
        @param queue:
            A queue in which the result will be pushed.
        @type queue: multiprocessing.Queue
        @param factor:
            A factor used to increase the number of suggestions.
        @type factor: int
        """
        queue.put(predictor.predict(
            self.maxPartialPredictionSize * factor, self.stopList.words))

    def predict(self, factor=1):
        """Build a list of every predicted word.

        Call the predict() method of every predictor in the registry, then
        merge their Predictions into a single Prediction instance.

        @change:
            - 16/06/15: The method now uses multi-processing. It
              concurrently runs every predictor's predict() method, which
              allows a significant speedup. The queue is used because it can
              be shared between processes. The point is that when the worker
              args are passed to L{PredictorActivator.pred_worker()}, they
              are packed up with pickle, shipped to the other process, then
              unpacked and used. A plain list would not be shared between
              processes but cloned.

        @note:
            Using multi-processing allows a significant speed boost. The
            following benchmark was made by running 100 * 10 different
            context predictions::

                Total time without multi-processing: 86.785 s
                Total time with multi-processing:    76.513 s

        @todo 0.0.9:
            Daemonize the processes and set a timeout value. When the time
            runs out, the unfinished workers return their results as is.
            This can alter the prediction quality but totally avoids any
            possible "slow predictions".

        @param factor:
            A factor used to increase the number of suggestions.
        @type factor: int

        @return:
            The merged Prediction instance containing every suggestion of
            every Prediction instance, sorted in descending order according
            to their probabilities.
        @rtype: L{Prediction}
        """
        self.predictionList[:] = []
        jobs = []
        queue = Queue()
        for predictor in self.predictorRegistry:
            p = Process(
                target=self.pred_worker, args=(predictor, queue, factor,))
            jobs.append(p)
            p.start()
        # Drain the queue before joining: a child process only exits once
        # its queued data is flushed, so joining first (as the original code
        # did) may deadlock when the queued Prediction instances are large.
        for _ in range(len(jobs)):
            self.predictionList.append(queue.get())
        for job in jobs:
            job.join()
        return self.merger.merge(self.predictionList)

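The worker/queue pattern used by predict(), reduced to a standalone sketch
(the work() function and its squaring job are hypothetical)::

    from multiprocessing import Process, Queue

    def work(n, queue):
        queue.put(n * n)    # the result crosses the process boundary

    if __name__ == '__main__':
        queue = Queue()
        jobs = [Process(target=work, args=(i, queue)) for i in range(4)]
        for job in jobs:
            job.start()
        results = [queue.get() for _ in jobs]    # drain before join
        for job in jobs:
            job.join()
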
class PredictorRegistry(list):
    """List every predictor instance that is to be used for word prediction.

    G{classtree PredictorRegistry}
    """

    def __init__(self, config):
        """PredictorRegistry creator.

        @param config:
            config is used to retrieve the PredictorRegistry settings and
            each Predictor settings from the config file. It also needs to
            be passed to the predictor instances to allow them to retrieve
            their settings from the config file too.
        @type config: L{drvr.Configuration}
        """
        self._contextMonitor = None
        self.config = config
        self.contextMonitor = None

    def contextMonitor():

        def fget(self):
            return self._contextMonitor

        def fset(self, value):
            if self._contextMonitor is not value:
                self._contextMonitor = value
                self[:] = []
                self.set_predictors()

        def fdel(self):
            del self._contextMonitor

        return locals()
    contextMonitor = property(**contextMonitor())

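contextMonitor above uses the property(**locals()) idiom: the inner
fget/fset/fdel functions are collected by the locals() call and unpacked as
the keyword arguments of property(). A minimal standalone sketch of the same
idiom (the Example class is hypothetical)::

    class Example(object):

        def value():
            def fget(self):
                return self._value
            def fset(self, v):
                self._value = v
            return locals()
        value = property(**value())

    e = Example()
    e.value = 3
    e.value    # 3
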
    def set_predictors(self):
        """Read the configuration file and create the needed predictors."""
        if self.contextMonitor:
            self[:] = []
            preds = self.config.getas(
                'PredictorRegistry', 'predictors', 'list')
            for predictor in preds:
                self.add_predictor(predictor)

    def add_predictor(self, predictorName):
        """Create and add a predictor to the list.

        Create a predictor instance according to the predictor name and add
        it to the list.

        @param predictorName:
            The name of the predictor. It is used to retrieve the predictor
            settings from the config. It must correspond to a section of the
            config, otherwise no predictor will be created and added.
        @type predictorName: str
        """
        predictorClass = self.config.getas(predictorName, 'class')
        if predictorClass == 'WeightNgramPredictor':
            predictor = WeightNgramPredictor(
                self.config, self.contextMonitor, predictorName)
        elif predictorClass == 'LastOccurPredictor':
            predictor = LastOccurPredictor(
                self.config, self.contextMonitor, predictorName)
        elif predictorClass == 'MemorizePredictor':
            predictor = MemorizePredictor(
                self.config, self.contextMonitor, predictorName)
        elif predictorClass == 'DictionaryPredictor':
            predictor = DictionaryPredictor(
                self.config, self.contextMonitor, predictorName)
        else:
            predictor = None
        if predictor:
            self.append(predictor)

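For illustration, config sections that add_predictor() could resolve might
look like this (the key names come from the getas() calls in this module;
the section name and the values are hypothetical)::

    [PredictorRegistry]
    predictors = MyNgramPredictor

    [MyNgramPredictor]
    class = WeightNgramPredictor
    database = /path/to/ngrams.db
    DELTAS = 0.01 0.1 0.89
    learn = True
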
    def close_databases(self):
        """Close every opened predictor database."""
        for predictor in self:
            predictor.close_database()


# Python 3 ignores the old ``__metaclass__ = ABCMeta`` attribute the
# original used; declaring the metaclass in the class signature restores
# the intended abstract-base-class behaviour.
class Predictor(metaclass=ABCMeta):
    """Base class for predictors.

    G{classtree Predictor}
    """

    def __init__(self, config, contextMonitor):
        """Predictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffers tokens.
        @type contextMonitor: L{ContextMonitor}
        """
        self.contextMonitor = contextMonitor
        self.name = "Predictor doesn't set any name"
        self.config = config

    @abstractmethod
    def predict(self, maxPartialPredictionSize, stopList):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def learn(self, text):
        raise NotImplementedError("Method must be implemented")

class WeightNgramPredictor(Predictor):
    """Compute predictions from an n-gram model stored in a database.

    G{classtree WeightNgramPredictor}
    """

    def __init__(self, config, contextMonitor, predictorName=None):
        """WeightNgramPredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffers tokens.
        @type contextMonitor: L{ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.db = None
        self.dbFile = self.config.getas(self.name, 'database')
        self.deltas = self.config.getas(self.name, 'DELTAS', 'floatlist')
        self.learnMode = self.config.getas(self.name, 'learn', 'bool')
        self.maxN = len(self.deltas)
        self.init_database_connector()

    def init_database_connector(self):
        """Initialize the database connector.

        Use the database file path and the maximum n-gram size to
        initialize and open the database.
        """
        if self.dbFile and self.maxN > 0:
            self.db = SqliteDatabaseConnector(self.dbFile, self.maxN)

    def predict(self, maxPartialPredictionSize, stopList=[]):
        """Predict the next word according to the current context.

        Use the input buffers (thanks to contextMonitor) and the n-gram
        database to predict the most probable suggestions.
        A suggestion is a word which can:
            - Predict the end of the word, i.e. complete the actual partial
              word (the user has not finished typing the word, we try to
              predict the end of the word).
            - Predict the next word (the user has typed a separator after a
              word, we try to predict the next word before he starts to
              type it).

        In order to compute the suggestions, this method:
            - Retrieves the last n tokens from the left input buffer, where
              n is the maximum n-gram size (max(n)) stored in the database.
            - Loops over each n-gram size from max(n) to 1:
                - Finds the n-grams of the current size in the database
                  which match the last input tokens.
                - Adds each retrieved n-gram to the suggestion list if it is
                  not already in it and if the maximum number of suggestions
                  has not been reached yet.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestions list is immediately returned.
            DatabaseConnector.ngram_table_tp() returns the records in
            descending order according to their number of occurrences, so
            the most probable suggestions are added to the list first.
            This results in no suggestion quality loss, regardless of the
            desired number of suggestions.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{Prediction}
        """
        tokens = [''] * self.maxN
        for i in range(self.maxN):
            tokens[self.maxN - 1 - i] = self.contextMonitor.left_token(i)
        prefixCompletionCandidates = []
        for k in reversed(range(self.maxN)):
            if len(prefixCompletionCandidates) >= maxPartialPredictionSize:
                break
            prefixNgram = tokens[(len(tokens) - k - 1):]
            partial = self.db.ngram_table_tp(
                prefixNgram,
                maxPartialPredictionSize - len(prefixCompletionCandidates))
            for p in partial:
                if len(prefixCompletionCandidates) >= \
                        maxPartialPredictionSize:
                    break
                candidate = p[-2]
                if candidate not in prefixCompletionCandidates:
                    if not candidate.lower() in stopList:
                        prefixCompletionCandidates.append(candidate)
        return self.weight(prefixCompletionCandidates, tokens)

    def weight(self, prefixCompletionCandidates, tokens):
        """Compute the suggestions probabilities and return the best ones.

        The probability of a suggestion is based on its relative frequency
        toward the whole set of suggestions and the number of single tokens
        in the database.

        @param prefixCompletionCandidates:
            List of every suggestion returned by self.predict().
        @type prefixCompletionCandidates: list
        @param tokens:
            The last input tokens.
        @type tokens: list

        @return:
            List of every "good enough" suggestion.
        @rtype: L{Prediction}
        """
        prediction = Prediction()
        unigramCountsSum = self.db.sum_ngrams_occ(1)
        for candidate in prefixCompletionCandidates:
            tokens[self.maxN - 1] = candidate
            probability = 0
            for k in range(self.maxN):
                numerator = self.count(tokens, 0, k + 1)
                denominator = unigramCountsSum
                if numerator > 0:
                    denominator = self.count(tokens, -1, k)
                frequency = 0
                if denominator > 0:
                    frequency = float(numerator) / denominator
                probability += self.deltas[k] * frequency
            if probability > 0:
                prediction.add_suggestion(
                    Suggestion(tokens[self.maxN - 1], probability))
        return prediction

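A worked example of the interpolation in weight(), with made-up counts and
deltas = [0.1, 0.9] (so maxN = 2): if the unigram ('table',) occurs 50
times out of 10000 unigram occurrences, and the bigram ('white', 'table')
occurs 8 times while ('white',) occurs 20 times, then the candidate 'table'
typed after 'white' gets::

    probability = 0.1 * (50 / 10000.) + 0.9 * (8 / 20.)
                = 0.0005 + 0.36
                = 0.3605
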
    def close_database(self):
        """Close the predictor's database."""
        # The original called self.close_database() here, which recurses
        # forever; closing the connector is presumably what was intended.
        self.db.close_database()

    def learn(self, change):
        """Learn what needs to be learnt by adding n-grams to the database.

        @param change:
            The part of the left input buffer which represents the last
            change, as a list of tokens.
        @type change: list
        """
        if self.learnMode is False:
            return
        ngramMap = self.make_ngram_map(change)
        ngramMap = self.prefix_ngrams_with_input(change, ngramMap)
        self.push_ngrams_in_db(ngramMap)

    def make_ngram_map(self, change):
        """Create a map associating n-grams (tuples of words) and their
        count.

        @param change:
            The part of the left input buffer which represents the last
            change, as a list of tokens.
        @type change: list
        """
        ngramMap = {}
        for curCard in range(1, self.maxN + 1):
            changeIdx = 0
            changeSize = len(change)
            ngramList = ()
            for i in range(curCard - 1):
                if changeIdx >= changeSize:
                    break
                ngramList = ngramList + (change[changeIdx],)
                changeIdx += 1
            while changeIdx < changeSize:
                ngramList = ngramList + (change[changeIdx],)
                changeIdx += 1
                try:
                    ngramMap[ngramList] = ngramMap[ngramList] + 1
                except KeyError:
                    ngramMap[ngramList] = 1
                ngramList = ngramList[1:]
        return ngramMap

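For instance, with self.maxN = 2 and change = ['the', 'white', 'table'],
tracing the loops above yields every unigram and bigram of the change::

    {('the',): 1, ('white',): 1, ('table',): 1,
     ('the', 'white'): 1, ('white', 'table'): 1}
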
    def prefix_ngrams_with_input(self, change, ngramMap):
        """Use the left input buffer to expand the n-gram map.

        This method calls L{cntxt.ContextMonitor.previous_tokens} to get
        the tokens of the left input buffer that come just before the
        change and adds them BEFORE the change n-grams generated by
        L{self.make_ngram_map}.

        For instance, if the current left input buffer is::
            "phone is on the white table "

        And change is::
            ["table"]

        Then the n-gram map generated by self.make_ngram_map() will be::
            {("table",): 1}

        The n-gram map contains a single n-gram of size 1, so this method
        adds the tokens preceding the change in the left input buffer to
        form n-grams of size 2 and more (until it reaches self.maxN)::
            {("the", "white", "table"): 1, ("white", "table"): 1,
             ("table",): 1}

        @param change:
            The part of the left input buffer which represents the last
            change, as a list of tokens.
        @type change: list
        @param ngramMap:
            Dictionary associating n-grams with their number of
            occurrences, generated by self.make_ngram_map().
        @type ngramMap: dict

        @return:
            The extended n-gram dictionary.
        @rtype: dict
        """
        changeMatchInput = (
            change and
            change[-1] == self.contextMonitor.left_token(1) and
            self.contextMonitor.left_token(len(change)))
        if changeMatchInput:
            ngramList = tuple(change[:1])
            tkIdx = 1
            while len(ngramList) < self.maxN:
                extraToken = self.contextMonitor.previous_tokens(
                    tkIdx, change)
                if not extraToken:
                    break
                ngramList = (extraToken,) + ngramList
                try:
                    ngramMap[ngramList] = ngramMap[ngramList] + 1
                except KeyError:
                    ngramMap[ngramList] = 1
                tkIdx += 1
        return ngramMap

    def push_ngrams_in_db(self, ngramMap):
        """Update the database with the n-grams contained in the n-gram map.

        Each n-gram of the n-gram map is pushed into the database with its
        number of occurrences (count).
        If the n-gram is already in the database, its count (number of
        occurrences) is updated. If the n-gram is not in the database, it
        is simply inserted.

        @param ngramMap:
            Dictionary associating n-grams with their number of
            occurrences, generated by L{self.make_ngram_map} and modified
            by L{self.prefix_ngrams_with_input}.
        @type ngramMap: dict
        """
        for ngram in ngramMap:
            count = self.db.ngram_count(ngram)
            if count > 0:
                self.db.update_ngram(ngram, count + ngramMap[ngram])
            else:
                self.db.insert_ngram(list(ngram), ngramMap[ngram])
        self.db.commit()

    def count(self, tokens, offset, n):
        """Make an n-gram, then retrieve and return its 'count' entry in
        the database.

        @param tokens:
            The tokens used to make the n-gram.
        @type tokens: list
        @param offset:
            Offset of the first token in the tokens.
        @type offset: int
        @param n:
            Size of the n-gram.
        @type n: int
        """
        if n > 0:
            ngram = tokens[len(tokens) - n + offset:len(tokens) + offset]
            result = self.db.ngram_count(ngram)
        else:
            result = self.db.sum_ngrams_occ(1)
        return result

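For instance, with tokens = ['the', 'white', 'table'], the slicing above
gives exactly the numerators and denominators that weight() needs::

    tokens = ['the', 'white', 'table']
    tokens[len(tokens) - 2 + 0:len(tokens) + 0]     # n=2, offset=0:
                                                    # ['white', 'table']
    tokens[len(tokens) - 1 + (-1):len(tokens) - 1]  # n=1, offset=-1:
                                                    # ['white']
    # n=0 falls back to the total number of unigram occurrences.
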
class LastOccurPredictor(Predictor):
    """Compute predictions based on the tokens last occurrences and
    frequencies.

    G{classtree LastOccurPredictor}
    """

    def __init__(self, config, contextMonitor, predictorName=None):
        """LastOccurPredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffers tokens.
        @type contextMonitor: L{ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.lambdav = self.config.getas(self.name, 'lambda', 'int')
        self.n0 = self.config.getas(self.name, 'n_0', 'int')
        self.cutoffThreshold = self.config.getas(
            self.name, 'cutoff_threshold', 'int')

    def predict(self, maxPartialPredictionSize, stopList=[]):
        """Compute the predictions using a simple exponential decay method.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestions list is immediately returned.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{Prediction}
        """
        result = Prediction()
        prefix = self.contextMonitor.prefix()
        if prefix:
            index = 1
            token = self.contextMonitor.left_token(index)
            prob = 0
            while (token and
                   len(result) < maxPartialPredictionSize and
                   index <= self.cutoffThreshold):
                if token.startswith(prefix):
                    if not token.lower() in stopList:
                        prob = self.n0 * exp(- (self.lambdav * (index - 1)))
                        result.add_suggestion(Suggestion(token, prob))
                index += 1
                token = self.contextMonitor.left_token(index)
        return result

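The decay gives the most recently typed matching token the highest score.
For example, with n_0 = 1 and lambda = 1 (illustrative settings)::

    from math import exp

    [round(1 * exp(-(1 * (index - 1))), 3) for index in (1, 2, 3)]
    # [1.0, 0.368, 0.135]
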
    def learn(self, text):
        """This predictor has no ability to learn."""
        pass

class MemorizePredictor(Predictor):
    """Predict words based on memorized (learnt) input token patterns.

    This predictor is capable of token memorization. It memorizes the input
    tokens and tries to predict suggestions using the memorized tokens and
    n-grams (groups of consecutive tokens).

    G{classtree MemorizePredictor}
    """

    def __init__(self, config, contextMonitor, predictorName=None):
        """MemorizePredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffers tokens.
        @type contextMonitor: L{ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.memory = self.config.getas(self.name, 'memory')
        self.trigger = self.config.getas(self.name, 'trigger', 'int')
        self.learnMode = self.config.getas(self.name, 'learn', 'bool')

    def predict(self, maxPartialPredictionSize, stopList):
        """Predict words based on memorized input tokens.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{Prediction}
        """
        result = Prediction()
        memTrigger = []
        try:
            memFile = open(self.memory, 'r')
        except FileNotFoundError:
            lg_error('Cannot open file ' + self.memory)
            return result
        if self.init_mem_trigg(memTrigger):
            rollingWindow = []
            if self.init_rolling_window(rollingWindow, memFile):
                # Scan the remaining memorized tokens, one per line, as
                # written by learn(). The original looped on
                # memFile.write(token), which never reads anything; reading
                # line by line is presumably what was intended.
                for line in memFile:
                    token = line.strip('\n')
                    if memTrigger == rollingWindow:
                        if not token.lower() in stopList:
                            result.add_suggestion(Suggestion(token, 1.))
                    self.update_rolling_window(rollingWindow, token)
        memFile.close()
        return result

    def learn(self, change):
        """Learn what needs to be learnt by appending tokens to the memory
        file.

        @param change:
            The part of the left input buffer which represents the last
            change, as a list of tokens.
        @type change: list
        """
        if self.learnMode is False:
            return
        try:
            memFile = open(self.memory, 'a')
        except FileNotFoundError:
            lg_error('Cannot open file ' + self.memory)
            return
        for tok in change:
            memFile.write(tok + '\n')
        memFile.close()

    def init_mem_trigg(self, memTrigger):
        """Fill memTrigger with the last self.trigger input tokens.

        Return True if enough tokens are available, False otherwise.
        """
        result = False
        for i in range(self.trigger, 0, -1):
            memTrigger.append(self.contextMonitor.left_token(i))
        if not '' in memTrigger:
            result = True
        return result

    def init_rolling_window(self, rollingWindow, memFile):
        """Fill the rolling window with the first self.trigger memorized
        tokens.

        The window is a list mutated in place so that the caller sees the
        result (the original used a string and read the whole file at once,
        which left the window empty and consumed the tokens that predict()
        still had to scan).
        """
        count = 0
        while count < self.trigger:
            token = memFile.readline().strip('\n')
            if not token:
                break
            rollingWindow.append(token)
            count += 1
        return count == self.trigger

    def update_rolling_window(self, rollingWindow, token):
        """Slide the window: drop the oldest token, append the new one.

        The list is mutated in place (the original rebound a local name,
        which had no effect for the caller).
        """
        del rollingWindow[0]
        rollingWindow.append(token)

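A minimal standalone sketch of the sliding-window matching used above (the
memorized tokens and trigger values are hypothetical)::

    memorized = ['the', 'white', 'table', 'on', 'the', 'white', 'chair']
    trigger = ['the', 'white']    # the last tokens the user typed
    window = memorized[:2]
    suggestions = []
    for token in memorized[2:]:
        if window == trigger:
            suggestions.append(token)
        del window[0]
        window.append(token)
    # suggestions == ['table', 'chair']
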
class DictionaryPredictor(Predictor):
    """Very simple word predictor using a dictionary.

    The dictionary is a file containing one word per line. This predictor
    does not use n-grams and is therefore less effective than the
    predictors using n-grams, because it does not consider context.

    G{classtree DictionaryPredictor}
    """

    def __init__(self, config, contextMonitor, predictorName):
        """DictionaryPredictor creator.

        @note: The string.lower() and string.strip() methods have a great
            impact on performance (the profile module shows that they
            require almost 1 second of processing time when computing
            suggestions for 10 contexts). So this constructor no longer
            uses the dictionary file directly. A database is created
            instead.
            Every word of the dictionary is lowered and stripped, then
            added to the database.
            This way the performance of the predictor is much better.
            Profiling a script querying suggestions for 10 successive
            contexts shows the improvement:
                - lower()ing and strip()ping each word of the file on each
                  predict() call::
                      ncalls  tottime  percall  cumtime  percall filename:lineno
                      690048    0.468    0.000    0.468    0.000 :0(lower)
                - Creating an improved list upon initialization and using
                  it on each predict() call (previous optimization
                  method)::
                      ncalls  tottime  percall  cumtime  percall filename:lineno
                      100046    0.059    0.000    0.059    0.000 :0(lower)
                  It is approx. 800% faster. But this profiling mixes
                  initialization and later computation. It means that most
                  of the time of the previous profiling line is spent
                  initializing the list; the computations on each predict()
                  call benefit even more.
                - Creating a database and querying it on each predict()
                  call::
                      ncalls  tottime  percall  cumtime  percall filename:lineno
                      100046    0.059    0.000    0.059    0.000 :0(lower)
                  It is not faster than the previous method, but the
                  database must only be created once. Once it is created,
                  the initialization time is (nearly) zero and the querying
                  time on each predict() call is even faster.

        @change:
            - 08/06/15: The method now creates an ordered optimized list
              containing the dictionary words upon initialization in order
              to increase the speed of the predictor.
            - 13/06/15: The method now uses a database containing the
              dictionary words. See: L{minr.DictMiner}

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffers tokens.
        @type contextMonitor: L{ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.dbFile = self.config.getas(self.name, 'database')
        self.db = None
        self.prob = self.config.getas(self.name, 'probability', 'float')
        self.init_database_connector()

    def init_database_connector(self):
        """Initialize the database connector.

        Use the database file path to initialize and open the database
        (this predictor's connector needs neither a maximum n-gram size nor
        a learn mode).
        """
        if self.dbFile:
            self.db = SqliteDatabaseConnector(self.dbFile)

    def get_dict_range(self, prefix):
        """Select the dictionary range whose words start with the given
        prefix.

        A suggested word must complete the given token, which means that
        suggested words all start with this token, here called the prefix.
        This method creates a list containing the suggested words for the
        given prefix, i.e. every word of the dictionary list starting with
        the prefix.
        This is easy because the dictionary list is ordered. For instance:

        If the prefix is::
            'hell'

        And the dictionary list is::
            ['bird', 'blue', 'given', 'hair', 'hellish', 'hello', 'red',
             'zip']

        We first remove the words of the list one by one until we reach a
        word which actually starts with the prefix 'hell', which leaves::
            ['hellish', 'hello', 'red', 'zip']

        Finally we scan every word of the remaining list, and when we reach
        a word which does not start with the given prefix we know that none
        of the remaining words start with the prefix either, as the list is
        ordered. We end up with::
            ['hellish', 'hello']

        @deprecated: This method has become useless since the words are now
            stored in a database (note that it still relies on a
            self.dictWords attribute which the constructor no longer sets).

        @param prefix:
            The prefix from which the suggested words range is computed.
        @type prefix: str
        """
        rangeWords = []
        for word in self.dictWords:
            if word.startswith(prefix):
                rangeWords = self.dictWords[self.dictWords.index(word):]
                break
        for word in rangeWords:
            if not word.startswith(prefix):
                rangeWords = rangeWords[:rangeWords.index(word)]
                break
        return rangeWords

    def predict(self, maxPartialPredictionSize, stopList):
        """Complete the actual word or predict the next word using the
        dictionary.

        Use the input buffers (thanks to contextMonitor) and the word
        dictionary to predict the most probable suggestions.
        A suggestion is a word which can:
            - Predict the end of the word, i.e. complete the actual partial
              word (the user has not finished typing the word, we try to
              predict the end of the word).
            - Predict the next word (the user has typed a separator after a
              word, we try to predict the next word before he starts to
              type it).

        In order to compute the suggestions, this method:
            - Retrieves the last token from the left input buffer.
            - Loops over the dictionary words:
                - If the word starts with the last retrieved token: add it
                  to the suggestion list if the maximum number of
                  suggestions has not been reached yet.
                  It is not necessary to check whether the word is already
                  in the suggestion list because a word should only appear
                  once in a dictionary. In any case, the merger will merge
                  duplicate suggestions.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestions list is immediately returned.
            DatabaseConnector.ngram_table_tp() returns the records in
            descending order according to their number of occurrences, so
            the most probable suggestions are added to the list first.
            This results in no suggestion quality loss, regardless of the
            desired number of suggestions.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{Prediction}
        """
        result = Prediction()
        prefix = self.contextMonitor.prefix().lower()
        count = 0
        candidates = self.db.ngram_table_tp(
            [prefix], maxPartialPredictionSize)
        for candidate in candidates:
            if count >= maxPartialPredictionSize:
                break
            candidate = candidate[-2]
            if not candidate in stopList:
                result.add_suggestion(Suggestion(candidate, self.prob))
            count += 1
        return result

    def learn(self, text):
        """This predictor has no ability to learn."""
        pass