
Source Code for Module prest.prdct

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Classes for predictors and to handle suggestions and predictions."""

from __future__ import absolute_import, unicode_literals
import os
import sys
import abc
import logging as lg          # used for error reporting below
import multiprocessing
from math import exp
try:
    import configparser
except ImportError:
    import ConfigParser as configparser

import db
import cmbn
import stpl


MIN_PROBABILITY = 0.0
MAX_PROBABILITY = 1.0
class Suggestion(object):
    """A suggestion consists of a string and a probability.

    The string is in fact a token and the probability is the probability
    associated to this token by the predictor which computed it. The
    probability reflects how accurate the word is as a prediction.

    G{classtree Suggestion}
    """

    def __init__(self, word, probability):
        """Suggestion creator.

        A suggestion is a couple formed by a suggested word and its
        probability.

        @note: the probabilities of each predictor should be on the same
               scale. Otherwise the suggestion selection will be skewed.

        @param word:
            The suggested word.
        @type word: str
        @param probability:
            The suggested word probability. It is computed by the
            predictors.
        @type probability: float
        """
        self.word = word
        self.probability = probability

    def __eq__(self, other):
        """Override the == operator in order to compare instances equality.

        Two Suggestion instances are equal if their word and probability are
        equal.

        @param other:
            The Suggestion instance to compare to this one (self).
        @type other: L{prdct.Suggestion}

        @return:
            True if the two instances are equal, False otherwise.
        @rtype: bool
        """
        return (self.word == other.word and
                self.probability == other.probability)

    def __lt__(self, other):
        """Override the < operator in order to compare instances.

        A Suggestion instance is less than another if its probability is
        less than the probability of the other. If their probabilities are
        equal then the Suggestion instance is less than the other if its
        word is alphabetically 'before' the word of the other instance.

        @param other:
            The Suggestion instance to compare to this one (self).
        @type other: L{prdct.Suggestion}

        @return:
            True if this instance (self) is less than the other one, False
            otherwise.
        @rtype: bool
        """
        if self.probability < other.probability:
            return True
        if self.probability == other.probability:
            return self.word < other.word
        return False

    def __gt__(self, other):
        """Override the > operator in order to compare instances.

        A Suggestion instance is greater than another if its probability is
        greater than the probability of the other. If their probabilities
        are equal then the Suggestion instance is greater than the other if
        its word is alphabetically 'after' the word of the other instance.

        @param other:
            The Suggestion instance to compare to this one (self).
        @type other: L{prdct.Suggestion}

        @return:
            True if this instance (self) is greater than the other one,
            False otherwise.
        @rtype: bool
        """
        if self.probability > other.probability:
            return True
        if self.probability == other.probability:
            return self.word > other.word
        return False

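# A doctest-style sketch of the ordering semantics above (illustrative
# values, not part of the original module):
#
#     >>> a = Suggestion('hello', 0.3)
#     >>> b = Suggestion('help', 0.3)
#     >>> a < b               # equal probabilities fall back to word order
#     True
#     >>> Suggestion('hi', 0.5) > a
#     True
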
class Prediction(list):
    """Class for predictions from predictors.

    A Prediction instance holds multiple Suggestion instances. It is a list
    of Suggestion instances and the list is kept ordered according to the
    suggestions probabilities.
    Every predictor should return a Prediction instance.

    G{classtree Prediction}
    """

    def __init__(self):
        """Prediction creator."""
        list.__init__(self)

    def __eq__(self, other):
        """Override the == operator in order to compare instances.

        Two Prediction instances are equal if they contain the same items
        (thus, have the same length).
        """
        if self is other:
            return True
        if len(self) != len(other):
            return False
        for i, s in enumerate(other):
            if not s == self[i]:
                return False
        return True

    def add_suggestion(self, suggestion):
        """Add a suggestion in the Prediction list.

        The suggestion is added at the correct index so that the Prediction
        list remains ordered (most probable suggestion first).

        @note: Using insert() and a while loop seems a little faster than
               using sorted(). Also, using insort from the bisect module
               seems to produce similar benchmarks.
        """
        if len(self) == 0:
            self.append(suggestion)
        else:
            i = 0
            while i < len(self) and suggestion < self[i]:
                i += 1
            self.insert(i, suggestion)

class PredictorActivator(object):
    """Query the predictors listed in the registry to get their suggestions.

    This class has access to a PredictorRegistry. It asks every predictor
    listed in this PredictorRegistry to call its predict() method, stores
    the resulting Prediction instances, combines them into a single
    Prediction instance and returns it.

    G{classtree PredictorActivator}
    """

    def __init__(self, config, predictorRegistry):
        """PredictorActivator creator.

        @param config:
            The configuration dictionary is used in order to retrieve the
            PredictorActivator settings from the config file.
        @type config: L{drvr.Configuration}
        @param predictorRegistry:
            The class needs to access the PredictorRegistry to call the
            predictors' predict() method.
        @type predictorRegistry: L{prdct.PredictorRegistry}
        """
        self.config = config
        self.predictorRegistry = predictorRegistry
        self.predictionList = []
        self.maxPartialPredictionSize = self.config.getas(
            'PredictorActivator', 'max_partial_prediction_size', 'int') + 1
        self.combinationPolicy = self.config.getas(
            'PredictorActivator', 'combination_policy')
        self.stopListFile = self.config.getas(
            'PredictorActivator', 'stoplist')
        self.stopList = stpl.StopList(self.stopListFile)
        if self.combinationPolicy.lower() == "probabilistic":
            self.combiner = cmbn.ProbabilisticCombiner()
        else:
            lg.error('Unknown combination policy')
            sys.exit(1)

    def pred_worker(self, predictor, queue, factor):
        """Worker function for the predictor predict() methods.

        This method is used as the predictor workers' target. It pushes the
        result of the predictor's L{prdct.Predictor.predict} method (a
        L{prdct.Prediction} instance) into a queue (which is used because
        it is process-safe).

        @param predictor:
            The Predictor based class instance.
        @type predictor: L{prdct.Predictor} based class.
        @param queue:
            A queue in which the result will be pushed.
        @type queue: multiprocessing.Queue
        @param factor:
            A factor used to increase the number of suggestions.
        @type factor: int
        """
        queue.put(predictor.predict(
            self.maxPartialPredictionSize * factor, self.stopList.words))

    def predict(self, factor=1):
        """Build a list of every predicted word.

        Call the predict() method of every predictor in the registry, then
        merge their Prediction instances into a single Prediction instance.

        @change:
            - 16/06/15: The method now uses multi-processing. It
              concurrently runs every predictor's predict() method, which
              allows a significant speedup. The queue is used because it is
              process-safe. The point is that when the worker args are
              passed to L{prdct.PredictorActivator.pred_worker()}, they are
              packed up with pickle, shipped to the other process, then
              unpacked and used. A plain list would not be shared between
              the processes but would be cloned.

        @note:
            Using multi-processing allows a significant speed boost. The
            following benchmark was made by running 100 * 10 different
            context predictions::

                Total time without multi-processing: 86.785 s
                Total time with multi-processing: 76.513 s

        @todo 0.0.2:
            Daemonize the processes and set a timeout value. When the time
            runs out the unfinished workers return their results as is.
            This can alter the prediction quality but totally avoids any
            possible "slow predictions".

        @param factor:
            A factor used to increase the number of suggestions.
        @type factor: int

        @return:
            The combined Prediction instance containing every suggestion of
            every Prediction instance, sorted in descending order according
            to their probabilities.
        @rtype: L{prdct.Prediction}
        """
        self.predictionList[:] = []
        jobs = []
        queue = multiprocessing.Queue()
        for predictor in self.predictorRegistry:
            p = multiprocessing.Process(
                target=self.pred_worker, args=(predictor, queue, factor,))
            jobs.append(p)
            p.start()
        # Drain the queue before joining: join() can deadlock if a worker
        # is still blocked putting a large result on the queue.
        for _ in range(len(jobs)):
            self.predictionList.append(queue.get())
        for job in jobs:
            job.join()
        return self.combiner.combine(self.predictionList)

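# A standalone sketch of the fan-out/collect pattern used by predict()
# above, with a dummy worker in place of the real predictors (illustrative
# only, not part of the original module):
#
#     import multiprocessing
#
#     def _square_worker(n, queue):
#         queue.put(n * n)              # stands in for predictor.predict()
#
#     if __name__ == '__main__':
#         q = multiprocessing.Queue()
#         jobs = [multiprocessing.Process(target=_square_worker,
#                                         args=(n, q))
#                 for n in range(4)]
#         for j in jobs:
#             j.start()
#         results = [q.get() for _ in jobs]   # drain before join()
#         for j in jobs:
#             j.join()
#         print(sorted(results))              # [0, 1, 4, 9]
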
class PredictorRegistry(list):  # observer.Observer,
    """List every predictor instance that is to be used for word prediction.

    G{classtree PredictorRegistry}
    """

    def __init__(self, config):
        """PredictorRegistry creator.

        @param config:
            config is used to retrieve the PredictorRegistry settings and
            each Predictor settings from the config file. It also needs to
            be passed to the predictor instances to allow them to retrieve
            their settings from the config file too.
        @type config: L{drvr.Configuration}
        """
        self._contextMonitor = None
        self.config = config
        self.contextMonitor = None

    def contextMonitor():
        """The contextMonitor property, built from the local functions."""

        def fget(self):
            return self._contextMonitor

        def fset(self, value):
            if self._contextMonitor is not value:
                self._contextMonitor = value
                self[:] = []
                self.set_predictors()

        def fdel(self):
            del self._contextMonitor

        return locals()
    contextMonitor = property(**contextMonitor())

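# The property-from-locals idiom used above, in isolation (a minimal
# doctest-style sketch, not part of the original module):
#
#     >>> class Box(object):
#     ...     def value():
#     ...         def fget(self):
#     ...             return self._value
#     ...         def fset(self, v):
#     ...             self._value = v
#     ...         return locals()
#     ...     value = property(**value())
#     >>> box = Box()
#     >>> box.value = 3
#     >>> box.value
#     3
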
    def set_predictors(self):
        """Read the configuration file and create the needed predictors."""
        if self.contextMonitor:
            self[:] = []
            preds = self.config.getas(
                'PredictorRegistry', 'predictors', 'list')
            for predictor in preds:
                self.add_predictor(predictor)

    def add_predictor(self, predictorName):
        """Create and add a predictor to the list.

        Create a predictor instance according to the predictor name and add
        it to the list.

        @param predictorName:
            The name of the predictor. It is used to retrieve the predictor
            settings from the config. It must correspond to a section of
            the config, otherwise no predictor will be created and added.
        @type predictorName: str
        """
        predictorClass = self.config.getas(predictorName, 'class')
        if predictorClass == 'WeightNgramPredictor':
            predictor = WeightNgramPredictor(
                self.config, self.contextMonitor, predictorName)
        elif predictorClass == 'LastOccurPredictor':
            predictor = LastOccurPredictor(
                self.config, self.contextMonitor, predictorName)
        elif predictorClass == 'MemorizePredictor':
            predictor = MemorizePredictor(
                self.config, self.contextMonitor, predictorName)
        elif predictorClass == 'DictionaryPredictor':
            predictor = DictionaryPredictor(
                self.config, self.contextMonitor, predictorName)
        else:
            predictor = None
        if predictor:
            self.append(predictor)

    def close_databases(self):
        """Close every opened predictor database."""
        for predictor in self:
            predictor.close_database()

class Predictor(object):
    """Base class for predictors.

    G{classtree Predictor}
    """

    # Python 2 metaclass spelling; on Python 3 the equivalent is
    # class Predictor(object, metaclass=abc.ABCMeta).
    __metaclass__ = abc.ABCMeta

    def __init__(self, config, contextMonitor):
        """Predictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffers tokens.
        @type contextMonitor: L{cntxt.ContextMonitor}
        """
        self.contextMonitor = contextMonitor
        self.name = "Predictor doesn't set any name"
        self.config = config

    @abc.abstractmethod
    def predict(self, maxPartialPredictionSize, stopList):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def learn(self, text):
        raise NotImplementedError("Method must be implemented")

class WeightNgramPredictor(Predictor):  # observer.Observer
    """Compute predictions from the n-gram model in the database.

    G{classtree WeightNgramPredictor}
    """

    def __init__(self, config, contextMonitor, predictorName=None):
        """WeightNgramPredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffers tokens.
        @type contextMonitor: L{cntxt.ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.db = None
        self.dbFile = self.config.getas(self.name, 'DBFILENAME')
        self.deltas = self.config.getas(self.name, 'DELTAS', 'floatlist')
        self.learnMode = self.config.getas(self.name, 'learn')
        self.maxN = len(self.deltas)
        self.init_database_connector()

    def init_database_connector(self):
        """Initialize the database connector.

        Use the database file path, the n-gram maximum size and the learn
        mode to initialize and open the database.
        """
        if self.dbFile and self.maxN > 0:
            self.db = db.SqliteDatabaseConnector(self.dbFile, self.maxN)

    def predict(self, maxPartialPredictionSize, stopList=()):
        """Predict the next word according to the current context.

        Use the input buffers (thanks to contextMonitor) and the n-gram
        database to predict the most probable suggestions.
        A suggestion is a word which can:
            - Predict the end of the word, i.e. complete the actual partial
              word (the user has not finished inputting the word, we try to
              predict the end of the word).
            - Predict the next word (the user has typed a separator after a
              word, we try to predict the next word before he starts to
              type it).

        In order to compute the suggestions, this method:
            - Retrieves the last n tokens from the left input buffer, where
              n is the maximum n-gram size (max(n)) stored in the database.
            - Loops for each n-gram size from max(n) to 1:
                - Finds n-grams of the current n-gram size in the database
                  which match the last input tokens.
                - Adds each retrieved n-gram to the suggestion list if it
                  is not already in it and if we have not reached the
                  maximum number of suggestions yet.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestions list is immediately returned.
            DatabaseConnector.ngram_table_tp() returns the records in
            descending order according to their number of occurrences, so
            the most probable suggestions are added to the list first.
            This results in no suggestion quality loss, regardless of the
            desired number of suggestions.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions
            list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{prdct.Prediction}
        """
        tokens = [''] * self.maxN
        for i in range(self.maxN):
            tokens[self.maxN - 1 - i] = self.contextMonitor.left_token(i)
        prefixCompletionCandidates = []
        for k in reversed(range(self.maxN)):
            if len(prefixCompletionCandidates) >= maxPartialPredictionSize:
                break
            prefixNgram = tokens[(len(tokens) - k - 1):]
            partial = self.db.ngram_table_tp(
                prefixNgram,
                maxPartialPredictionSize - len(prefixCompletionCandidates))
            for p in partial:
                if len(prefixCompletionCandidates) >= \
                        maxPartialPredictionSize:
                    break
                candidate = p[-2]
                if candidate not in prefixCompletionCandidates:
                    if not candidate.lower() in stopList:
                        prefixCompletionCandidates.append(candidate)
        return self.weight(prefixCompletionCandidates, tokens)

    def weight(self, prefixCompletionCandidates, tokens):
        """Compute the suggestion probabilities and return the most
        probable ones.

        The probability of a suggestion is based on its relative frequency
        within the whole set of suggestions and the number of single tokens
        in the database.

        @param prefixCompletionCandidates:
            List of every suggestion returned by self.predict().
        @type prefixCompletionCandidates: list
        @param tokens:
            The last input tokens.
        @type tokens: list

        @return:
            List of every "good enough" suggestion.
        @rtype: L{prdct.Prediction}
        """
        prediction = Prediction()
        unigramCountsSum = self.db.sum_ngrams_occ(1)
        for candidate in prefixCompletionCandidates:
            tokens[self.maxN - 1] = candidate
            probability = 0
            for k in range(self.maxN):
                numerator = self.count(tokens, 0, k + 1)
                denominator = unigramCountsSum
                if numerator > 0:
                    denominator = self.count(tokens, -1, k)
                frequency = 0
                if denominator > 0:
                    frequency = float(numerator) / denominator
                probability += self.deltas[k] * frequency
            if probability > 0:
                prediction.add_suggestion(
                    Suggestion(candidate, probability))
        return prediction

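# A worked example of the linear interpolation above, with hypothetical
# deltas and per-order frequencies (unigram, bigram, trigram):
#
#     >>> deltas = [0.1, 0.3, 0.6]
#     >>> freqs = [0.01, 0.2, 0.5]
#     >>> round(sum(d * f for d, f in zip(deltas, freqs)), 3)
#     0.361
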
    def close_database(self):
        """Close the predictor's database."""
        self.db.close_database()

    def learn(self, change):
        """Learn what needs to be learnt by adding n-grams to the database.

        @param change:
            The tokens of the left input buffer which represent the last
            change.
        @type change: list
        """
        if self.learnMode is False:
            return
        ngramMap = self.make_ngram_map(change)
        ngramMap = self.prefix_ngrams_with_input(change, ngramMap)
        self.push_ngrams_in_db(ngramMap)

    def make_ngram_map(self, change):
        """Create a map associating n-grams (tuples of words) with their
        count.

        @param change:
            The tokens of the left input buffer which represent the last
            change.
        @type change: list
        """
        ngramMap = {}
        for curCard in range(1, self.maxN + 1):
            changeIdx = 0
            changeSize = len(change)
            ngramList = ()
            for i in range(curCard - 1):
                if changeIdx >= changeSize:
                    break
                ngramList = ngramList + (change[changeIdx],)
                changeIdx += 1
            while changeIdx < changeSize:
                ngramList = ngramList + (change[changeIdx],)
                changeIdx += 1
                try:
                    ngramMap[ngramList] = ngramMap[ngramList] + 1
                except KeyError:
                    ngramMap[ngramList] = 1
                ngramList = ngramList[1:]
        return ngramMap

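# A hand-derived sketch of the map produced above for
# change = ['the', 'white', 'table'] with maxN = 2 (illustrative values):
#
#     {('the',): 1, ('white',): 1, ('table',): 1,
#      ('the', 'white'): 1, ('white', 'table'): 1}
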
    def prefix_ngrams_with_input(self, change, ngramMap):
        """Use the left input buffer to expand the n-gram map.

        This method calls ContextMonitor.previous_tokens() to get the
        tokens from the left input buffer that are just before the change
        and adds them BEFORE the change n-grams generated by
        self.make_ngram_map().

        For instance, if the current left input buffer is::
            "phone is on the white table "

        And change is::
            ["table"]

        Then, the n-gram map generated by self.make_ngram_map() will be::
            {("table",): 1}

        The n-gram map contains a single n-gram of size 1. And so this
        method will add the tokens preceding the change in the left input
        buffer to form n-grams of size 2 and more (until it reaches
        self.maxN)::
            {("the", "white", "table"): 1, ("white", "table"): 1,
             ("table",): 1}

        @param change:
            The tokens of the left input buffer which represent the last
            change.
        @type change: list
        @param ngramMap:
            Dictionary associating n-grams with their number of
            occurrences, generated by self.make_ngram_map().
        @type ngramMap: dict

        @return:
            The extended n-grams dictionary.
        @rtype: dict
        """
        changeMatchInput = (
            change and
            change[-1] == self.contextMonitor.left_token(1) and
            self.contextMonitor.left_token(len(change)))
        if changeMatchInput:
            ngramList = tuple(change[:1])
            tkIdx = 1
            while len(ngramList) < self.maxN:
                extraToken = self.contextMonitor.previous_tokens(
                    tkIdx, change)
                if not extraToken:
                    break
                ngramList = (extraToken,) + ngramList
                try:
                    ngramMap[ngramList] = ngramMap[ngramList] + 1
                except KeyError:
                    ngramMap[ngramList] = 1
                tkIdx += 1
        return ngramMap

    def push_ngrams_in_db(self, ngramMap):
        """Update the database with the n-grams contained in the n-gram map.

        Each n-gram of the n-gram map is pushed into the database with its
        number of occurrences (count).
        If the n-gram is already in the database then its count (number of
        occurrences) is updated. If the n-gram is not in the database then
        it is simply inserted into it.

        @param ngramMap:
            Dictionary associating n-grams with their number of
            occurrences, generated by self.make_ngram_map() and modified by
            self.prefix_ngrams_with_input().
        @type ngramMap: dict
        """
        for ngram in ngramMap:
            count = self.db.ngram_count(ngram)
            if count > 0:
                self.db.update_ngram(ngram, count + ngramMap[ngram])
            else:
                self.db.insert_ngram(list(ngram), ngramMap[ngram])
        self.db.commit()

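# A minimal sqlite3 sketch of the update-or-insert pattern above. The
# table and column names are hypothetical; the real
# db.SqliteDatabaseConnector schema may differ:
#
#     import sqlite3
#
#     conn = sqlite3.connect(':memory:')
#     conn.execute(
#         'CREATE TABLE ngram_1 (token TEXT PRIMARY KEY, count INT)')
#
#     def push(token, n):
#         row = conn.execute('SELECT count FROM ngram_1 WHERE token = ?',
#                            (token,)).fetchone()
#         if row:
#             conn.execute('UPDATE ngram_1 SET count = ? WHERE token = ?',
#                          (row[0] + n, token))
#         else:
#             conn.execute('INSERT INTO ngram_1 VALUES (?, ?)', (token, n))
#         conn.commit()
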
    def count(self, tokens, offset, n):
        """Make an n-gram then retrieve and return its 'count' entry in the
        db.

        @param tokens:
            The tokens used to make the n-gram.
        @type tokens: list
        @param offset:
            Offset of the first token in the tokens.
        @type offset: int
        @param n:
            Size of the n-gram.
        @type n: int
        """
        if n > 0:
            ngram = tokens[len(tokens) - n + offset:len(tokens) + offset]
            result = self.db.ngram_count(ngram)
        else:
            result = self.db.sum_ngrams_occ(1)
        return result

class LastOccurPredictor(Predictor):
    """Compute predictions based on their last occurrences and frequencies.

    G{classtree LastOccurPredictor}
    """

    def __init__(self, config, contextMonitor, predictorName=None):
        """LastOccurPredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffers tokens.
        @type contextMonitor: L{cntxt.ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.lambdav = self.config.getas(self.name, 'lambda', 'int')
        self.n0 = self.config.getas(self.name, 'n_0', 'int')
        self.cutoffThreshold = self.config.getas(
            self.name, 'cutoff_threshold', 'int')

    def predict(self, maxPartialPredictionSize, stopList=()):
        """Compute the predictions using a simple exponential decay method.

        Recent tokens of the left input buffer which complete the current
        prefix are suggested with a probability that decays exponentially
        with their distance from the end of the buffer.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestions list is immediately returned.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions
            list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{prdct.Prediction}
        """
        result = Prediction()
        prefix = self.contextMonitor.prefix()
        if prefix:
            index = 1
            token = self.contextMonitor.left_token(index)
            prob = 0
            while (token and
                   len(result) < maxPartialPredictionSize and
                   index <= self.cutoffThreshold):
                if token.startswith(prefix):
                    if not token.lower() in stopList:
                        prob = self.n0 * exp(- (self.lambdav * (index - 1)))
                        result.add_suggestion(Suggestion(token, prob))
                index += 1
                token = self.contextMonitor.left_token(index)
        return result

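# The decay in numbers, with hypothetical settings n_0 = 1 and lambda = 1:
# the first, second and third most recent matching tokens would get these
# probabilities:
#
#     >>> from math import exp
#     >>> [round(1 * exp(-(1 * (i - 1))), 3) for i in (1, 2, 3)]
#     [1.0, 0.368, 0.135]
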
    def learn(self, text):
        """This predictor has no ability to learn."""
        pass

class MemorizePredictor(Predictor):
    """Predict words based on memorized (learnt) input token patterns.

    This predictor is capable of token memorization. It memorizes the input
    tokens and tries to predict the suggestions using memorized tokens and
    n-grams (groups of consecutive tokens).

    G{classtree MemorizePredictor}
    """

    def __init__(self, config, contextMonitor, predictorName=None):
        """MemorizePredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffers tokens.
        @type contextMonitor: L{cntxt.ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.memory = self.config.getas(self.name, 'memory')
        self.trigger = self.config.getas(self.name, 'trigger', 'int')
        self.learnMode = self.config.getas(self.name, 'learn')

    def predict(self, maxPartialPredictionSize, stopList):
        """Predict words based on memorized input tokens.

        Scan the memory file with a rolling window of self.trigger tokens:
        whenever the window matches the last self.trigger input tokens, the
        token that follows the window in the file is suggested.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestions list is immediately returned.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions
            list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{prdct.Prediction}
        """
        result = Prediction()
        memTrigger = []
        try:
            memFile = open(self.memory, 'r')
        except IOError:
            lg.error('Cannot open file ' + self.memory)
            return result
        if self.init_mem_trigg(memTrigger):
            rollingWindow = self.init_rolling_window(memFile)
            if rollingWindow is not None:
                for line in memFile:
                    token = line.strip('\n')
                    if (memTrigger == rollingWindow and
                            len(result) < maxPartialPredictionSize):
                        if not token.lower() in stopList:
                            result.add_suggestion(Suggestion(token, 1.))
                    rollingWindow = self.update_rolling_window(
                        rollingWindow, token)
        memFile.close()
        return result

    def learn(self, change):
        """Learn what needs to be learnt by appending tokens to the memory
        file.

        @param change:
            The tokens of the left input buffer which represent the last
            change.
        @type change: list
        """
        if self.learnMode is False:
            return
        try:
            memFile = open(self.memory, 'a')
        except IOError:
            lg.error('Cannot open file ' + self.memory)
            return
        for tok in change:
            memFile.write(tok + '\n')
        memFile.close()

    def init_mem_trigg(self, memTrigger):
        """Fill memTrigger with the last self.trigger input tokens.

        @return:
            True if every token of the trigger window is non-empty, False
            otherwise.
        @rtype: bool
        """
        result = False
        for i in range(self.trigger, 0, -1):
            memTrigger.append(self.contextMonitor.left_token(i))
        if not '' in memTrigger:
            result = True
        return result

    def init_rolling_window(self, memFile):
        """Read the first self.trigger tokens of the memory file.

        @return:
            The list of the first self.trigger tokens, or None if the file
            holds fewer tokens than that.
        """
        rollingWindow = []
        for _ in range(self.trigger):
            line = memFile.readline()
            if not line:
                return None
            rollingWindow.append(line.strip('\n'))
        return rollingWindow

    def update_rolling_window(self, rollingWindow, token):
        """Slide the window one token forward and return the new window."""
        return rollingWindow[1:] + [token]

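# The rolling-window scan in miniature (illustrative values, with
# trigger = 2 and memTrigger = ['the', 'white']):
#
#     >>> tokens = ['on', 'the', 'white', 'table']
#     >>> window, hits = tokens[:2], []
#     >>> for token in tokens[2:]:
#     ...     if window == ['the', 'white']:
#     ...         hits.append(token)
#     ...     window = window[1:] + [token]
#     >>> hits
#     ['table']
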
class DictionaryPredictor(Predictor):
    """Very simple word predictor using a dictionary.

    The dictionary is a file containing one word per line. This predictor
    does not use n-grams and is therefore less effective than the
    predictors using n-grams because it does not consider context.

    G{classtree DictionaryPredictor}
    """

    def __init__(self, config, contextMonitor, predictorName):
        """DictionaryPredictor creator.

        @note: The string.lower() and string.strip() methods have a great
               impact on performance (the profile module shows that they
               require almost 1 second of processing time when calculating
               suggestions for 10 contexts). So this constructor no longer
               uses the dictionary file directly. A database is created
               instead.
               Every word of the dictionary is lowercased and stripped,
               then added to the database.
               Doing so, the performance of the predictor is way better.
               Profiling a script querying suggestions for 10 successive
               contexts shows the improvements:
                   - lower()ing and strip()ping each word of the file on
                     each predict() call::
                         ncalls tottime percall cumtime percall filename:lineno
                         690048 0.468   0.000   0.468   0.000   :0(lower)
                   - Creating an improved list upon initialization and
                     using it on each predict() call (previous optimization
                     method)::
                         ncalls tottime percall cumtime percall filename:lineno
                         100046 0.059   0.000   0.059   0.000   :0(lower)
                     It is approx. 800% faster. But this profiling mixes
                     initialization and later computation. It means that
                     most of the time of the previous profiling line is
                     spent initializing the list; the gain on each
                     predict() call is even larger.
                   - Creating a database and querying it on each predict()
                     call::
                         ncalls tottime percall cumtime percall filename:lineno
                         100046 0.059   0.000   0.059   0.000   :0(lower)
                     It is not faster than the previous method but the
                     database must only be created once. Once it is
                     created, the initialization time is (near) null and
                     the querying time on each predict() call is even
                     faster.

        @change:
            - 08/06/15: The method now creates an ordered, optimized list
              containing the dictionary words upon initialization in order
              to increase the speed of the predictor.
            - 13/06/15: The method now uses a database containing the
              dictionary words. See: L{minr.DictMiner}

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffers tokens.
        @type contextMonitor: L{cntxt.ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.dbFile = self.config.getas(self.name, 'dbfilename')
        self.db = None
        self.prob = self.config.getas(self.name, 'probability', 'float')
        self.init_database_connector()

    def init_database_connector(self):
        """Initialize the database connector.

        Use the database file path to initialize and open the database.
        """
        if self.dbFile:
            self.db = db.SqliteDatabaseConnector(self.dbFile)

    def get_dict_range(self, prefix):
        """Select the dictionary range where words start with the given
        prefix.

        A suggested word must complete the given token, which means that
        suggested words all start with this token, here called the prefix.
        This method creates a list containing the suggested words for the
        given prefix, i.e. every word of the dictionary list starting with
        the prefix.
        This is easy because the dictionary list is ordered. For instance:

        If the prefix is::
            'hell'

        And the dictionary list is::
            ['bird', 'blue', 'given', 'hair', 'hellish', 'hello', 'red',
             'zip']

        We first remove the words of the list one by one until we reach a
        word which actually starts with the prefix 'hell', then we have::
            ['hellish', 'hello', 'red', 'zip']

        Finally we scan every word of the remaining list and when we reach
        a word which does not start with the given prefix then we know
        that none of the remaining words start with the prefix either, as
        the list is ordered, so we have::
            ['hellish', 'hello']

        @warning: This method has become useless since the words are now
                  stored in a database.

        @param prefix:
            The prefix from which the suggested words range is computed.
        @type prefix: str
        """
        rangeWords = []
        for word in self.dictWords:
            if word.startswith(prefix):
                rangeWords = self.dictWords[self.dictWords.index(word):]
                break
        for word in rangeWords:
            if not word.startswith(prefix):
                rangeWords = rangeWords[:rangeWords.index(word)]
                break
        return rangeWords

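# Since the word list is ordered, the same range can be found with binary
# search. A doctest-style sketch, assuming plain-ASCII words (illustrative,
# not part of the original module):
#
#     >>> import bisect
#     >>> words = ['bird', 'blue', 'given', 'hair', 'hellish', 'hello',
#     ...          'red', 'zip']
#     >>> lo = bisect.bisect_left(words, 'hell')
#     >>> hi = bisect.bisect_right(words, 'hell' + '\xff')
#     >>> words[lo:hi]
#     ['hellish', 'hello']
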
    def predict(self, maxPartialPredictionSize, stopList):
        """Complete the actual word or predict the next word using the
        dictionary.

        Use the input buffers (thanks to contextMonitor) and the word
        dictionary to predict the most probable suggestions.
        A suggestion is a word which can:
            - Predict the end of the word, i.e. complete the actual partial
              word (the user has not finished inputting the word, we try to
              predict the end of the word).
            - Predict the next word (the user has typed a separator after a
              word, we try to predict the next word before he starts to
              type it).

        In order to compute the suggestions, this method:
            - Retrieves the last token from the left input buffer.
            - Loops for each word in the dictionary:
                - If the word starts with the last retrieved token: add it
                  to the suggestion list if we have not reached the maximum
                  number of suggestions yet.
                  It is not necessary to check if the word is already in
                  the suggestion list because in a dictionary a word should
                  only appear once. In any case, the combiner will merge
                  the duplicate suggestions.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestions list is immediately returned.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions
            list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{prdct.Prediction}
        """
        result = Prediction()
        prefix = self.contextMonitor.prefix().lower()
        count = 0
        candidates = self.db.ngram_table_tp(
            [prefix], maxPartialPredictionSize)
        for candidate in candidates:
            if count >= maxPartialPredictionSize:
                break
            candidate = candidate[-2]
            if not candidate in stopList:
                result.add_suggestion(Suggestion(candidate, self.prob))
                count += 1
        return result

    def learn(self, text):
        """This predictor has no ability to learn."""
        pass