"""Classes for predictors and to handle suggestions and predictions."""

import logging
from sys import exit
from tipy.db import SqliteDatabaseConnector
from tipy.mrgr import ProbabilisticMerger
from abc import ABCMeta, abstractmethod
from tipy.stpl import *
from math import exp
from multiprocessing import Queue, Process

# Module-level logger used to report configuration and I/O errors.
lg = logging.getLogger(__name__)
16 """A suggestion consists of a string and a probility.
17
18 The string is in fact a token and the probability is the probability
19 associated to this token by the predictor which compute it. The
20 probability reflect how accurate the word is over prediction.
21
22 G{classtree Suggestion}
23 """
24
26 """Suggestion creator
27
28 A suggestion is a couple formed by a suggested word and its probability.
29
30 @note: the probabilities of each predictors should have the same
31 weight. Otherwise the suggestion selection will be truncated.
32
33 @param word:
34 The suggested word.
35 @type word: str
36 @param probability:
37 The suggested word probability. It is compute by the predictors.
38 @type probability: float
39 """
40 self.word = word
41 self.probability = probability
42
44 """Override the == operator in order to compare instances equality.
45
46 Two Suggestion instances are equal if their word and probability are
47 equal.
48
49 @param other:
50 The Suggestion instance to compare to this one (self).
51 @type other: L{Suggestion}
52
53 @return:
54 True if the two instances are equal, False otherwise.
55 @rtype:
56 bool
57 """
58 if self.word == other.word and self.probability == other.probability:
59 return True
60 return False
61
63 """Override the < operator in order to compare instances.
64
65 A Suggestion instance is less than another if its probability is less
66 than the probability of the other. If their probabilities are equal then
67 the Suggestion instance is less than the other if its word is
68 alphabetically 'before' the word of the other instance.
69
70 @param other:
71 The Suggestion instance to compare to this one (self).
72 @type other: L{Suggestion}
73
74 @return:
75 True if the this instance (self) is less than the other one, False
76 otherwise.
77 @rtype: bool
78 """
79 if self.probability < other.probability:
80 return True
81 if self.probability == other.probability:
82 return self.word < other.word
83 return False
84
86 """Override the > operator in order to compare instances.
87
88 A Suggestion instance is greater than another if its probability is
89 greater than the probability of the other. If their probabilities are
90 equal then the Suggestion instance is greater than the other if its word
91 is alphabetically 'after' the word of the other instance.
92
93 @param other:
94 The Suggestion instance to compare to this one (self).
95 @type other: L{Suggestion}
96
97 @return:
98 True if the this instance (self) is greater than the other one,
99 False otherwise.
100 @rtype: bool
101 """
102 if self.probability > other.probability:
103 return True
104 if self.probability == other.probability:
105 return self.word > other.word
106 return False
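
    # A minimal ordering sketch (illustrative only): suggestions compare
    # first on probability, then alphabetically on the word when the
    # probabilities tie.
    #
    #   >>> Suggestion('hello', 0.5) > Suggestion('helm', 0.2)
    #   True
    #   >>> Suggestion('abc', 0.3) < Suggestion('abd', 0.3)
    #   True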


class Prediction(list):
    """Class for predictions from predictors.

    A Prediction instance holds multiple Suggestion instances. It is a list
    of Suggestion instances, and the list is kept ordered according to the
    suggestion probabilities.
    Every predictor should return a Prediction instance.

    G{classtree Prediction}
    """

    def __init__(self):
        """Prediction creator."""
        pass

125 """Override the == operator in order to compare instances.
126
127 Two Prediction instances are equal if they contain the same items
128 (thus, have the same length).
129 """
130 if self is other:
131 return True
132 if len(self) != len(other):
133 return False
134 for i, s in enumerate(other):
135 if not s == self[i]:
136 return False
137 return True
138
140 """Add a suggestion in the Prediction list.
141
142 The suggestion is added at the correct index so that the Prediction
143 list remains ordered.
144
145 @note: Using insert() and a while loop seams a little faster than using
146 sorted(). Also, using insort from the bisect module seems to
147 produce similar benchmarks.
148 """
149 if len(self) == 0:
150 self.append(suggestion)
151 else:
152 i = 0
153 while i < len(self) and suggestion < self[i]:
154 i += 1
155 self.insert(i, suggestion)
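
    # A minimal usage sketch (illustrative only): the list stays sorted in
    # descending probability order as suggestions are inserted.
    #
    #   >>> p = Prediction()
    #   >>> p.add_suggestion(Suggestion('foo', 0.2))
    #   >>> p.add_suggestion(Suggestion('bar', 0.8))
    #   >>> [s.word for s in p]
    #   ['bar', 'foo']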


class PredictorActivator(object):
    """Query the predictors listed in the registry to get their suggestions.

    This class has access to a PredictorRegistry and asks the predictors
    listed in this PredictorRegistry to call their predict() method, stores
    the resulting Prediction instances, merges them into a single Prediction
    instance and returns it.

    G{classtree PredictorActivator}
    """

    def __init__(self, config, predictorRegistry):
        """PredictorActivator creator.

        @param config:
            The configuration dictionary is used in order to retrieve the
            PredictorActivator settings from the config file.
        @type config: L{drvr.Configuration}
        @param predictorRegistry:
            The class needs to access the PredictorRegistry to call the
            predictors' predict() method.
        @type predictorRegistry: L{PredictorRegistry}
        """
        self.config = config
        self.predictorRegistry = predictorRegistry
        self.predictionList = []
        self.maxPartialPredictionSize = self.config.getas(
            'PredictorActivator', 'max_partial_prediction_size', 'int') + 1
        self.mergingMethod = self.config.getas(
            'PredictorActivator', 'merging_method')
        self.stopListFile = self.config.getas(
            'PredictorActivator', 'stoplist')
        self.stopList = StopList(self.stopListFile)
        if self.mergingMethod.lower() == "probabilistic":
            self.merger = ProbabilisticMerger()
        else:
            lg.error('Unknown merging method')
            exit(1)

    def pred_worker(self, predictor, queue, factor):
        """Worker function for the predictor predict() methods.

        This method is used as the target of the predictor worker processes.
        It pushes the result of the predictor's
        L{prdct.Predictor.predict} method (a L{Prediction} instance) into a
        queue (which is used because it is process-safe).

        @param predictor:
            The Predictor based class instance.
        @type predictor: L{Predictor} based class.
        @param queue:
            A queue in which the result will be pushed.
        @type queue: multiprocessing.Queue
        @param factor:
            A factor used to increase the number of suggestions.
        @type factor: int
        """
        queue.put(predictor.predict(
            self.maxPartialPredictionSize * factor, self.stopList.words))

220 """Build a list of every predicted words.
221
222 Call the predict() method of every predictors in the registry then
223 merge their Prediction into a single Prediction instance.
224
225 @change:
226 - 16/06/15: The method now uses multi-processing. It concurrently
227 runs every predictors's predict() method which allow a
228 significant speed augmentation. The queue is used because it is
229 thread safe. The point is that when the threads args are passed
230 to the L{PredictorActivator.pred_worker()}, they are
231 packed up with pickle, shipped to the other process, where they
232 are unpacked used. A list wouldn't be passed but would be
233 cloned.
234
235 @note:
236 Using multi-processing allow significant speed boost. The next
237 benchmark have been maid runing 100 * 10 different contexts
238 predictions::
239
240 Total time without multi-processing: 86.785 s
241 Total time wit multi-processing: 76.513 s
242
243 @todo 0.0.9:
244 Demonize the processes, set a timeout value. When the time runs out
245 the unfinished workers return their results as is. This can alter
246 the prediction qality but totaly avoid any possible "slow
247 predictions".
248
249 @param factor:
250 A factor used to increase the number of suggestions.
251 @type factor: int
252
253 @return:
254 The merged Prediction instance containing every suggestions of
255 every Prediction instances sorted in descending order according to
256 their probabilities.
257 @rtype: L{Prediction}
258 """
259 self.predictionList[:] = []
260 jobs = []
261 queue = Queue()
262 for predictor in self.predictorRegistry:
263 p = Process(
264 target=self.pred_worker, args=(predictor, queue, factor,))
265 jobs.append(p)
266 p.start()
267 for job in jobs:
268 job.join()
269 for x in range(len(jobs)):
270 self.predictionList.append(queue.get())
271 return self.merger.merge(self.predictionList)
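
    # A minimal usage sketch (illustrative only; 'config' and 'registry'
    # are assumed to be built elsewhere):
    #
    #   >>> activator = PredictorActivator(config, registry)
    #   >>> prediction = activator.predict(2)
    #   >>> prediction[0].word  # the most probable suggestion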


class PredictorRegistry(list):
    """List every predictor instance that is to be used for word prediction.

    G{classtree PredictorRegistry}
    """

    def __init__(self, config):
        """PredictorRegistry creator.

        @param config:
            config is used to retrieve the PredictorRegistry settings and
            each Predictor's settings from the config file. It also needs to
            be passed to the predictor instances so that they can retrieve
            their own settings from the config file.
        @type config: L{drvr.Configuration}
        """
        self._contextMonitor = None
        self.config = config
        self.contextMonitor = None

    def contextMonitor():

        def fget(self):
            return self._contextMonitor

        def fset(self, value):
            if self._contextMonitor is not value:
                self._contextMonitor = value
                self[:] = []
                self.set_predictors()

        def fdel(self):
            del self._contextMonitor

        return locals()
    contextMonitor = property(**contextMonitor())
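
    # Assigning the property rebuilds the predictor list. A minimal usage
    # sketch (illustrative only; 'config' and 'monitor' are assumed):
    #
    #   >>> registry = PredictorRegistry(config)
    #   >>> registry.contextMonitor = monitor  # triggers set_predictors()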

    def set_predictors(self):
        """Read the configuration file and create the needed predictors."""
        if self.contextMonitor:
            self[:] = []
            preds = self.config.getas(
                'PredictorRegistry', 'predictors', 'list')
            for predictor in preds:
                self.add_predictor(predictor)

320 """Create and add a predictor to the list.
321
322 Create a predictor instance according to the predictor name and add
323 it to the list.
324
325 @param predictorName:
326 The name of the predictor. It is used to retrieve the predictor
327 settings from the config. It must correspond to a section of the
328 config, otherwise no predictor will be created and added.
329 @type predictorName: str
330 """
331 predictorClass = self.config.getas(predictorName, 'class')
332 if predictorClass == 'WeightNgramPredictor':
333 predictor = WeightNgramPredictor(
334 self.config, self.contextMonitor, predictorName)
335 elif predictorClass == 'LastOccurPredictor':
336 predictor = LastOccurPredictor(
337 self.config, self.contextMonitor, predictorName)
338 elif predictorClass == 'MemorizePredictor':
339 predictor = MemorizePredictor(
340 self.config, self.contextMonitor, predictorName)
341 elif predictorClass == 'DictionaryPredictor':
342 predictor = DictionaryPredictor(
343 self.config, self.contextMonitor, predictorName)
344 else:
345 predictor = None
346 if predictor:
347 self.append(predictor)

    def close_databases(self):
        """Close every opened predictor database."""
        for predictor in self:
            predictor.close_database()


class Predictor(metaclass=ABCMeta):
    """Base class for predictors.

    G{classtree Predictor}
    """

    def __init__(self, config, contextMonitor):
        """Predictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffer tokens.
        @type contextMonitor: L{ContextMonitor}
        """
        self.contextMonitor = contextMonitor
        self.name = "Predictor doesn't set any name"
        self.config = config

    @abstractmethod
    def predict(self, maxPartialPredictionSize, stopList):
        raise NotImplementedError("Method must be implemented")

    @abstractmethod
    def learn(self, change):
        raise NotImplementedError("Method must be implemented")

389 """Compute prediction from n-gram model in database.
390
391 G{classtree WeightNgramPredictor}
392 """
393
    def __init__(self, config, contextMonitor, predictorName=None):
        """WeightNgramPredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffer tokens.
        @type contextMonitor: L{ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.db = None
        self.dbFile = self.config.getas(self.name, 'database')
        self.deltas = self.config.getas(self.name, 'DELTAS', 'floatlist')
        self.learnMode = self.config.getas(self.name, 'learn', 'bool')
        self.maxN = len(self.deltas)
        self.init_database_connector()

419 """Initialize the database connector.
420
421 Using the database file path, the n-gram maximum size and the learn
422 mode to initialize and open the database.
423 """
424 if self.dbFile and self.maxN > 0:
425 self.db = SqliteDatabaseConnector(self.dbFile, self.maxN)
426
    def predict(self, maxPartialPredictionSize, stopList=[]):
        """Predict the next word according to the current context.

        Use the input buffers (thanks to contextMonitor) and the n-gram
        database to predict the most probable suggestions.
        A suggestion is a word which can:
            - Complete the current word, i.e. complete the actual partial
              word (the user has not finished typing the word, and we try
              to predict its end).
            - Predict the next word (the user has typed a separator after a
              word, and we try to predict the next word before he starts to
              type it).

        In order to compute the suggestions, this method:
            - Retrieves the last n tokens from the left input buffer, where
              n is the maximum n-gram size (max(n)) stored in the database.
            - Loops over each n-gram size from max(n) down to 1:
                - Finds the n-grams of the current size in the database
                  which match the last input tokens.
                - Adds each retrieved n-gram to the suggestion list if it is
                  not already in it and if the maximum number of suggestions
                  has not been reached yet.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestions list is immediately returned.
            DatabaseConnector.ngram_table_tp() returns the records in
            descending order according to their number of occurrences, so
            the most probable suggestions will be added to the list first.
            This results in no loss of suggestion quality, regardless of the
            desired number of suggestions.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{Prediction}
        """
        tokens = [''] * self.maxN
        for i in range(self.maxN):
            tokens[self.maxN - 1 - i] = self.contextMonitor.left_token(i)
        prefixCompletionCandidates = []
        for k in reversed(range(self.maxN)):
            if len(prefixCompletionCandidates) >= maxPartialPredictionSize:
                break
            prefixNgram = tokens[(len(tokens) - k - 1):]
            partial = self.db.ngram_table_tp(
                prefixNgram,
                maxPartialPredictionSize - len(prefixCompletionCandidates))
            for p in partial:
                if len(prefixCompletionCandidates) >= \
                        maxPartialPredictionSize:
                    break
                candidate = p[-2]
                if candidate not in prefixCompletionCandidates:
                    if candidate.lower() not in stopList:
                        prefixCompletionCandidates.append(candidate)
        return self.weight(prefixCompletionCandidates, tokens)
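
    # A minimal sketch of the context window (illustrative only), with
    # maxN = 3 and a left buffer ending in "the quick bro":
    #
    #   tokens = ['the', 'quick', 'bro']    # 'bro' is the partial word
    #   k = 2  ->  prefixNgram = ['the', 'quick', 'bro']
    #   k = 1  ->  prefixNgram = ['quick', 'bro']
    #   k = 0  ->  prefixNgram = ['bro']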

    def weight(self, prefixCompletionCandidates, tokens):
        """Compute the probability of each suggestion and return the most
        probable ones.

        The probability of a suggestion is based on its relative frequency
        toward the whole set of suggestions and the number of single tokens
        in the database.

        @param prefixCompletionCandidates:
            List of every suggestion returned by self.predict().
        @type prefixCompletionCandidates: list
        @param tokens:
            The last input tokens.
        @type tokens: list

        @return:
            List of every "good enough" suggestion.
        @rtype: L{Prediction}
        """
        prediction = Prediction()
        unigramCountsSum = self.db.sum_ngrams_occ(1)
        for j, candidate in enumerate(prefixCompletionCandidates):
            tokens[self.maxN - 1] = candidate
            probability = 0
            for k in range(self.maxN):
                numerator = self.count(tokens, 0, k + 1)
                denominator = unigramCountsSum
                if numerator > 0:
                    denominator = self.count(tokens, -1, k)
                frequency = 0
                if denominator > 0:
                    frequency = float(numerator) / denominator
                probability += self.deltas[k] * frequency
            if probability > 0:
                prediction.add_suggestion(
                    Suggestion(tokens[self.maxN - 1], probability))
        return prediction
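
    # The probability computed above is a weighted sum of n-gram relative
    # frequencies (a deleted-interpolation style estimate): for a candidate
    # word w and context h,
    #
    #   P(w | h) = sum_k deltas[k] * C(ngram of size k+1) / C(its context)
    #
    # where C() is the occurrence count stored in the database and the
    # unigram denominator is the total number of unigram occurrences.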

    def learn(self, change):
        """Learn what needs to be learnt by adding n-grams to the database.

        @param change:
            The part of the left input buffer which represents the last
            change.
        @type change: str
        """
        if self.learnMode is False:
            return
        ngramMap = self.make_ngram_map(change)
        ngramMap = self.prefix_ngrams_with_input(change, ngramMap)
        self.push_ngrams_in_db(ngramMap)

    def make_ngram_map(self, change):
        """Create a map associating n-grams (tuples of words) with their
        count.

        @param change:
            The part of the left input buffer which represents the last
            change.
        @type change: str

        @return:
            Dictionary mapping each n-gram (tuple) to its number of
            occurrences in the change.
        @rtype: dict
        """
        ngramMap = {}
        for curCard in range(1, self.maxN + 1):
            changeIdx = 0
            changeSize = len(change)
            ngramList = ()
            for i in range(curCard - 1):
                if changeIdx >= changeSize:
                    break
                ngramList = ngramList + (change[changeIdx],)
                changeIdx += 1
            while changeIdx < changeSize:
                ngramList = ngramList + (change[changeIdx],)
                changeIdx += 1
                try:
                    ngramMap[ngramList] = ngramMap[ngramList] + 1
                except KeyError:
                    ngramMap[ngramList] = 1
                ngramList = ngramList[1:]
        return ngramMap
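
    # A minimal sketch of the expected mapping (illustrative only), with
    # maxN = 2 and change = ['the', 'cat', 'sat']:
    #
    #   {('the',): 1, ('cat',): 1, ('sat',): 1,
    #    ('the', 'cat'): 1, ('cat', 'sat'): 1}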

    def push_ngrams_in_db(self, ngramMap):
        """Update the database with the n-grams contained in the n-gram map.

        Each n-gram of the n-gram map is pushed into the database with its
        number of occurrences (count).
        If the n-gram is already in the database then its count (number of
        occurrences) is updated. If the n-gram is not in the database then
        it is simply inserted.

        @param ngramMap:
            Dictionary associating n-grams with their number of occurrences,
            generated by L{self.make_ngram_map} and modified by
            L{self.prefix_ngrams_with_input}.
        @type ngramMap: dict
        """
        for ngram in ngramMap:
            count = self.db.ngram_count(ngram)
            if count > 0:
                self.db.update_ngram(ngram, count + ngramMap[ngram])
            else:
                self.db.insert_ngram(list(ngram), ngramMap[ngram])
        self.db.commit()

    def count(self, tokens, offset, n):
        """Make an n-gram, then retrieve and return its count entry in the
        database.

        @param tokens:
            The tokens used to make the n-gram.
        @type tokens: list
        @param offset:
            Offset of the first token in tokens.
        @type offset: int
        @param n:
            Size of the n-gram.
        @type n: int
        """
        if n > 0:
            ngram = tokens[len(tokens) - n + offset:len(tokens) + offset]
            result = self.db.ngram_count(ngram)
        else:
            result = self.db.sum_ngrams_occ(1)
        return result
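
    # A minimal sketch (illustrative only), with tokens = ['a', 'b', 'c']:
    #
    #   count(tokens, 0, 2)   # looks up the bigram ['b', 'c']
    #   count(tokens, -1, 2)  # looks up the bigram ['a', 'b']
    #   count(tokens, 0, 0)   # falls back to the unigram occurrence sum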


class LastOccurPredictor(Predictor):
    """Compute predictions based on last occurrences and frequencies.

    G{classtree LastOccurPredictor}
    """

    def __init__(self, config, contextMonitor, predictorName=None):
        """LastOccurPredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffer tokens.
        @type contextMonitor: L{ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.lambdav = self.config.getas(self.name, 'lambda', 'int')
        self.n0 = self.config.getas(self.name, 'n_0', 'int')
        self.cutoffThreshold = self.config.getas(
            self.name, 'cutoff_threshold', 'int')

    def predict(self, maxPartialPredictionSize, stopList=[]):
        """Compute the predictions using a simple exponential decay method.

        The more recently a token appeared in the left input buffer, the
        higher its suggestion probability.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestions list is immediately returned.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{Prediction}
        """
        result = Prediction()
        prefix = self.contextMonitor.prefix()
        if prefix:
            index = 1
            token = self.contextMonitor.left_token(index)
            prob = 0
            while (token and
                   len(result) < maxPartialPredictionSize and
                   index <= self.cutoffThreshold):
                if token.startswith(prefix):
                    if token.lower() not in stopList:
                        prob = self.n0 * exp(- (self.lambdav * (index - 1)))
                        result.add_suggestion(Suggestion(token, prob))
                index += 1
                token = self.contextMonitor.left_token(index)
        return result
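
    # A minimal sketch of the decay (illustrative only), with n_0 = 1 and
    # lambda = 1: the most recent matching token (index 1) gets probability
    # 1 * exp(0) = 1.0, the next one 1 * exp(-1) ~= 0.368, and so on until
    # the cutoff threshold is reached.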

    def learn(self, change):
        """This predictor has no ability to learn."""
        pass


class MemorizePredictor(Predictor):
    """Predict words based on memorized (learnt) input token patterns.

    This predictor is capable of token memorization. It memorizes the input
    tokens and tries to predict suggestions using the memorized tokens and
    n-grams (groups of consecutive tokens).

    G{classtree MemorizePredictor}
    """

    def __init__(self, config, contextMonitor, predictorName=None):
        """MemorizePredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffer tokens.
        @type contextMonitor: L{ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.memory = self.config.getas(self.name, 'memory')
        self.trigger = self.config.getas(self.name, 'trigger', 'int')
        self.learnMode = self.config.getas(self.name, 'learn', 'bool')

    def predict(self, maxPartialPredictionSize, stopList):
        """Predict words based on memorized input tokens.

        Walk the memory file with a rolling window of the last input
        tokens; whenever the window matches, the following token is
        suggested.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{Prediction}
        """
        result = Prediction()
        memTrigger = []
        try:
            memFile = open(self.memory, 'r')
        except FileNotFoundError:
            lg.error('Cannot open file ' + self.memory)
            return result
        if self.init_mem_trigg(memTrigger):
            rollingWindow, tokens = self.init_rolling_window(memFile)
            if rollingWindow is not None:
                for token in tokens:
                    if memTrigger == rollingWindow:
                        if (len(result) < maxPartialPredictionSize and
                                token.lower() not in stopList):
                            result.add_suggestion(Suggestion(token, 1.))
                    rollingWindow = self.update_rolling_window(
                        rollingWindow, token)
        memFile.close()
        return result

    def learn(self, change):
        """Learn what needs to be learnt by adding tokens to the memory
        file.

        @param change:
            The part of the left input buffer which represents the last
            change.
        @type change: str
        """
        if self.learnMode is False:
            return
        try:
            memFile = open(self.memory, 'a')
        except FileNotFoundError:
            lg.error('Cannot open file ' + self.memory)
            return
        for tok in change:
            memFile.write(tok + '\n')
        memFile.close()

    def init_mem_trigg(self, memTrigger):
        """Fill memTrigger with the last input tokens; True if none is
        empty."""
        result = False
        for i in range(self.trigger, 0, -1):
            memTrigger.append(self.contextMonitor.left_token(i))
        if '' not in memTrigger:
            result = True
        return result

    def init_rolling_window(self, memFile):
        """Read the memory file and split it into the initial window and
        the remaining tokens.

        Return (None, []) if the file holds fewer valid tokens than the
        trigger size.
        """
        tokens = [x.strip('\n') for x in memFile.readlines()]
        if len(tokens) < self.trigger or '' in tokens[:self.trigger]:
            return None, []
        return tokens[:self.trigger], tokens[self.trigger:]

    def update_rolling_window(self, rollingWindow, token):
        """Slide the rolling window one token forward and return it."""
        rollingWindow = rollingWindow[1:]
        rollingWindow.append(token)
        return rollingWindow
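
    # A minimal sketch (illustrative only), with trigger = 2: if the last
    # two input tokens are ['I', 'am'] and the memory file contains the
    # tokens I am happy I am tired (one per line), predict() walks the file
    # with a 2-token window and suggests 'happy' and 'tired'.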


class DictionaryPredictor(Predictor):
    """Very simple word predictor using a dictionary.

    The dictionary is a file containing one word per line. This predictor
    does not use n-grams and is therefore less effective than the n-gram
    based predictors because it does not consider context.

    G{classtree DictionaryPredictor}
    """

    def __init__(self, config, contextMonitor, predictorName):
        """DictionaryPredictor creator.

        @note: The string.lower() and string.strip() methods have a great
            impact on performance (the profile module shows that they
            require almost 1 second of processing time when calculating
            suggestions for 10 contexts). So this constructor no longer
            uses the dictionary file directly. A database is created
            instead.
            Every word of the dictionary is lowered and stripped, then
            added to the database.
            Doing so, the performance of the predictor is far better.
            Profiling a script querying suggestions for 10 successive
            contexts shows the improvement:
                - lower()ing and strip()ping each word of the file on each
                  predict() call::
                      ncalls tottime percall cumtime percall filename:lineno
                      690048 0.468   0.000   0.468   0.000   :0(lower)
                - Creating an improved list upon initialization and using
                  it on each predict() call (previous optimization
                  method)::
                      ncalls tottime percall cumtime percall filename:lineno
                      100046 0.059   0.000   0.059   0.000   :0(lower)
                  It is approx. 800% faster. But this profiling mixes
                  initialization and later computation. It means that most
                  of the time of the previous profiling line is spent
                  initializing the list; the computations on each predict()
                  call are even cheaper.
                - Creating a database and querying it on each predict()
                  call::
                      ncalls tottime percall cumtime percall filename:lineno
                      100046 0.059   0.000   0.059   0.000   :0(lower)
                  It is not faster than the previous method, but the
                  database must only be created once. Once it is created,
                  the initialization time is (near) null and the querying
                  time on each predict() call is even faster.

        @change:
            - 08/06/15: The method now creates an ordered, optimized list
              containing the dictionary words upon initialization in order
              to increase the speed of the predictor.
            - 13/06/15: The method now uses a database containing the
              dictionary words. See: L{minr.DictMiner}

        @param config:
            The config is used to retrieve the predictor settings from the
            config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor to
            get the input buffer tokens.
        @type contextMonitor: L{ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.dbFile = self.config.getas(self.name, 'database')
        self.db = None
        self.prob = self.config.getas(self.name, 'probability', 'float')
        self.init_database_connector()

922 """Initialize the database connector.
923
924 Using the database file path, the n-gram maximum size and the learn
925 mode to initialize and open the database.
926 """
927 if self.dbFile:
928 self.db = SqliteDatabaseConnector(self.dbFile)
929
931 """Select the dictionary range where words starts with the given prefix.
932
933 A suggested word must complete the given token, it means that suggested
934 words all start with this token, here called the prefix.
935 This method create a list containing the suggested words for the
936 given prefix, i.e. every words of the dictionary list starting with
937 the prefix.
938 It is easy as the dictionary list is ordered. For instance:
939
940 If the prefix is::
941 'hell'
942
943 And the dictionary list is::
944 ['bird', 'blue', 'given', 'hair', 'hellish', 'hello', 'red', 'zip']
945
946 We first remove every words of the list one by one until we reach a word
947 which actualy starts with the prefix 'hell', then we have::
948 ['hellish', 'hello', 'red', 'zip']
949
950 Finaly we scan every words of the remaining list and when we reach a
951 word which does not starts with the given prefix then we know that every
952 remaining words won't start with the prefix neither as the list is
953 ordered, so we have::
954 ['hellish', 'hello']
955
956 @deprecated: This method has become useless since the words are now
957 stored in a database.
958
959 @param prefix:
960 The prefix from which suggested words range is computed.
961 @type prefix: str
962 """
963 rangeWords = []
964 for word in self.dictWords:
965 if word.startswith(prefix):
966 rangeWords = self.dictWords[self.dictWords.index(word):]
967 break
968 for word in rangeWords:
969 if not word.startswith(prefix):
970 rangeWords = rangeWords[:rangeWords.index(word)]
971 break
972 return rangeWords

    def predict(self, maxPartialPredictionSize, stopList):
        """Complete the current word or predict the next word using the
        dictionary.

        Use the input buffers (thanks to contextMonitor) and the word
        dictionary to predict the most probable suggestions.
        A suggestion is a word which can:
            - Complete the current word, i.e. complete the actual partial
              word (the user has not finished typing the word, and we try
              to predict its end).
            - Predict the next word (the user has typed a separator after a
              word, and we try to predict the next word before he starts to
              type it).

        In order to compute the suggestions, this method:
            - Retrieves the last token from the left input buffer.
            - Loops over the words in the dictionary:
                - If the word starts with the last retrieved token: add it
                  to the suggestion list if we have not reached the maximum
                  number of suggestions yet.
                  It is not necessary to check if the word is already in
                  the suggestion list because in a dictionary a word should
                  only appear once. In any case, the merger will merge the
                  duplicate suggestions.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestions list is immediately returned.
            DatabaseConnector.ngram_table_tp() returns the records in
            descending order according to their number of occurrences, so
            the most probable suggestions will be added to the list first.
            This results in no loss of suggestion quality, regardless of the
            desired number of suggestions.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestions list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{Prediction}
        """
        result = Prediction()
        prefix = self.contextMonitor.prefix().lower()
        count = 0
        candidates = self.db.ngram_table_tp(
            [prefix], maxPartialPredictionSize)
        for candidate in candidates:
            if count >= maxPartialPredictionSize:
                break
            candidate = candidate[-2]
            if candidate not in stopList:
                result.add_suggestion(Suggestion(candidate, self.prob))
                count += 1
        return result
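
    # A minimal sketch (illustrative only): with a dictionary database
    # containing 'hellish' and 'hello' and a left buffer ending in 'hell',
    # predict() returns both words, each carrying the configured fixed
    # probability self.prob.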

    def learn(self, change):
        """This predictor has no ability to learn."""
        pass