"""Classes for predictors and for handling suggestions and predictions."""

from __future__ import absolute_import, unicode_literals

import abc
import multiprocessing
import os
import sys
from math import exp
try:
    import configparser
except ImportError:
    import ConfigParser as configparser

import cmbn
import db
import stpl


MIN_PROBABILITY = 0.0
MAX_PROBABILITY = 1.0
25 """A suggestion consists of a string and a probility.
26
27 The string is in fact a token and the probability is the probability
28 associated to this token by the predictor which compute it. The
29 probability reflect how accurate the word is over prediction.
30
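    A minimal doctest-style sketch of the comparison semantics defined
    below::

        >>> Suggestion('hello', 0.8) == Suggestion('hello', 0.8)
        True
        >>> Suggestion('hello', 0.8) < Suggestion('help', 0.8)
        True
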
    G{classtree Suggestion}
    """

    def __init__(self, word, probability):
        """Suggestion creator.

        A suggestion is a couple formed by a suggested word and its
        probability.

        @note: the probabilities computed by the different predictors
            should have the same weight; otherwise the suggestion
            selection will be skewed.

        @param word:
            The suggested word.
        @type word: str
        @param probability:
            The suggested word probability. It is computed by the
            predictors.
        @type probability: float
        """
        self.word = word
        self.probability = probability

53 """Override the == operator in order to compare instances equality.
54
55 Two Suggestion instances are equal if their word and probability are
56 equal.
57
58 @param other:
59 The Suggestion instance to compare to this one (self).
60 @type other: L{prdct.Suggestion}
61
62 @return:
63 True if the two instances are equal, False otherwise.
64 @rtype:
65 bool
66 """
67 if self.word == other.word and self.probability == other.probability:
68 return True
69 return False
70
72 """Override the < operator in order to compare instances.
73
74 A Suggestion instance is less than another if its probability is less
75 than the probability of the other. If their probabilities are equal then
76 the Suggestion instance is less than the other if its word is
77 alphabetically 'before' the word of the other instance.
78
79 @param other:
80 The Suggestion instance to compare to this one (self).
81 @type other: L{prdct.Suggestion}
82
83 @return:
84 True if the this instance (self) is less than the other one, False
85 otherwise.
86 @rtype: bool
87 """
88 if self.probability < other.probability:
89 return True
90 if self.probability == other.probability:
91 return self.word < other.word
92 return False
93
95 """Override the > operator in order to compare instances.
96
97 A Suggestion instance is greater than another if its probability is
98 greater than the probability of the other. If their probabilities are
99 equal then the Suggestion instance is greater than the other if its word
100 is alphabetically 'after' the word of the other instance.
101
102 @param other:
103 The Suggestion instance to compare to this one (self).
104 @type other: L{prdct.Suggestion}
105
106 @return:
107 True if the this instance (self) is greater than the other one,
108 False otherwise.
109 @rtype: bool
110 """
111 if self.probability > other.probability:
112 return True
113 if self.probability == other.probability:
114 return self.word > other.word
115 return False
116
119 """Class for predictions from predictors.
120
121 A Prediction instance hold multiple Suggestion instances. It is a list of
122 Suggestion instances and the list is kept ordered according to the
123 suggestions probabilities.
124 Every predictors should return a Preidiction instance.
125
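    The ordering invariant, as a doctest-style sketch::

        >>> p = Prediction()
        >>> p.add_suggestion(Suggestion('foo', 0.2))
        >>> p.add_suggestion(Suggestion('bar', 0.9))
        >>> [s.word for s in p]
        ['bar', 'foo']
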
    G{classtree Prediction}
    """

    def __init__(self):
        """Prediction creator."""
        pass

    def __eq__(self, other):
        """Override the == operator in order to compare instances.

        Two Prediction instances are equal if they contain the same items
        (and thus have the same length).
        """
        if self is other:
            return True
        if len(self) != len(other):
            return False
        for i, s in enumerate(other):
            if not s == self[i]:
                return False
        return True

149 """Add a suggestion in the Prediction list.
150
151 The suggestion is added at the correct index so that the Prediction
152 list remains ordered.
153
154 @note: Using insert() and a while loop seams a little faster than using
155 sorted(). Also, using insort from the bisect module seems to
156 produce similar benchmarks.
157 """
158 if len(self) == 0:
159 self.append(suggestion)
160 else:
161 i = 0
162 while i < len(self) and suggestion < self[i]:
163 i += 1
164 self.insert(i, suggestion)
165

class PredictorActivator(object):
    """Query the predictors listed in the registry to get their suggestions.

    This class has access to a PredictorRegistry and asks the predictors
    listed in this PredictorRegistry to call their predict() method,
    stores the resulting Prediction instances, combines them into a
    single Prediction instance and returns it.

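    A minimal usage sketch (assuming a loaded drvr.Configuration and a
    ready PredictorRegistry; both names come from this project)::

        activator = PredictorActivator(config, registry)
        prediction = activator.predict(1)
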
    G{classtree PredictorActivator}
    """

    def __init__(self, config, predictorRegistry):
        """PredictorActivator creator.

        @param config:
            The configuration dictionary is used to retrieve the
            PredictorActivator settings from the config file.
        @type config: L{drvr.Configuration}
        @param predictorRegistry:
            The class needs access to the PredictorRegistry in order to
            call the predictors' predict() methods.
        @type predictorRegistry: L{prdct.PredictorRegistry}
        """
        self.config = config
        self.predictorRegistry = predictorRegistry
        self.predictionList = []
        self.maxPartialPredictionSize = self.config.getas(
            'PredictorActivator', 'max_partial_prediction_size', 'int') + 1
        self.combinationPolicy = self.config.getas(
            'PredictorActivator', 'combination_policy')
        self.stopListFile = self.config.getas(
            'PredictorActivator', 'stoplist')
        self.stopList = stpl.StopList(self.stopListFile)
        if self.combinationPolicy.lower() == "probabilistic":
            self.combiner = cmbn.ProbabilisticCombiner()
        else:
            lg_error('Unknown combination policy')
            sys.exit(1)

    def pred_worker(self, predictor, queue, factor):
        """Worker function wrapping the predictors' predict() methods.

        This method is used as the predictor workers' target. It pushes
        the result of the predictor's L{prdct.Predictor.predict} method
        (a L{prdct.Prediction} instance) onto a queue (which is used
        because it is process-safe).

        @param predictor:
            The Predictor based class instance.
        @type predictor: L{prdct.Predictor} based class.
        @param queue:
            A queue in which the result will be pushed.
        @type queue: multiprocessing.Queue
        @param factor:
            A factor used to increase the number of suggestions.
        @type factor: int
        """
        queue.put(predictor.predict(
            self.maxPartialPredictionSize * factor, self.stopList.words))

    def predict(self, factor):
        """Build a list of every predicted word.

        Call the predict() method of every predictor in the registry,
        then merge their Prediction instances into a single Prediction
        instance.

        @change:
            - 16/06/15: The method now uses multi-processing. It
              concurrently runs every predictor's predict() method, which
              allows a significant speedup. The queue is used because it
              is process-safe. When the worker args are passed to
              L{prdct.PredictorActivator.pred_worker()}, they are packed
              up with pickle, shipped to the other process, then unpacked
              and used. A plain list would not be shared between the
              processes but cloned.

        @note:
            Using multi-processing allows a significant speed boost. The
            following benchmark was made by running 100 * 10 different
            context predictions::

                Total time without multi-processing: 86.785 s
                Total time with multi-processing:    76.513 s

        @todo 0.0.2:
            Daemonize the processes and set a timeout value. When the
            time runs out, the unfinished workers return their results as
            is. This can degrade the prediction quality but totally
            avoids any possible "slow prediction".
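
            A possible shape for that timeout (an illustrative sketch,
            not implemented here)::

                for job in jobs:
                    job.daemon = True
                    job.start()
                for job in jobs:
                    job.join(timeout=0.1)  # stop waiting after 100 ms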

        @param factor:
            A factor used to increase the number of suggestions.
        @type factor: int

        @return:
            The combined Prediction instance containing every suggestion
            of every Prediction instance, sorted in descending order
            according to their probabilities.
        @rtype: L{prdct.Prediction}
        """
        self.predictionList[:] = []
        jobs = []
        queue = multiprocessing.Queue()
        for predictor in self.predictorRegistry:
            p = multiprocessing.Process(
                target=self.pred_worker, args=(predictor, queue, factor,))
            jobs.append(p)
            p.start()
        # Drain the queue before joining: a worker blocks on its queue
        # write until its result is consumed, so joining first could
        # deadlock.
        for x in range(len(jobs)):
            self.predictionList.append(queue.get())
        for job in jobs:
            job.join()
        return self.combiner.combine(self.predictionList)


class PredictorRegistry(list):
    """List of every predictor instance that is to be used for word
    prediction.

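    Assigning a context monitor (re)builds the predictor list. A minimal
    usage sketch (assuming a loaded drvr.Configuration and a
    cntxt.ContextMonitor instance)::

        registry = PredictorRegistry(config)
        registry.contextMonitor = monitor  # triggers set_predictors()
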
    G{classtree PredictorRegistry}
    """

    def __init__(self, config):
        """PredictorRegistry creator.

        @param config:
            config is used to retrieve the PredictorRegistry settings and
            each Predictor's settings from the config file. It also needs
            to be passed to the predictor instances to allow them to
            retrieve their settings from the config file too.
        @type config: L{drvr.Configuration}
        """
        self._contextMonitor = None
        self.config = config
        self.contextMonitor = None

    def contextMonitor():

        def fget(self):
            return self._contextMonitor

        def fset(self, value):
            if self._contextMonitor is not value:
                self._contextMonitor = value
                self[:] = []
                self.set_predictors()

        def fdel(self):
            del self._contextMonitor

        return locals()
    contextMonitor = property(**contextMonitor())

    def set_predictors(self):
        """Read the configuration file and create the needed predictors."""
        if self.contextMonitor:
            self[:] = []
            preds = self.config.getas(
                'PredictorRegistry', 'predictors', 'list')
            for predictor in preds:
                self.add_predictor(predictor)

330 """Create and add a predictor to the list.
331
332 Create a predictor instance according to the predictor name and add
333 it to the list.
334
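        An illustrative config section (section name and values are
        hypothetical; the option names follow the getas() calls used by
        WeightNgramPredictor)::

            [my_ngram_predictor]
            class = WeightNgramPredictor
            dbfilename = /path/to/ngrams.db
            deltas = 0.01 0.1 0.89
            learn = False
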
        @param predictorName:
            The name of the predictor. It is used to retrieve the
            predictor settings from the config. It must correspond to a
            section of the config, otherwise no predictor will be created
            and added.
        @type predictorName: str
        """
        predictorClass = self.config.getas(predictorName, 'class')
        if predictorClass == 'WeightNgramPredictor':
            predictor = WeightNgramPredictor(
                self.config, self.contextMonitor, predictorName)
        elif predictorClass == 'LastOccurPredictor':
            predictor = LastOccurPredictor(
                self.config, self.contextMonitor, predictorName)
        elif predictorClass == 'MemorizePredictor':
            predictor = MemorizePredictor(
                self.config, self.contextMonitor, predictorName)
        elif predictorClass == 'DictionaryPredictor':
            predictor = DictionaryPredictor(
                self.config, self.contextMonitor, predictorName)
        else:
            predictor = None
        if predictor:
            self.append(predictor)

    def close_databases(self):
        """Close every opened predictor database."""
        for predictor in self:
            predictor.close_database()

366 """Base class for predictors.
367
368 G{classtree Predictor}
369 """
370
371 __metaclass__ = abc.ABCMeta
372
    def __init__(self, config, contextMonitor):
        """Predictor creator.

        @param config:
            The config is used to retrieve the predictor settings from
            the config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor
            to get the input buffers' tokens.
        @type contextMonitor: L{cntxt.ContextMonitor}
        """
        self.contextMonitor = contextMonitor
        self.name = "Predictor doesn't set any name"
        self.config = config

    @abc.abstractmethod
    def predict(self, maxPartialPredictionSize, stopList):
        raise NotImplementedError("Method must be implemented")

    @abc.abstractmethod
    def learn(self, change):
        raise NotImplementedError("Method must be implemented")

399 """Compute prediction from n-gram model in database.
400
401 G{classtree WeightNgramPredictor}
402 """
403
    def __init__(self, config, contextMonitor, predictorName=None):
        """WeightNgramPredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from
            the config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor
            to get the input buffers' tokens.
        @type contextMonitor: L{cntxt.ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.db = None
        self.dbFile = self.config.getas(self.name, 'DBFILENAME')
        self.deltas = self.config.getas(self.name, 'DELTAS', 'floatlist')
        self.learnMode = self.config.getas(self.name, 'learn')
        self.maxN = len(self.deltas)
        self.init_database_connector()

429 """Initialize the database connector.
430
431 Using the database file path, the n-gram maximum size and the learn
432 mode to initialize and open the database.
433 """
434 if self.dbFile and self.maxN > 0:
435 self.db = db.SqliteDatabaseConnector(self.dbFile, self.maxN)
436
    def predict(self, maxPartialPredictionSize, stopList=[]):
        """Predict the next word according to the current context.

        Use the input buffers (thanks to contextMonitor) and the n-gram
        database to predict the most probable suggestions.
        A suggestion is a word which can:
            - Predict the end of the word, i.e. complete the actual
              partial word (the user has not finished typing the word and
              we try to predict its end).
            - Predict the next word (the user has typed a separator after
              a word and we try to predict the next word before they
              start typing it).

        In order to compute the suggestions, this method:
            - Retrieves the last n tokens from the left input buffer,
              where n is the maximum n-gram size (max(n)) stored in the
              database.
            - Loops over each n-gram size from max(n) down to 1 (see the
              sketch below):
                - Finds the n-grams of the current size in the database
                  which match the last input tokens.
                - Adds each retrieved n-gram to the suggestion list if it
                  is not already in it and if the maximum number of
                  suggestions has not been reached yet.

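        For instance, with max(n) = 3 and a left buffer ending in
        "the quick bro", the successive database lookups would use the
        prefix n-grams (an illustrative sketch)::

            ['the', 'quick', 'bro']    # 3-gram prefix
            ['quick', 'bro']           # 2-gram prefix
            ['bro']                    # 1-gram prefix
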
        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestion list is immediately returned.
            DatabaseConnector.ngram_table_tp() returns the records in
            descending order according to their number of occurrences, so
            the most probable suggestions are added to the list first.
            This results in no loss of suggestion quality, regardless of
            the desired number of suggestions.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestion
            list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{prdct.Prediction}
        """
        tokens = [''] * self.maxN
        for i in range(self.maxN):
            tokens[self.maxN - 1 - i] = self.contextMonitor.left_token(i)
        prefixCompletionCandidates = []
        for k in reversed(range(self.maxN)):
            if len(prefixCompletionCandidates) >= maxPartialPredictionSize:
                break
            prefixNgram = tokens[(len(tokens) - k - 1):]
            partial = self.db.ngram_table_tp(
                prefixNgram,
                maxPartialPredictionSize - len(prefixCompletionCandidates))
            for p in partial:
                if len(prefixCompletionCandidates) >= \
                        maxPartialPredictionSize:
                    break
                candidate = p[-2]
                if candidate not in prefixCompletionCandidates:
                    if candidate.lower() not in stopList:
                        prefixCompletionCandidates.append(candidate)
        return self.weight(prefixCompletionCandidates, tokens)

    def weight(self, prefixCompletionCandidates, tokens):
        """Compute the suggestions' probabilities and return the most
        probable ones.

        The probability of a suggestion is based on its relative frequency
        toward the whole set of suggestions and on the number of single
        tokens in the database.

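        Restated as a formula, the loop below computes, for a candidate
        word w preceded by the context tokens (a restatement of the code,
        not an addition)::

            P(w) = sum over k in [0, maxN - 1] of
                   deltas[k] * C(context_k + w) / C(context_k)

        where context_k is the last k tokens before w, C() is the
        occurrence count stored in the database, and C() of the empty
        context is the total number of unigram occurrences.
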
        @param prefixCompletionCandidates:
            List of the suggestions returned by self.predict().
        @type prefixCompletionCandidates: list
        @param tokens:
            The last input tokens.
        @type tokens: list

        @return:
            List of every "good enough" suggestion.
        @rtype: L{prdct.Prediction}
        """
        prediction = Prediction()
        unigramCountsSum = self.db.sum_ngrams_occ(1)
        for candidate in prefixCompletionCandidates:
            tokens[self.maxN - 1] = candidate
            probability = 0
            for k in range(self.maxN):
                numerator = self.count(tokens, 0, k + 1)
                denominator = unigramCountsSum
                if numerator > 0:
                    denominator = self.count(tokens, -1, k)
                frequency = 0
                if denominator > 0:
                    frequency = float(numerator) / denominator
                probability += self.deltas[k] * frequency
            if probability > 0:
                prediction.add_suggestion(
                    Suggestion(tokens[self.maxN - 1], probability))
        return prediction

    def close_database(self):
        if self.db:
            self.db.close_database()

    def learn(self, change):
        """Learn what needs to be learnt by adding n-grams to the database.

        @param change:
            The part of the left input buffer which represents the last
            change.
        @type change: str
        """
        if self.learnMode is False:
            return
        ngramMap = self.make_ngram_map(change)
        ngramMap = self.prefix_ngrams_with_input(change, ngramMap)
        self.push_ngrams_in_db(ngramMap)

554 """Create a map associating n-grams (lists of words) and their count.
555
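        For instance (a sketch; dict ordering may differ)::

            >>> # with self.maxN == 2
            >>> predictor.make_ngram_map(['a', 'b'])
            {('a',): 1, ('b',): 1, ('a', 'b'): 1}
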
        @param change:
            The part of the left input buffer which represents the last
            change.
        @type change: str

        @return:
            A dictionary associating each n-gram (tuple) with its number
            of occurrences in the change.
        @rtype: dict
        """
        ngramMap = {}
        for curCard in range(1, self.maxN + 1):
            changeIdx = 0
            changeSize = len(change)
            ngramList = ()
            for i in range(curCard - 1):
                if changeIdx >= changeSize:
                    break
                ngramList = ngramList + (change[changeIdx],)
                changeIdx += 1
            while changeIdx < changeSize:
                ngramList = ngramList + (change[changeIdx],)
                changeIdx += 1
                try:
                    ngramMap[ngramList] = ngramMap[ngramList] + 1
                except KeyError:
                    ngramMap[ngramList] = 1
                ngramList = ngramList[1:]
        return ngramMap

    def push_ngrams_in_db(self, ngramMap):
        """Update the database with the n-grams contained in the n-gram map.

        Each n-gram of the n-gram map is pushed into the database with its
        number of occurrences (count).
        If the n-gram is already in the database, then its count (number
        of occurrences) is updated. If the n-gram is not in the database,
        then it is simply inserted.

        @param ngramMap:
            Dictionary associating n-grams with their number of
            occurrences, generated by self.make_ngram_map() and modified
            by self.prefix_ngrams_with_input().
        @type ngramMap: dict
        """
        for ngram in ngramMap:
            count = self.db.ngram_count(ngram)
            if count > 0:
                self.db.update_ngram(ngram, count + ngramMap[ngram])
            else:
                self.db.insert_ngram(list(ngram), ngramMap[ngram])
        self.db.commit()

    def count(self, tokens, offset, n):
        """Make an n-gram then retrieve and return its 'count' entry in
        the database.

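        For instance (a sketch of the slicing below)::

            tokens = ['a', 'b', 'c']
            self.count(tokens, 0, 2)     # looks up the bigram ['b', 'c']
            self.count(tokens, -1, 2)    # looks up the bigram ['a', 'b']
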
        @param tokens:
            The tokens used to make the n-gram.
        @type tokens: list
        @param offset:
            Offset of the first token in tokens.
        @type offset: int
        @param n:
            Size of the n-gram.
        @type n: int
        """
        if n > 0:
            ngram = tokens[len(tokens) - n + offset:len(tokens) + offset]
            result = self.db.ngram_count(ngram)
        else:
            result = self.db.sum_ngrams_occ(1)
        return result

678 """Compute predictions based on their last occurences and frequencies.
679
680 G{classtree LastOccurPredictor}
681 """
682
    def __init__(self, config, contextMonitor, predictorName=None):
        """LastOccurPredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from
            the config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor
            to get the input buffers' tokens.
        @type contextMonitor: L{cntxt.ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.lambdav = self.config.getas(self.name, 'lambda', 'int')
        self.n0 = self.config.getas(self.name, 'n_0', 'int')
        self.cutoffThreshold = self.config.getas(
            self.name, 'cutoff_threshold', 'int')

    def predict(self, maxPartialPredictionSize, stopList=[]):
        """Compute the predictions using a simple exponential decay method.

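        Each matching token's probability decays with its distance from
        the end of the left buffer, following the loop below::

            P(token at index i) = n_0 * exp(-lambda * (i - 1))

        so the most recent matching token (i = 1) gets probability n_0,
        and each step further back multiplies it by exp(-lambda).
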
        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestion list is immediately returned.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestion
            list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{prdct.Prediction}
        """
        result = Prediction()
        prefix = self.contextMonitor.prefix()
        if prefix:
            index = 1
            token = self.contextMonitor.left_token(index)
            prob = 0
            while (token and
                   len(result) < maxPartialPredictionSize and
                   index <= self.cutoffThreshold):
                if token.startswith(prefix):
                    if token.lower() not in stopList:
                        prob = self.n0 * exp(-(self.lambdav * (index - 1)))
                        result.add_suggestion(Suggestion(token, prob))
                index += 1
                token = self.contextMonitor.left_token(index)
        return result

    def learn(self, change):
        """This predictor has no ability to learn."""
        pass


class MemorizePredictor(Predictor):
    """Predict words based on memorized (learnt) input token patterns.

    This predictor is capable of token memorization. It memorizes the
    input tokens and tries to predict suggestions using the memorized
    tokens and n-grams (groups of consecutive tokens).

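    For instance (an illustrative sketch), with trigger = 2 and a memory
    file containing the token sequence::

        the quick fox ... the quick dog

    a left context ending in "the quick" would suggest both "fox" and
    "dog".
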
    G{classtree MemorizePredictor}
    """

    def __init__(self, config, contextMonitor, predictorName=None):
        """MemorizePredictor creator.

        @param config:
            The config is used to retrieve the predictor settings from
            the config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor
            to get the input buffers' tokens.
        @type contextMonitor: L{cntxt.ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.memory = self.config.getas(self.name, 'memory')
        self.trigger = self.config.getas(self.name, 'trigger', 'int')
        self.learnMode = self.config.getas(self.name, 'learn')

    def predict(self, maxPartialPredictionSize, stopList):
        """Predict words based on memorized input tokens.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestion list is immediately returned.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestion
            list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{prdct.Prediction}
        """
        result = Prediction()
        memTrigger = []
        try:
            memFile = open(self.memory, 'r')
        except IOError:
            lg_error('Cannot open file ' + self.memory)
            return result
        if self.init_mem_trigg(memTrigger):
            rollingWindow = []
            if self.init_rolling_window(rollingWindow, memFile):
                # Scan the rest of the memory file: whenever the rolling
                # window matches the trigger tokens, the next token is a
                # candidate suggestion.
                for line in memFile:
                    token = line.strip('\n')
                    if (memTrigger == rollingWindow and
                            len(result) < maxPartialPredictionSize):
                        if token.lower() not in stopList:
                            result.add_suggestion(Suggestion(token, 1.))
                    self.update_rolling_window(rollingWindow, token)
        memFile.close()
        return result

    def learn(self, change):
        """Learn what needs to be learnt by pushing tokens to the memory
        file.

        @param change:
            The part of the left input buffer which represents the last
            change.
        @type change: str
        """
        if self.learnMode is False:
            return
        try:
            memFile = open(self.memory, 'a')
        except IOError:
            lg_error('Cannot open file ' + self.memory)
            return
        for tok in change:
            memFile.write(tok + '\n')
        memFile.close()

    def init_mem_trigg(self, memTrigger):
        """Fill memTrigger with the last self.trigger left tokens.

        @return:
            True if no trigger token is empty, False otherwise.
        @rtype: bool
        """
        result = False
        for i in range(self.trigger, 0, -1):
            memTrigger.append(self.contextMonitor.left_token(i))
        if '' not in memTrigger:
            result = True
        return result

    def init_rolling_window(self, rollingWindow, memFile):
        """Fill the rolling window with the first self.trigger tokens of
        the memory file.

        @return:
            True if the window could be completely filled, False
            otherwise.
        @rtype: bool
        """
        count = 0
        line = memFile.readline()
        while count < self.trigger and line.strip('\n'):
            rollingWindow.append(line.strip('\n'))
            count += 1
            line = memFile.readline()
        return count == self.trigger

    def update_rolling_window(self, rollingWindow, token):
        """Slide the rolling window one token forward."""
        del rollingWindow[0]
        rollingWindow.append(token)

861 """Very simple word predictor using a dictionary.
862
863 The dictionary is a file containing one word per line. This predictor does
864 not use n-grams and is therefore less effective than the predictors using
865 n-grams because it does not consider context.
866
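    An illustrative dictionary file (one word per line)::

        hair
        hellish
        hello
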
    G{classtree DictionaryPredictor}
    """

    def __init__(self, config, contextMonitor, predictorName):
        """DictionaryPredictor creator.

        @note: The string.lower() and string.strip() methods have a great
            impact on performance (the profile module shows that they
            require almost 1 second of processing time when calculating
            suggestions for 10 contexts). So this constructor no longer
            uses the dictionary file directly: a database is created
            instead. Every word of the dictionary is lowered and
            stripped, then added to the database. Doing so, the
            performance of the predictor is far better. Profiling a
            script querying suggestions for 10 successive contexts shows
            the improvement:
                - lower()ing and strip()ping each word of the file on
                  each predict() call::

                    ncalls  tottime  percall  cumtime  percall filename:lineno
                    690048    0.468    0.000    0.468    0.000 :0(lower)

                - Creating an improved list upon initialization and using
                  it on each predict() call (previous optimization
                  method)::

                    ncalls  tottime  percall  cumtime  percall filename:lineno
                    100046    0.059    0.000    0.059    0.000 :0(lower)

                  It is approx. 800% faster. But this profiling mixes
                  initialization and later computation: most of the time
                  in the line above is spent initializing the list, so
                  the savings on each predict() call are even greater.
                - Creating a database and querying it on each predict()
                  call::

                    ncalls  tottime  percall  cumtime  percall filename:lineno
                    100046    0.059    0.000    0.059    0.000 :0(lower)

                  It is not faster than the previous method, but the
                  database must only be created once. Once it is created,
                  the initialization time is (near) null and the querying
                  time on each predict() call is even faster.

        @change:
            - 08/06/15: The method now creates an ordered, optimized list
              containing the dictionary words upon initialization in
              order to increase the speed of the predictor.
            - 13/06/15: The method now uses a database containing the
              dictionary words. See: L{minr.DictMiner}

        @param config:
            The config is used to retrieve the predictor settings from
            the config file.
        @type config: L{drvr.Configuration}
        @param contextMonitor:
            The contextMonitor is needed because it allows the predictor
            to get the input buffers' tokens.
        @type contextMonitor: L{cntxt.ContextMonitor}
        @param predictorName:
            The custom name of the configuration using this predictor.
        @type predictorName: str
        """
        Predictor.__init__(self, config, contextMonitor)
        self.name = predictorName
        self.dbFile = self.config.getas(self.name, 'dbfilename')
        self.db = None
        self.prob = self.config.getas(self.name, 'probability', 'float')
        self.init_database_connector()

931 """Initialize the database connector.
932
933 Using the database file path, the n-gram maximum size and the learn
934 mode to initialize and open the database.
935 """
936 if self.dbFile:
937 self.db = db.SqliteDatabaseConnector(self.dbFile)
938
940 """Select the dictionary range where words starts with the given prefix.
941
942 A suggested word must complete the given token, it means that suggested
943 words all start with this token, here called the prefix.
944 This method create a list containing the suggested words for the
945 given prefix, i.e. every words of the dictionary list starting with
946 the prefix.
947 It is easy as the dictionary list is ordered. For instance:
948
949 If the prefix is::
950 'hell'
951
952 And the dictionary list is::
953 ['bird', 'blue', 'given', 'hair', 'hellish', 'hello', 'red', 'zip']
954
955 We first remove every words of the list one by one until we reach a word
956 which actualy starts with the prefix 'hell', then we have::
957 ['hellish', 'hello', 'red', 'zip']
958
959 Finaly we scan every words of the remaining list and when we reach a
960 word which does not starts with the given prefix then we know that every
961 remaining words won't start with the prefix neither as the list is
962 ordered, so we have::
963 ['hellish', 'hello']
964
965 @warning: This method has become useless since the words are now stored
966 in a database.
967
968 @param prefix:
969 The prefix from which suggested words range is computed.
970 @type prefix: str
971 """
972 rangeWords = []
973 for word in self.dictWords:
974 if word.startswith(prefix):
975 rangeWords = self.dictWords[self.dictWords.index(word):]
976 break
977 for word in rangeWords:
978 if not word.startswith(prefix):
979 rangeWords = rangeWords[:rangeWords.index(word)]
980 break
981 return rangeWords
982
    def predict(self, maxPartialPredictionSize, stopList):
        """Complete the current word or predict the next one using the
        dictionary.

        Use the input buffers (thanks to contextMonitor) and the word
        dictionary to predict the most probable suggestions.
        A suggestion is a word which can:
            - Predict the end of the word, i.e. complete the actual
              partial word (the user has not finished typing the word and
              we try to predict its end).
            - Predict the next word (the user has typed a separator after
              a word and we try to predict the next word before they
              start typing it).

        In order to compute the suggestions, this method:
            - Retrieves the last token from the left input buffer.
            - Loops over each word in the dictionary:
                - If the word starts with the last retrieved token: add
                  it to the suggestion list if the maximum number of
                  suggestions has not been reached yet.
                  It is not necessary to check if the word is already in
                  the suggestion list because in a dictionary each word
                  should only appear once. In any case, the combiner will
                  merge any duplicate suggestions.

        @param maxPartialPredictionSize:
            Maximum number of suggestions to compute. If this number is
            reached, the suggestion list is immediately returned.
            DatabaseConnector.ngram_table_tp() returns the records in
            descending order according to their number of occurrences, so
            the most probable suggestions are added to the list first.
            This results in no loss of suggestion quality, regardless of
            the desired number of suggestions.
        @type maxPartialPredictionSize: int
        @param stopList:
            The stoplist is a list of undesirable words. Any suggestion
            which is in the stopList won't be added to the suggestion
            list.
        @type stopList: list

        @return:
            A list of every possible suggestion (limited to
            maxPartialPredictionSize).
        @rtype: L{prdct.Prediction}
        """
        result = Prediction()
        prefix = self.contextMonitor.prefix().lower()
        count = 0
        candidates = self.db.ngram_table_tp(
            [prefix], maxPartialPredictionSize)
        for candidate in candidates:
            if count >= maxPartialPredictionSize:
                break
            candidate = candidate[-2]
            if candidate not in stopList:
                result.add_suggestion(Suggestion(candidate, self.prob))
                count += 1
        return result

    def learn(self, change):
        """This predictor has no ability to learn."""
        pass