Package tashaphyne ::
Module stemming
|
|
1
2 """
3 Arabic Light Stemmer: a class which provides a configurable stemmer
4 and segmentor for arabic text.
5
6 Features:
7 =========
8
9 - Arabic word Light Stemming.
10 - Root Extraction.
11 - Word Segmentation
12 - Word normalization
13 - Default Arabic Affixes list.
14 - A customizable Light stemmer: the stemmer options and data
15 can be changed.
16 - Data independent stemmer
17
18
19 @author: Taha Zerrouki <taha_zerrouki at gmail dot com>
20 @author: Taha Zerrouki
21 @contact: taha dot zerrouki at gmail dot com
22 @copyright: Arabtechies, Arabeyes, Taha Zerrouki
23 @license: GPL
24 @date:2017/02/15
25 @version:0.3
26 """
27
28 import re, sys
29 sys.path.append('tashaphyne/lib/')
30 import pyarabic.araby as araby
31
32 import tashaphyne.normalize as normalize
33 import tashaphyne.stem_const as stem_const
34
35
37 """
38 ArabicLightStemmer: a class which provides a configurable stemmer
39 and segmentor for Arabic text.
40
41 Features:
42 =========
43
44 - Arabic word Light Stemming.
45 - Root Extraction.
46 - Word Segmentation
47 - Word normalization
48 - Default Arabic Affixes list.
49 - A customizable Light stemmer: the stemmer options and data
50 can be changed.
51 - Data independent stemmer
52
53
54 @author: Taha Zerrouki <taha_zerrouki at gmail dot com>
55 @author: Taha Zerrouki
56 @contact: taha dot zerrouki at gmail dot com
57 @copyright: Arabtechies, Arabeyes, Taha Zerrouki
58 @license: GPL
59 @date:2017/02/15
60 @version:0.3
61 """
63
64
65 self.prefix_letters = stem_const.DEFAULT_PREFIX_LETTERS
66 self.suffix_letters = stem_const.DEFAULT_SUFFIX_LETTERS
67 self.infix_letters = stem_const.DEFAULT_INFIX_LETTERS
68 self.max_prefix_length = stem_const.DEFAULT_MAX_PREFIX
69 self.max_suffix_length = stem_const.DEFAULT_MAX_SUFFIX
70 self.min_stem_length = stem_const.DEFAULT_MIN_STEM
71 self.joker = stem_const.DEFAULT_JOKER
72 self.prefix_list = stem_const.DEFAULT_PREFIX_LIST
73 self.suffix_list = stem_const.DEFAULT_SUFFIX_LIST
74 self.word = u""
75 self.unvocalized = u""
76 self.normalized = u""
77 self.starword = u""
78 self.root = u""
79 self.left = 0
80 self.right = 0
81 self.segment_list = []
82
83
84 self.token_pat = re.compile(ur"[^\w\u064b-\u0652']+", re.UNICODE)
85 self.prefixes_tree = self._create_prefix_tree(self.prefix_list)
86 self.suffixes_tree = self._create_suffix_tree(self.suffix_list)
87
88
89
91 """ return the prefixation letters.
92 This constant take DEFAULT_PREFIX_LETTERS by default.
93 @return: return a letters.
94 @rtype: unicode.
95 """
96 return self.prefix_letters
97
99 """ set the prefixation letters.
100 This constant take DEFAULT_PREFIX_LETTERS by default.
101 @param new_prefix_letters: letters to be striped from a word,
102 e.g.new_prefix_letters = u"وف":.
103 @type new_prefix_letters: unicode.
104 """
105 self.prefix_letters = new_prefix_letters
106
108 """ return the suffixation letters.
109 This constant take DEFAULT_SUFFIX_LETTERS by default.
110 @return: return a letters.
111 @rtype: unicode.
112 """
113 return self.suffix_letters
114
116 """ set the suffixation letters.
117 This constant take DEFAULT_SUFFIX_LETTERS by default.
118 @param new_suffix_letters: letters to be striped from the end of a word,
119 e.g.new_suffix_letters = u"ةون":.
120 @type new_suffix_letters: unicode.
121 """
122 self.suffix_letters = new_suffix_letters
123
125 """ get the inffixation letters.
126 This constant take DEFAULT_INFIX_LETTERS by default.
127 @return: infixes letters.
128 @rtype: unicode.
129 """
130 return self.infix_letters
131
133 """ set the inffixation letters.
134 This constant take DEFAULT_INFIX_LETTERS by default.
135 @param new_infix_letters: letters to be striped from the middle
136 of a word, e.g.new_infix_letters = u"أوي":.
137 @type new_infix_letters: unicode.
138 """
139 self.infix_letters = new_infix_letters
140
141
143 """ get the joker letter.
144 This constant take DEFAULT_JOKER by default.
145 @return: joker letter.
146 @rtype: unicode.
147 """
148 return self.joker
149
151 """ set the joker letter.
152 This constant take DEFAULT_JOKER by default.
153 @param new_joker: joker letter.
154 @type new_joker: unicode.
155 """
156 if len(new_joker)>1:
157 new_joker = new_joker[0]
158 self.joker = new_joker
159
161 """ return the constant of max length of the prefix used by the stemmer.
162 This constant take DEFAULT_MAX_PREFIX_LENGTH by default.
163 @return: return a number.
164 @rtype: integer.
165 """
166 return self.max_prefix_length
167
169 """ Set the constant of max length of the prefix used by the stemmer.
170 This constant take DEFAULT_MAX_PREFIX_LENGTH by default.
171 @param new_max_prefix_length: the new max prefix length constant.
172 @type new_max_prefix_length: integer.
173 """
174 self.max_prefix_length = new_max_prefix_length
175
177 """ return the constant of max length of the suffix used by the stemmer.
178 This constant take DEFAULT_MAX_SUFFIX_LENGTH by default.
179 @return: return a number.
180 @rtype: integer.
181 """
182 return self.max_suffix_length
183
185 """ Set the constant of max length of the suffix used by the stemmer.
186 This constant take DEFAULT_MAX_SUFFIX_LENGTH by default.
187 @param new_max_suffix_length: the new max suffix length constant.
188 @type new_max_suffix_length: integer.
189 """
190 self.max_suffix_length = new_max_suffix_length
191
193 """ return the constant of min length of the stem used by the stemmer.
194 This constant take DEFAULT_MIN_STEM_LENGTH by default.
195 @return: return a number.
196 @rtype: integer.
197 """
198 return self.min_stem_length
199
201 """ Set the constant of min length of the stem used by the stemmer.
202 This constant take DEFAULT_MIN_STEM_LENGTH by default.
203 @param new_min_stem_length: the min stem length constant.
204 @type new_min_stem_length: integer.
205 """
206 self.min_stem_length = new_min_stem_length
207
209 """ return the prefixes list used by the stemmer.
210 This constant take DEFAULT_PREFIX_LIST by default.
211 @return: prefixes list.
212 @rtype: set().
213 """
214 return self.prefix_list
216 """ Set prefixes list used by the stemmer.
217 This constant take DEFAULT_PREFIX_LIST by default.
218 @param new_prefix_list: a set of prefixes.
219 @type new_prefix_list: set of unicode string.
220 """
221 self.prefix_list = new_prefix_list
222 self._create_prefix_tree(self.prefix_list)
223
225 """ return the suffixes list used by the stemmer.
226 This constant take DEFAULT_SUFFIX_LIST by default.
227 @return: suffixes list.
228 @rtype: set().
229 """
230 return self.suffix_list
231
233 """ Set suffixes list used by the stemmer.
234 This constant take DEFAULT_SUFFIX_LIST by default.
235 @param new_suffix_list: a set of suffixes.
236 @type new_suffix_list: set of unicode string.
237 """
238 self.suffix_list = new_suffix_list
239 self._create_suffix_tree(self.suffix_list)
240
241
243 """ Set the word to treat by the stemmer.
244 @param new_word: the new word.
245 @type new_word: unicode.
246 """
247 self.word = new_word
248
250 """ return the last word treated by the stemmer.
251 @return: word.
252 @rtype: unicode.
253 """
254 return self.word
255
256
257
258
260 """ return the starlike word treated by the stemmer.
261 All non affix letters are converted to a joker.
262 The joker take by default DEFAULT_JOKER = "*".
263
264 Exmaple:
265 >>> ArListem = ArabicLightStemmer()
266 >>> word = u'أفتصربونني'
267 >>> stem = ArListem.lightStem(word)
268 >>> print ArListem.get_starword()
269 أفت***ونني
270
271 @return: word.
272 @rtype: unicode.
273 """
274 return self.starword
275
def get_root(self, prefix_index=-1, suffix_index=-1):
    """ Return the root of the word treated by the stemmer.

    When both indexes are negative, the root already stored on the
    instance (self.root) is returned untouched.  As soon as one of the
    indexes is given (>= 0), extract_root() is called with both indexes
    so that self.root is recomputed before it is returned.

    Example (carried over from the module documentation):
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتضاربانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_root()
        ضرب

    @param prefix_index: the left stemming position;
    if negative, it is not considered.
    @type prefix_index: integer.
    @param suffix_index: the right stemming position;
    if negative, it is not considered.
    @type suffix_index: integer.
    @return: root.
    @rtype: unicode.
    """
    keep_stored_root = prefix_index < 0 and suffix_index < 0
    if not keep_stored_root:
        # Explicit positions were requested: refresh self.root first.
        self.extract_root(prefix_index, suffix_index)
    return self.root
303
305 """ return the normalized form of the treated word by the stemmer.
306 Some letters are converted into normal form like Hamzat.
307
308 Example:
309 >>> word = u"استؤجرُ"
310 >>> ArListem = ArabicLightStemmer()
311 >>> stem = ArListem.lightStem(word)
312 >>> print ArListem.get_normalized()
313 استءجر
314
315 @return: normalized word.
316 @rtype: unicode.
317 """
318 return self.normalized
319
321 """ return the unvocalized form of the treated word by the stemmer.
322 Harakat are striped.
323
324 Example:
325 >>> word = u"الْعَرَبِيّةُ"
326 >>> ArListem = ArabicLightStemmer()
327 >>> stem = ArListem.lightStem(word)
328 >>> print ArListem.get_unvocalized()
329 العربية
330
331 @return: unvocalized word.
332 @rtype: unicode.
333 """
334 return self.unvocalized
335
337 """ return the the left position of stemming
338 (prefixe end position )in the word treated word by the stemmer.
339
340 Example:
341 >>> ArListem = ArabicLightStemmer()
342 >>> word = u'أفتصربونني'
343 >>> stem = ArListem.lightStem(word)
344 >>> print ArListem.get_starword()
345 أفت***ونني
346 >>> print ArListem.get_left()
347 3
348
349 @return: the left position of stemming.
350 @rtype: integer.
351 """
352 return self.left
353
355 """ return the the right position of stemming
356 (suffixe start position )in the word treated word by the stemmer.
357
358 Example:
359 >>> ArListem = ArabicLightStemmer()
360 >>> word = u'أفتصربونني'
361 >>> stem = ArListem.lightStem(word)
362 >>> print ArListem.get_starword()
363 أفت***ونني
364 >>> print ArListem.get_right()
365 6
366
367 @return: the right position of stemming.
368 @rtype: integer.
369 """
370
371 return self.right
372
def get_stem(self, prefix_index=-1, suffix_index=-1):
    """ Return the stem of the word treated by the stemmer.

    The stem is the slice of the unvocalized word lying between the
    left and right stemming positions.

    Example (carried over from the original documentation):
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتكاتبانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_stem()
        كاتب

    @param prefix_index: the left stemming position;
    if negative, the stored position self.left is used.
    @type prefix_index: integer.
    @param suffix_index: the right stemming position;
    if negative, the stored position self.right is used.
    @type suffix_index: integer.
    @return: stem.
    @rtype: unicode.
    """
    # A negative index means "fall back to the position kept on the instance".
    start = self.left if prefix_index < 0 else prefix_index
    end = self.right if suffix_index < 0 else suffix_index
    return self.unvocalized[start:end]
401
def get_starstem(self, prefix_index=-1, suffix_index=-1):
    """ Return the star form of the stem of the treated word.

    When both indexes are negative, the stored star word is simply
    sliced between self.left and self.right.  Otherwise the requested
    slice of the star word is taken and every character which is not
    one of self.infix_letters is replaced by the joker (self.joker).

    The infix letters and the joker are handled as plain characters,
    never as regular expression syntax, so values such as u"-", u"]"
    or a backslash are safe.

    Example (carried over from the original documentation):
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتكاتبانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_stem()
        كاتب
        >>> print ArListem.get_starstem()
        *ات*

    @param prefix_index: the left stemming position;
    if negative, the stored position self.left is used.
    @type prefix_index: integer.
    @param suffix_index: the right stemming position;
    if negative, the stored position self.right is used.
    @type suffix_index: integer.
    @return: stared form of stem.
    @rtype: unicode.
    """
    if prefix_index < 0 and suffix_index < 0:
        # Default positions: the stored star word is returned as is.
        return self.starword[self.left:self.right]

    left = self.left if prefix_index < 0 else prefix_index
    right = self.right if suffix_index < 0 else suffix_index
    segment = self.starword[left:right]

    infix_letters = self.infix_letters
    joker = self.joker
    # Membership test instead of a "[^...]" regex: the letters are not
    # interpreted as ranges/escapes, and an empty infix set naturally
    # turns every character into a joker.
    return u"".join([char if char in infix_letters else joker
                     for char in segment])
441
442
443
444
446 """ return the prefix of the treated word by the stemmer.
447
448 Example:
449 >>> ArListem = ArabicLightStemmer()
450 >>> word = u'أفتكاتبانني'
451 >>> stem = ArListem.lightStem(word)
452 >>> print ArListem.get_prefix()
453 أفت
454
455 @param prefix_index: indicate the left stemming position
456 if = -1: not cosidered, and take the default word prefix lentgh.
457 @type prefix_index:integer.
458 @return: prefixe.
459 @rtype: unicode.
460 """
461 if prefix_index < 0:
462 return self.unvocalized[:self.left]
463 else:
464 return self.unvocalized[:prefix_index]
465
466
468 """ return the suffix of the treated word by the stemmer.
469
470 Example:
471 >>> ArListem = ArabicLightStemmer()
472 >>> word = u'أفتكاتبانني'
473 >>> stem = ArListem.lightStem(word)
474 >>> print ArListem.get_suffix()
475 انني
476
477 @param suffix_index:indicate the right stemming position.
478 if = -1: not cosidered, and take the default word suffix position.
479 @type suffix_index: integer.
480 @return: suffixe.
481 @rtype: unicode.
482 """
483 if suffix_index < 0:
484 return self.unvocalized[self.right:]
485 else:
486 return self.unvocalized[suffix_index:]
487
def get_affix(self, prefix_index=-1, suffix_index=-1):
    """ Return the affix of the word treated by the stemmer.

    The affix is the prefix and the suffix joined by a dash (u"-");
    they are obtained from get_prefix() and get_suffix().

    Example (carried over from the original documentation):
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتكاتبانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_affix()
        أفت-انني

    @param prefix_index: the left stemming position,
    handed over to get_prefix().
    @type prefix_index: integer.
    @param suffix_index: the right stemming position,
    handed over to get_suffix().
    @type suffix_index: integer.
    @return: affix, as "prefix-suffix".
    @rtype: unicode.
    """
    leading = self.get_prefix(prefix_index)
    trailing = self.get_suffix(suffix_index)
    return u"-".join((leading, trailing))
509
511 """ return the affix tuple of the treated word by the stemmer.
512
513 Example:
514 >>> ArListem = ArabicLightStemmer()
515 >>> word = u'أفتضاربانني'
516 >>> stem = ArListem.lightStem(word)
517 >>> print ArListem.get_affix_tuple()
518 {'prefix': u'أفت', 'root': u'ضرب', 'suffix': u'انني', 'stem': u'ضارب'}
519
520 @param prefix_index: indicate the left stemming position
521 if = -1: not cosidered, and take the default word prefix lentgh.
522 @type prefix_index:integer.
523 @param suffix_index:indicate the right stemming position.
524 if = -1: not cosidered, and take the default word suffix position.
525 @type suffix_index: integer.
526 @return: affix tuple.
527 @rtype: dict.
528 """
529 return {
530 'prefix':self.get_prefix(prefix_index),
531 'suffix':self.get_suffix(suffix_index),
532 'stem':self.get_stem(prefix_index, suffix_index),
533 'root':self.get_root(prefix_index, suffix_index), }
534
535
536
538 u"""
539 Stemming function, stem an arabic word, and return a stem.
540 This function store in the instance the stemming positions (left, right), then it's possible to get other calculted attributs like : stem, prefixe, suffixe, root.
541
542 Example:
543 >>> ArListem = ArabicLightStemmer()
544 >>> word = u'أفتضاربانني'
545 >>> stem = ArListem.light_stem(word)
546 >>> print ArListem.get_stem()
547 ضارب
548 >>> print ArListem.get_starstem()
549 *ا**
550 >>> print ArListem.get_left()
551 3
552 >>> print ArListem.get_right()
553 6
554 >>> print ArListem.get_root()
555 ضرب
556
557 @param word: the input word.
558 @type word: unicode.
559 @return: stem.
560 @rtype: unicode.
561 """
562 if word == u'':
563 return u''
564
565 self.transform2stars(word)
566
567
568 self.extract_root()
569 return self.get_stem()
570
655
657 """ return the root of the treated word by the stemmer.
658 All non affix letters are converted to a joker.
659 All letters in the joker places are part of root.
660 The joker take by default DEFAULT_JOKER = "*".
661
662 Example:
663 >>> ArListem = ArabicLightStemmer()
664 >>> word = u'أفتصربونني'
665 >>> stem = ArListem.lightStem(word)
666 >>> print ArListem.get_starword()
667 أفت***ونني
668 >>> print ArListem.get_root()
669 ضرب
670
671 @param prefix_index: indicate the left stemming position
672 if = -1: not cosidered, and take the default word prefix lentgh.
673 @type prefix_index:integer.
674 @param suffix_index:indicate the right stemming position.
675 if = -1: not cosidered, and take the default word suffix position.
676 @type suffix_index: integer.
677 @return: root.
678 @rtype: unicode.
679 """
680
681 starstem = self.get_starstem(prefix_index, suffix_index)
682 stem = self.get_stem(prefix_index, suffix_index)
683 root = u""
684 if len(starstem) == len(stem):
685 for i in range(len(stem)):
686 if starstem[i] == self.joker:
687 root += stem[i]
688 else:
689 root = stem
690 self.root = root
691 return root
692
693
695 """
696 Create a prefixes tree from given prefixes list
697 @param prefixes: list of prefixes
698 @type prefixes: list of unicode
699 @return : prefixes tree
700 @rtype: Tree stucture
701 """
702 prefixestree = {}
703 for prefix in prefixes:
704
705 branch = prefixestree
706 for char in prefix:
707 if not branch.has_key(char):
708 branch[char] = {}
709 branch = branch[char]
710
711 if branch.has_key('#'):
712 branch['#'][prefix] = "#"
713 else:
714 branch['#'] = {prefix:"#", }
715 self.prefixes_tree = prefixestree
716 return self.prefixes_tree
718 """
719 Create a suffixes tree from given suffixes list
720 @param suffixes: list of suffixes
721 @type suffixes: list of unicode
722 @return : suffixes tree
723 @rtype: Tree stucture
724 """
725 suffixestree = {}
726 for suffix in suffixes:
727
728 branch = suffixestree
729
730 for char in suffix[::-1]:
731 if not branch.has_key(char):
732 branch[char] = {}
733 branch = branch[char]
734
735 if branch.has_key('#'):
736 branch['#'][suffix] = "#"
737 else:
738 branch['#'] = {suffix:"#", }
739 self.suffixes_tree = suffixestree
740 return self.suffixes_tree
741
743 """
744 lookup for prefixes in the word
745 @param word: the given word
746 @type word: unicode
747 @return : list of prefixes starts positions
748 @rtype: list of int
749 """
750 branch = self.prefixes_tree
751 lefts = [0, ]
752 i = 0
753 while i < len(word) and branch.has_key(word[i]):
754 if branch.has_key('#'):
755
756 lefts.append(i)
757 if branch.has_key(word[i]):
758 branch = branch[word[i]]
759 else:
760
761 break
762 i += 1
763 if i < len(word) and branch.has_key('#') :
764 lefts.append(i)
765 return lefts
766
767
769 """
770 lookup for suffixes in the word
771 @param word: the given word
772 @type word: unicode
773 @return : list of suffixes starts positions
774 @rtype: list of int
775 """
776 branch = self.suffixes_tree
777 suffix = ''
778
779 rights = []
780 i = len(word)-1
781 while i >= 0 and branch.has_key(word[i]):
782 suffix = word[i]+suffix
783 if branch.has_key('#'):
784
785
786 rights.append(i+1)
787 if branch.has_key(word[i]):
788 branch = branch[word[i]]
789 else:
790
791 break
792 i -= 1
793 if i >= 0 and branch.has_key('#') :
794 rights.append(i+1)
795 return rights
796
797
798
799
801 """ generate a list of all posibble segmentation positions
802 (lef, right) of the treated word by the stemmer.
803
804 Example:
805 >>> ArListem = ArabicLightStemmer()
806 >>> word = u'فتضربين'
807 >>> print ArListem.segment(word)
808 set(([(1, 5), (2, 5), (0, 7)])
809
810 @return: List of segmentation
811 @rtype: set of tuple of integer.
812 """
813 self.word = word
814 self.unvocalized = araby.strip_tashkeel(word)
815
816 word = re.sub("[%s]"%(araby.ALEF_MADDA), araby.HAMZA+araby.ALEF, word)
817
818
819 lefts = self.lookup_prefixes(word)
820
821 rights = self.lookup_suffixes(word)
822 if lefts:
823 self.left = max(lefts)
824 else:
825 self.left = -1
826 if rights:
827 self.right = min(rights)
828 else:
829 self.right = -1
830
831 self.segment_list = set([(0, len(word))])
832
833 for i in lefts:
834 for j in rights:
835 if j >= i+2:
836 self.segment_list.add((i, j))
837 return self.segment_list
838
839
840
841
842
844 """ return a list of segmentation positions (left, right)
845 of the treated word by the stemmer.
846
847 Example:
848 >>> ArListem = ArabicLightStemmer()
849 >>> word = u'فتضربين'
850 >>> ArListem.segment(word)
851 >>> print ArListem.get_segment_list()
852 set(([(1, 5), (2, 5), (0, 7)])
853
854 @return: List of segmentation
855 @rtype: set of tuple of integer.
856 """
857 return self.segment_list
858
859
861 u""" return a list of affix tuple of the treated word by the stemmer.
862
863 Example:
864 >>> ArListem = ArabicLightStemmer()
865 >>> word = u'فتضربين'
866 >>> ArListem.segment(word)
867 >>> print ArListem.get_affix_list()
868 [{'prefix': u'ف', 'root': u'ضرب', 'suffix': u'\u064aن', 'stem': u'تضرب'},
869 {'prefix': u'فت', 'root': u'ضرب', 'suffix': u'\u064aن', 'stem': u'ضرب'},
870 {'prefix': u'', 'root': u'فضربن', 'suffix': u'', 'stem': u'فتضرب\u064aن'}]
871
872 @return: List of Affixes tuple
873 @rtype: list of dict.
874 """
875 affix_list = []
876 for item in self.segment_list:
877 affix_list.append(self.get_affix_tuple(item[0], item[1]))
878 return affix_list
879
880
881
882
883
884
886 """
887 Normalize a word.
888 Convert some leters forms into unified form.
889 @param word: the input word, if word is empty,
890 the word member of the class is normalized.
891 @type word: unicode.
892 @return: normalized word.
893 @rtype: unicode.
894 """
895
896 if word == u'' and self.word == u"":
897 return u""
898 elif word != u'':
899 self.word = word
900 else:
901 word = self.word
902 self.normalized = normalize.normalize_searchtext(word)
903 return self.normalized
904
906 """
907 Tokenize text into words
908 @param text: the input text.
909 @type text: unicode.
910 @return: list of words.
911 @rtype: list.
912 """
913 if not text :
914 return []
915 else:
916 mylist = self.token_pat.split(text)
917 if u'' in mylist:
918 mylist.remove(u'')
919 return mylist
920