
Source Code for Module tashaphyne.stemming

# -*- coding: UTF-8 -*-
"""
Arabic Light Stemmer: a class which provides a configurable stemmer
and segmentor for arabic text.

Features:
=========

    - Arabic word Light Stemming.
    - Root Extraction.
    - Word Segmentation
    - Word normalization
    - Default Arabic Affixes list.
    - A customizable Light stemmer: possibility of changing
    stemmer options and data.
    - Data independent stemmer


@author: Taha Zerrouki <taha_zerrouki at gmail dot com>
@author: Taha Zerrouki
@contact: taha dot zerrouki at gmail dot com
@copyright: Arabtechies, Arabeyes, Taha Zerrouki
@license: GPL
@date: 2017/02/15
@version: 0.3
"""

import re, sys
sys.path.append('tashaphyne/lib/')
import pyarabic.araby as araby

import tashaphyne.normalize as normalize
import tashaphyne.stem_const as stem_const

class ArabicLightStemmer:
    """
    ArabicLightStemmer: a class which provides a configurable stemmer
    and segmentor for arabic text.

    Features:
    =========

        - Arabic word Light Stemming.
        - Root Extraction.
        - Word Segmentation
        - Word normalization
        - Default Arabic Affixes list.
        - A customizable Light stemmer: possibility of changing
        stemmer options and data.
        - Data independent stemmer


    @author: Taha Zerrouki <taha_zerrouki at gmail dot com>
    @author: Taha Zerrouki
    @contact: taha dot zerrouki at gmail dot com
    @copyright: Arabtechies, Arabeyes, Taha Zerrouki
    @license: GPL
    @date: 2017/02/15
    @version: 0.3
    """

    def __init__(self):
        # load affix information
        self.prefix_letters = stem_const.DEFAULT_PREFIX_LETTERS
        self.suffix_letters = stem_const.DEFAULT_SUFFIX_LETTERS
        self.infix_letters = stem_const.DEFAULT_INFIX_LETTERS
        self.max_prefix_length = stem_const.DEFAULT_MAX_PREFIX
        self.max_suffix_length = stem_const.DEFAULT_MAX_SUFFIX
        self.min_stem_length = stem_const.DEFAULT_MIN_STEM
        self.joker = stem_const.DEFAULT_JOKER
        self.prefix_list = stem_const.DEFAULT_PREFIX_LIST
        self.suffix_list = stem_const.DEFAULT_SUFFIX_LIST
        self.word = u""
        self.unvocalized = u""
        self.normalized = u""
        self.starword = u""
        self.root = u""
        self.left = 0
        self.right = 0
        self.segment_list = []
        # token pattern: letters and harakat
        self.token_pat = re.compile(ur"[^\w\u064b-\u0652']+", re.UNICODE)
        self.prefixes_tree = self._create_prefix_tree(self.prefix_list)
        self.suffixes_tree = self._create_suffix_tree(self.suffix_list)

    ######################################################################
    #{ Attribute Functions
    ######################################################################
    def get_prefix_letters(self, ):
        """ return the prefixation letters.
        This constant takes DEFAULT_PREFIX_LETTERS by default.
        @return: the prefixation letters.
        @rtype: unicode.
        """
        return self.prefix_letters

    def set_prefix_letters(self, new_prefix_letters):
        """ set the prefixation letters.
        This constant takes DEFAULT_PREFIX_LETTERS by default.
        @param new_prefix_letters: letters to be stripped from the beginning
        of a word, e.g. new_prefix_letters = u"وف".
        @type new_prefix_letters: unicode.
        """
        self.prefix_letters = new_prefix_letters

    def get_suffix_letters(self, ):
        """ return the suffixation letters.
        This constant takes DEFAULT_SUFFIX_LETTERS by default.
        @return: the suffixation letters.
        @rtype: unicode.
        """
        return self.suffix_letters

    def set_suffix_letters(self, new_suffix_letters):
        """ set the suffixation letters.
        This constant takes DEFAULT_SUFFIX_LETTERS by default.
        @param new_suffix_letters: letters to be stripped from the end
        of a word, e.g. new_suffix_letters = u"ةون".
        @type new_suffix_letters: unicode.
        """
        self.suffix_letters = new_suffix_letters

    def get_infix_letters(self, ):
        """ get the infixation letters.
        This constant takes DEFAULT_INFIX_LETTERS by default.
        @return: infixes letters.
        @rtype: unicode.
        """
        return self.infix_letters

    def set_infix_letters(self, new_infix_letters):
        """ set the infixation letters.
        This constant takes DEFAULT_INFIX_LETTERS by default.
        @param new_infix_letters: letters to be stripped from the middle
        of a word, e.g. new_infix_letters = u"أوي".
        @type new_infix_letters: unicode.
        """
        self.infix_letters = new_infix_letters

    def get_joker(self, ):
        """ get the joker letter.
        This constant takes DEFAULT_JOKER by default.
        @return: joker letter.
        @rtype: unicode.
        """
        return self.joker

    def set_joker(self, new_joker):
        """ set the joker letter.
        This constant takes DEFAULT_JOKER by default.
        @param new_joker: joker letter.
        @type new_joker: unicode.
        """
        if len(new_joker) > 1:
            new_joker = new_joker[0]
        self.joker = new_joker

    def get_max_prefix_length(self, ):
        """ return the constant of max length of the prefix used by the stemmer.
        This constant takes DEFAULT_MAX_PREFIX by default.
        @return: a number.
        @rtype: integer.
        """
        return self.max_prefix_length

    def set_max_prefix_length(self, new_max_prefix_length):
        """ Set the constant of max length of the prefix used by the stemmer.
        This constant takes DEFAULT_MAX_PREFIX by default.
        @param new_max_prefix_length: the new max prefix length constant.
        @type new_max_prefix_length: integer.
        """
        self.max_prefix_length = new_max_prefix_length

    def get_max_suffix_length(self, ):
        """ return the constant of max length of the suffix used by the stemmer.
        This constant takes DEFAULT_MAX_SUFFIX by default.
        @return: a number.
        @rtype: integer.
        """
        return self.max_suffix_length

    def set_max_suffix_length(self, new_max_suffix_length):
        """ Set the constant of max length of the suffix used by the stemmer.
        This constant takes DEFAULT_MAX_SUFFIX by default.
        @param new_max_suffix_length: the new max suffix length constant.
        @type new_max_suffix_length: integer.
        """
        self.max_suffix_length = new_max_suffix_length

    def get_min_stem_length(self, ):
        """ return the constant of min length of the stem used by the stemmer.
        This constant takes DEFAULT_MIN_STEM by default.
        @return: a number.
        @rtype: integer.
        """
        return self.min_stem_length

    def set_min_stem_length(self, new_min_stem_length):
        """ Set the constant of min length of the stem used by the stemmer.
        This constant takes DEFAULT_MIN_STEM by default.
        @param new_min_stem_length: the min stem length constant.
        @type new_min_stem_length: integer.
        """
        self.min_stem_length = new_min_stem_length

    def get_prefix_list(self, ):
        """ return the prefixes list used by the stemmer.
        This constant takes DEFAULT_PREFIX_LIST by default.
        @return: prefixes list.
        @rtype: set().
        """
        return self.prefix_list

    def set_prefix_list(self, new_prefix_list):
        """ Set the prefixes list used by the stemmer.
        This constant takes DEFAULT_PREFIX_LIST by default.
        @param new_prefix_list: a set of prefixes.
        @type new_prefix_list: set of unicode strings.
        """
        self.prefix_list = new_prefix_list
        self._create_prefix_tree(self.prefix_list)

    def get_suffix_list(self, ):
        """ return the suffixes list used by the stemmer.
        This constant takes DEFAULT_SUFFIX_LIST by default.
        @return: suffixes list.
        @rtype: set().
        """
        return self.suffix_list

    def set_suffix_list(self, new_suffix_list):
        """ Set the suffixes list used by the stemmer.
        This constant takes DEFAULT_SUFFIX_LIST by default.
        @param new_suffix_list: a set of suffixes.
        @type new_suffix_list: set of unicode strings.
        """
        self.suffix_list = new_suffix_list
        self._create_suffix_tree(self.suffix_list)

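
    # Configuration sketch (illustrative only; `my_prefixes` is a hypothetical
    # user-provided set of unicode prefixes): the affix data and length limits
    # above can be swapped at runtime through the setters.
    #   >>> stemmer = ArabicLightStemmer()
    #   >>> stemmer.set_max_prefix_length(4)
    #   >>> stemmer.set_prefix_list(my_prefixes)
    #   >>> stemmer.light_stem(u'والكتاب')
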
    def set_word(self, new_word):
        """ Set the word to be treated by the stemmer.
        @param new_word: the new word.
        @type new_word: unicode.
        """
        self.word = new_word

    def get_word(self):
        """ return the last word treated by the stemmer.
        @return: word.
        @rtype: unicode.
        """
        return self.word

    #########################################################
    #{ Calculated Attribute Functions
    #########################################################

    def get_starword(self):
        """ return the starlike word treated by the stemmer.
        All non affix letters are converted to a joker.
        The joker takes by default DEFAULT_JOKER = "*".

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتضربونني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_starword()
        أفت***ونني

        @return: word.
        @rtype: unicode.
        """
        return self.starword

    def get_root(self, prefix_index = -1, suffix_index = -1):
        """ return the root of the word treated by the stemmer.
        All non affix letters are converted to a joker.
        All letters in the joker places are part of the root.
        The joker takes by default DEFAULT_JOKER = "*".

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتضربونني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_starword()
        أفت***ونني
        >>> print ArListem.get_root()
        ضرب

        @param prefix_index: indicates the left stemming position,
        if = -1: not considered, and take the default word prefix length.
        @type prefix_index: integer.
        @param suffix_index: indicates the right stemming position,
        if = -1: not considered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: root.
        @rtype: unicode.
        """
        if prefix_index >= 0 or suffix_index >= 0:
            self.extract_root(prefix_index, suffix_index)
        return self.root

    def get_normalized(self):
        """ return the normalized form of the word treated by the stemmer.
        Some letters are converted into a normal form, like Hamzat.

        Example:
        >>> word = u"استؤجرُ"
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_normalized()
        استءجر

        @return: normalized word.
        @rtype: unicode.
        """
        return self.normalized

    def get_unvocalized(self):
        """ return the unvocalized form of the word treated by the stemmer.
        Harakat are stripped.

        Example:
        >>> word = u"الْعَرَبِيّةُ"
        >>> ArListem = ArabicLightStemmer()
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_unvocalized()
        العربية

        @return: unvocalized word.
        @rtype: unicode.
        """
        return self.unvocalized

    def get_left(self):
        """ return the left position of stemming
        (prefix end position) in the word treated by the stemmer.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتضربونني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_starword()
        أفت***ونني
        >>> print ArListem.get_left()
        3

        @return: the left position of stemming.
        @rtype: integer.
        """
        return self.left

    def get_right(self):
        """ return the right position of stemming
        (suffix start position) in the word treated by the stemmer.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتضربونني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_starword()
        أفت***ونني
        >>> print ArListem.get_right()
        6

        @return: the right position of stemming.
        @rtype: integer.
        """
        return self.right

    def get_stem(self, prefix_index = -1, suffix_index = -1):
        """ return the stem of the word treated by the stemmer.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتكاتبانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_stem()
        كاتب

        @param prefix_index: indicates the left stemming position,
        if = -1: not considered, and take the default word prefix length.
        @type prefix_index: integer.
        @param suffix_index: indicates the right stemming position,
        if = -1: not considered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: stem.
        @rtype: unicode.
        """
        if prefix_index < 0:
            left = self.left
        else:
            left = prefix_index
        if suffix_index < 0:
            right = self.right
        else:
            right = suffix_index
        return self.unvocalized[left:right]

    def get_starstem(self, prefix_index = -1, suffix_index = -1):
        """ return the star form stem of the word treated by the stemmer.
        All non affix letters are converted to a joker.
        The joker takes by default DEFAULT_JOKER = "*".

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتكاتبانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_stem()
        كاتب
        >>> print ArListem.get_starstem()
        *ات*

        @param prefix_index: indicates the left stemming position,
        if = -1: not considered, and take the default word prefix length.
        @type prefix_index: integer.
        @param suffix_index: indicates the right stemming position,
        if = -1: not considered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: stared form of stem.
        @rtype: unicode.
        """
        if prefix_index < 0 and suffix_index < 0:
            return self.starword[self.left:self.right]
        else:
            left = self.left
            right = self.right
            if prefix_index >= 0:
                left = prefix_index
            if suffix_index >= 0:
                right = suffix_index
            if self.infix_letters != "":
                newstarstem = re.sub(u"[^%s]" % self.infix_letters, \
                    self.joker, self.starword[left:right])
            else:
                newstarstem = self.joker*len(self.starword[left:right])

            return newstarstem

    # def get_prefix(self):
    #     return self.unvocalized[:self.left]

    def get_prefix(self, prefix_index = -1):
        """ return the prefix of the word treated by the stemmer.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتكاتبانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_prefix()
        أفت

        @param prefix_index: indicates the left stemming position,
        if = -1: not considered, and take the default word prefix length.
        @type prefix_index: integer.
        @return: prefix.
        @rtype: unicode.
        """
        if prefix_index < 0:
            return self.unvocalized[:self.left]
        else:
            return self.unvocalized[:prefix_index]

    def get_suffix(self, suffix_index = -1):
        """ return the suffix of the word treated by the stemmer.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتكاتبانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_suffix()
        انني

        @param suffix_index: indicates the right stemming position,
        if = -1: not considered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: suffix.
        @rtype: unicode.
        """
        if suffix_index < 0:
            return self.unvocalized[self.right:]
        else:
            return self.unvocalized[suffix_index:]

    def get_affix(self, prefix_index = -1, suffix_index = -1):
        """ return the affix of the word treated by the stemmer.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتكاتبانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_affix()
        أفت-انني

        @param prefix_index: indicates the left stemming position,
        if = -1: not considered, and take the default word prefix length.
        @type prefix_index: integer.
        @param suffix_index: indicates the right stemming position,
        if = -1: not considered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: affix.
        @rtype: unicode.
        """
        return u"-".join([self.get_prefix(prefix_index), \
            self.get_suffix(suffix_index)])

    def get_affix_tuple(self, prefix_index = -1, suffix_index = -1):
        """ return the affix tuple of the word treated by the stemmer.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتضاربانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_affix_tuple()
        {'prefix': u'أفت', 'root': u'ضرب', 'suffix': u'انني', 'stem': u'ضارب'}

        @param prefix_index: indicates the left stemming position,
        if = -1: not considered, and take the default word prefix length.
        @type prefix_index: integer.
        @param suffix_index: indicates the right stemming position,
        if = -1: not considered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: affix tuple.
        @rtype: dict.
        """
        return {
            'prefix': self.get_prefix(prefix_index),
            'suffix': self.get_suffix(suffix_index),
            'stem': self.get_stem(prefix_index, suffix_index),
            'root': self.get_root(prefix_index, suffix_index),
        }

    #########################################################
    #{ Stemming Functions
    #########################################################
    def light_stem(self, word):
        u"""
        Stemming function: stem an arabic word and return the stem.
        This function stores the stemming positions (left, right) in the
        instance, so other calculated attributes can then be retrieved,
        such as: stem, prefix, suffix, root.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتضاربانني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_stem()
        ضارب
        >>> print ArListem.get_starstem()
        *ا**
        >>> print ArListem.get_left()
        3
        >>> print ArListem.get_right()
        6
        >>> print ArListem.get_root()
        ضرب

        @param word: the input word.
        @type word: unicode.
        @return: stem.
        @rtype: unicode.
        """
        if word == u'':
            return u''
        #~ starword, left, right = self.transform2stars(word)
        self.transform2stars(word)
        # constitute the root
        self.extract_root()
        return self.get_stem()

    def transform2stars(self, word):
        """
        Transform all non affixation letters into a star.
        The star is a joker (by default '*'),
        which indicates that the corresponding letter is an original one.
        This function is used by the stemmer to identify original letters,
        and returns the stared form and the stemming positions (left, right).

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتضاربانني'
        >>> starword, left, right = ArListem.transform2stars(word)
        (أفت*ا**انني, 3, 6)

        @param word: the input word.
        @type word: unicode
        @return: (starword, left, right):
            - starword : all original letters converted into a star
            - left : the greatest possible left stemming position.
            - right : the greatest possible right stemming position.
        @rtype: tuple.
        """
        self.word = word
        word = araby.strip_tashkeel(word)
        # word, harakat = araby.separate(word)
        self.unvocalized = word
        word = re.sub("[%s]" % (araby.ALEF_MADDA), araby.HAMZA+araby.ALEF, word)
        word = re.sub("[^%s%s]" % (self.prefix_letters, self.suffix_letters), \
            self.joker, word)
        #~ ln = len(word)
        left = word.find(self.joker)
        right = word.rfind(self.joker)
        if left >= 0:
            left = min(left, self.max_prefix_length-1)
            right = max(right+1, len(word)-self.max_suffix_length)
            prefix = word[:left]
            stem = word[left:right]
            suffix = word[right:]
            prefix = re.sub("[^%s]" % self.prefix_letters, self.joker, prefix)
            # avoid null infixes
            if self.infix_letters != u"":
                stem = re.sub("[^%s]" % self.infix_letters, self.joker, stem)
            suffix = re.sub("[^%s]" % self.suffix_letters, self.joker, suffix)
            word = prefix+stem+suffix

        left = word.find(self.joker)
        right = word.rfind(self.joker)
        # prefix_list = self.PREFIX_LIST
        # suffix_list = self.SUFFIX_LIST

        if left < 0:
            left = min(self.max_prefix_length, len(word)-2)
        if left >= 0:
            prefix = word[:left]
            while prefix != "" and prefix not in self.prefix_list:
                prefix = prefix[:-1]
            if right < 0:
                right = max(len(prefix), len(word)-self.max_suffix_length)
            suffix = word[right:]

            while suffix != "" and suffix not in self.suffix_list:
                suffix = suffix[1:]
            left = len(prefix)
            right = len(word)-len(suffix)
            stem = word[left:right]
            # convert the stem into stars.
            # a stem must start with alef, or end with alef.
            # any other infix letter is not an infix at the border of the stem.
            # substitute all non infix letters
            if self.infix_letters != "":
                stem = re.sub("[^%s]" % self.infix_letters, self.joker, stem)

            # substitute teh in infixes: the teh must be in the first
            # or second place, all others are converted
            #
            # stem = stem[:2]+re.sub(TEH, self.joker, stem[2:])
            word = prefix+stem+suffix
        # store result
        self.left = left
        self.right = right
        self.starword = word
        self.extract_root()
        # return starword, left, right position of stem
        return (word, left, right)

    def extract_root(self, prefix_index = -1, suffix_index = -1):
        """ return the root of the word treated by the stemmer.
        All non affix letters are converted to a joker.
        All letters in the joker places are part of the root.
        The joker takes by default DEFAULT_JOKER = "*".

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتضربونني'
        >>> stem = ArListem.light_stem(word)
        >>> print ArListem.get_starword()
        أفت***ونني
        >>> print ArListem.get_root()
        ضرب

        @param prefix_index: indicates the left stemming position,
        if = -1: not considered, and take the default word prefix length.
        @type prefix_index: integer.
        @param suffix_index: indicates the right stemming position,
        if = -1: not considered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: root.
        @rtype: unicode.
        """
        starstem = self.get_starstem(prefix_index, suffix_index)
        stem = self.get_stem(prefix_index, suffix_index)
        root = u""
        if len(starstem) == len(stem):
            for i in range(len(stem)):
                if starstem[i] == self.joker:
                    root += stem[i]
        else:
            root = stem
        self.root = root
        return root

    def _create_prefix_tree(self, prefixes):
        """
        Create a prefixes tree from a given prefixes list.
        @param prefixes: list of prefixes
        @type prefixes: list of unicode
        @return: prefixes tree
        @rtype: Tree structure
        """
        prefixestree = {}
        for prefix in prefixes:
            # print prefix.encode('utf8')
            branch = prefixestree
            for char in prefix:
                if not branch.has_key(char):
                    branch[char] = {}
                branch = branch[char]
            # branch['#'] = '#' # the hash # as an end position
            if branch.has_key('#'):
                branch['#'][prefix] = "#"
            else:
                branch['#'] = {prefix: "#", }
        self.prefixes_tree = prefixestree
        return self.prefixes_tree
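
    # For illustration only (a minimal sketch, assuming the prefix list were
    # just {u'ب', u'بال'}): _create_prefix_tree would build the nested dict
    #   {u'ب': {'#': {u'ب': '#'},
    #           u'ا': {u'ل': {'#': {u'بال': '#'}}}}}
    # where each '#' branch records the complete prefixes ending at that node.
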
    def _create_suffix_tree(self, suffixes):
        """
        Create a suffixes tree from a given suffixes list.
        @param suffixes: list of suffixes
        @type suffixes: list of unicode
        @return: suffixes tree
        @rtype: Tree structure
        """
        suffixestree = {}
        for suffix in suffixes:
            # print (u"'%s'"%suffix).encode('utf8')
            branch = suffixestree
            # walk the suffix backwards (reversed string)
            for char in suffix[::-1]:
                if not branch.has_key(char):
                    branch[char] = {}
                branch = branch[char]
            # branch['#'] = '#' # the hash # as an end position
            if branch.has_key('#'):
                branch['#'][suffix] = "#"
            else:
                branch['#'] = {suffix: "#", }
        self.suffixes_tree = suffixestree
        return self.suffixes_tree

    def lookup_prefixes(self, word):
        """
        Lookup for prefixes in the word.
        @param word: the given word
        @type word: unicode
        @return: list of prefix end positions (candidate left stemming positions)
        @rtype: list of int
        """
        branch = self.prefixes_tree
        lefts = [0, ]
        i = 0
        while i < len(word) and branch.has_key(word[i]):
            if branch.has_key('#'):
                # if branch['#'].has_key(word[:i]):
                lefts.append(i)
            if branch.has_key(word[i]):
                branch = branch[word[i]]
            else:
                # i += 1
                break
            i += 1
        if i < len(word) and branch.has_key('#'):
            lefts.append(i)
        return lefts
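
    # A minimal trace (sketch, assuming the prefix tree were built from just
    # {u'ب', u'بال'}): lookup_prefixes(u'بالقلم') would return [0, 1, 3],
    # i.e. the empty prefix, u'ب' and u'بال' all match the start of the word.
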

    def lookup_suffixes(self, word):
        """
        Lookup for suffixes in the word.
        @param word: the given word
        @type word: unicode
        @return: list of suffix start positions (candidate right stemming positions)
        @rtype: list of int
        """
        branch = self.suffixes_tree
        suffix = ''
        # rights = [len(word)-1, ]
        rights = []
        i = len(word)-1
        while i >= 0 and branch.has_key(word[i]):
            suffix = word[i]+suffix
            if branch.has_key('#'):
                # if branch['#'].has_key(word[i:]):
                # rights.append(i)
                rights.append(i+1)
            if branch.has_key(word[i]):
                branch = branch[word[i]]
            else:
                # i -= 1
                break
            i -= 1
        if i >= 0 and branch.has_key('#'):  # and branch['#'].has_key(word[i+1:]):
            rights.append(i+1)
        return rights

    #########################################################
    #{ Segmentation Functions
    #########################################################

    def segment(self, word):
        """ generate a list of all possible segmentation positions
        (left, right) of the word treated by the stemmer.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'فتضربين'
        >>> print ArListem.segment(word)
        set([(1, 5), (2, 5), (0, 7)])

        @return: list of segmentations
        @rtype: set of tuple of integer.
        """
        self.word = word
        self.unvocalized = araby.strip_tashkeel(word)
        # word, harakat = araby.separate(word)
        word = re.sub("[%s]" % (araby.ALEF_MADDA), araby.HAMZA+araby.ALEF, word)

        # get all left positions of prefixes
        lefts = self.lookup_prefixes(word)
        # get all right positions of suffixes
        rights = self.lookup_suffixes(word)
        if lefts:
            self.left = max(lefts)
        else:
            self.left = -1
        if rights:
            self.right = min(rights)
        else:
            self.right = -1
        #~ ln = len(word)
        self.segment_list = set([(0, len(word))])
        # print lefts, rights
        for i in lefts:
            for j in rights:
                if j >= i+2:
                    self.segment_list.add((i, j))
        return self.segment_list

    def get_segment_list(self):
        """ return the list of segmentation positions (left, right)
        of the word treated by the stemmer.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'فتضربين'
        >>> ArListem.segment(word)
        >>> print ArListem.get_segment_list()
        set([(1, 5), (2, 5), (0, 7)])

        @return: list of segmentations
        @rtype: set of tuple of integer.
        """
        return self.segment_list

    def get_affix_list(self, ):
        u""" return a list of affix tuples of the word treated by the stemmer.

        Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'فتضربين'
        >>> ArListem.segment(word)
        >>> print ArListem.get_affix_list()
        [{'prefix': u'ف', 'root': u'ضرب', 'suffix': u'\u064aن', 'stem': u'تضرب'},
        {'prefix': u'فت', 'root': u'ضرب', 'suffix': u'\u064aن', 'stem': u'ضرب'},
        {'prefix': u'', 'root': u'فضربن', 'suffix': u'', 'stem': u'فتضرب\u064aن'}]

        @return: list of affix tuples
        @rtype: list of dict.
        """
        affix_list = []
        for item in self.segment_list:
            affix_list.append(self.get_affix_tuple(item[0], item[1]))
        return affix_list

    ###############################################################
    #{ General Functions
    ###############################################################

    def normalize(self, word = u""):
        """
        Normalize a word.
        Convert some letter forms into a unified form.
        @param word: the input word; if the word is empty,
        the word member of the class is normalized.
        @type word: unicode.
        @return: normalized word.
        @rtype: unicode.
        """
        if word == u'' and self.word == u"":
            return u""
        elif word != u'':
            self.word = word
        else:
            word = self.word
        self.normalized = normalize.normalize_searchtext(word)
        return self.normalized

    def tokenize(self, text = u""):
        """
        Tokenize text into words.
        @param text: the input text.
        @type text: unicode.
        @return: list of words.
        @rtype: list.
        """
        if not text:
            return []
        else:
            mylist = self.token_pat.split(text)
            if u'' in mylist:
                mylist.remove(u'')
            return mylist
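
# A minimal usage sketch (illustrative only; the Arabic sample words follow
# the examples given in the docstrings above):
if __name__ == '__main__':
    stemmer = ArabicLightStemmer()
    word = u'أفتضاربانني'
    stem = stemmer.light_stem(word)
    print stem                          # the light stem, e.g. ضارب
    print stemmer.get_prefix()          # stripped prefix
    print stemmer.get_suffix()          # stripped suffix
    print stemmer.get_root()            # extracted root, e.g. ضرب
    print stemmer.segment(word)         # all (left, right) segmentation positions
    print stemmer.get_affix_list()      # affix tuples for every segmentation
    print stemmer.tokenize(u'السلام عليكم و رحمة الله')  # simple tokenization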