
Source Code for Module aranalex.analex

#!/usr/bin/python
# -*- coding: utf-8 -*-
#-------------------------------------------------------------------------------
# Name:        analex
# Purpose:     Arabic lexical analyzer; provides features to stem Arabic
#              words as nouns, verbs, or stopwords
#
# Author:      Taha Zerrouki (taha.zerrouki[at]gmail.com)
#
# Created:     31-10-2011
# Copyright:   (c) Taha Zerrouki 2011
# Licence:     GPL
#-------------------------------------------------------------------------------
import re
import pyarabic.araby as araby      # basic Arabic text functions
import stem_noun                    # noun stemming
import stem_verb                    # verb stemming
import stem_pounct_const            # punctuation constants
import core.stopwords as stopwords  # stopwords list
import core.wordtag as wordtag      # word type guessing (verb/noun)

class analex:
    """
    Arabic text morphological analyzer.
    Provides routines to analyze text.
    Can treat text as verbs or as nouns.
    """

    def __init__(self):
        """
        Create an Analex instance.
        """
        #print "len stop words", len(stopwords.STOPWORDS)
        self.nounstemmer = stem_noun.nounStemmer()  # to stem nouns
        self.verbstemmer = stem_verb.verbStemmer()  # to stem verbs
        self.tagger = wordtag.WordTagger()
        self.debug = False   # allow printing of internal analysis data
        self.limit = 10000   # maximum number of words treated in a text

        # Words contain Arabic letters and harakat.
        # Unicode considers Arabic harakat to be marks, not letters,
        # so the harakat are added to the regular expression used to tokenize.
        marks = u"".join(araby.TASHKEEL)
        # contains [FATHA, DAMMA, KASRA, SUKUN, DAMMATAN, KASRATAN, FATHATAN, SHADDA]
        # used to tokenize Arabic text
        self.token_pat = re.compile(u"([\w%s]+)" % marks, re.UNICODE)
        # allow partial vocalization support:
        # the text may be analyzed as partially or fully vocalized.
        self.partial_vocalization_support = True

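    # A brief illustration (added sketch, not part of the original module) of
    # why the harakat must appear in the pattern: with plain \w alone,
    # re.split would cut a vocalized word at every diacritic.
    #
    #   import re
    #   import pyarabic.araby as araby
    #   marks = u"".join(araby.TASHKEEL)
    #   pat = re.compile(u"([\w%s]+)" % marks, re.UNICODE)
    #   pat.split(u"كَتَبَ الولدُ")   # vocalized words stay whole tokens
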
    def text_treat(self, text):
        """ Deprecated: treat text to eliminate punctuation.
        @param text: input text.
        @type text: unicode.
        @return: treated text.
        @rtype: unicode.
        """
        return text

    def tokenize(self, text=u""):
        """
        Tokenize text into words.
        @param text: the input text.
        @type text: unicode.
        @return: list of words.
        @rtype: list.
        """
        if text == u'':
            return []
        else:
            mylist = self.token_pat.split(text)
            # strip whitespace from the fragments, then drop empty ones
            for i in range(len(mylist)):
                mylist[i] = re.sub("\s", '', mylist[i])
            while u'' in mylist:
                mylist.remove(u'')
            #print u"'".join(mylist).encode('utf8')
            return mylist

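    # Usage sketch (illustrative, not from the original source):
    #
    #   analyzer = analex()
    #   analyzer.tokenize(u"ذهب الطالب إلى المدرسة")
    #   # => [u'ذهب', u'الطالب', u'إلى', u'المدرسة']
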
    def text_tokenize(self, text):
        """
        Tokenize text into words, after treatment.
        @param text: the input text.
        @type text: unicode.
        @return: list of words.
        @rtype: list.
        """
        text = self.text_treat(text)
        list_word = self.tokenize(text)
        return list_word

    def set_debug(self, debug):
        """
        Set the debug attribute to allow printing internal analysis results.
        @param debug: the debug value.
        @type debug: True/False.
        """
        self.debug = debug
        self.nounstemmer.set_debug(debug)  # set debug on noun stemming
        self.verbstemmer.set_debug(debug)  # set debug on verb stemming

    def set_limit(self, limit):
        """
        Set the maximum number of words treated in a text.
        @param limit: the word number limit.
        @type limit: integer.
        """
        self.limit = limit

    def check_text(self, text, mode='all'):
        """
        Analyze text morphologically.
        @param text: the input text.
        @type text: unicode.
        @param mode: the mode of analysis: 'verbs', 'nouns', or 'all'.
        @type mode: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        list_word = self.text_tokenize(text)
        resulted_data = []
        checkedWords = {}  # cache of already analyzed words
        if mode == 'all':
            for word in list_word[:self.limit]:
                if word in checkedWords:
                    resulted_data.append(checkedWords[word])
                else:
                    one_data_list = self.check_word(word)
                    resulted_data.append(one_data_list)
                    checkedWords[word] = one_data_list
        elif mode == 'nouns':
            for word in list_word[:self.limit]:
                one_data_list = self.check_word_as_noun(word)
                resulted_data.append(one_data_list)
        elif mode == 'verbs':
            for word in list_word[:self.limit]:
                one_data_list = self.check_word_as_verb(word)
                resulted_data.append(one_data_list)
        return resulted_data

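    # Usage sketch (illustrative): analyze a sentence in full, or restrict
    # the analysis to a single word class:
    #
    #   analyzer = analex()
    #   all_results = analyzer.check_text(u"ذهب الطالب")
    #   noun_results = analyzer.check_text(u"ذهب الطالب", mode='nouns')
    #   # all_results[i] holds the list of analyses of the i-th token
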
    def check_text_as_nouns(self, text):
        """
        Analyze text morphologically as nouns.
        @param text: the input text.
        @type text: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        return self.check_text(text, "nouns")

    def check_text_as_verbs(self, text):
        """
        Analyze text morphologically as verbs.
        @param text: the input text.
        @type text: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        return self.check_text(text, "verbs")

    def check_word(self, word):
        """
        Analyze one word morphologically.
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        word = araby.stripTatweel(word)
        word_vocalised = word
        word_nm = araby.stripTashkeel(word)
        resulted_data = []

        # if the word is a punctuation mark or a number
        resulted_data += self.check_word_as_pounct(word_nm)

        # if the word is a stopword
        resulted_data += self.check_word_as_stopword(word_nm)

        # Todo: if the word is a stop word we have some problems,
        # since the stop word can also be another normal word (verb or noun);
        # we must consider this in future work.
        if len(resulted_data) == 0:
            # Todo: guess the word type

            # if the word is a verb
            if self.tagger.is_verb(word_nm):
                resulted_data += self.check_word_as_verb(word_nm)

            # if the word is a noun
            if self.tagger.is_noun(word_nm):
                resulted_data += self.check_word_as_noun(word_nm)

            # check if the word is normalized and the solutions are equivalent
            resulted_data = self.check_normalized(word_vocalised, resulted_data)
            # check the shadda-like case
            resulted_data = self.check_shadda(word_vocalised, resulted_data)

            # check if the word is vocalized like the results
            if self.partial_vocalization_support:
                resulted_data = self.check_partial_vocalized(word_vocalised, resulted_data)

        if len(resulted_data) == 0:
            resulted_data.append({
                'word': word,
                'procletic': '',
                'encletic': '',
                'prefix': '',
                'suffix': '',
                'stem': '',
                'original': '',
                'vocalized': '',
                'tags': u'',
                'type': 'unknown',
                'root': '',
                'template': '',
            })
        return resulted_data

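    # Shape sketch (illustrative): each analysis entry is a dict with the
    # keys shown above, so a caller can iterate, e.g.:
    #
    #   for analysis in analyzer.check_word(u'كتاب'):
    #       print analysis['type'], analysis['vocalized'].encode('utf8')
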
    def check_normalized(self, word_vocalised, resulted_data):
        """
        Filter the dictionary results against the entered word, to treat some
        normalization cases: the analyzer returns the words vocalized like the
        input. If the word is ذئب, the normalized form is ذءب, which can yield
        ذئب and ذؤب from the dictionary; this function keeps only the
        normalized results that agree with the given word, returning ذئب.
        @param word_vocalised: the input word.
        @type word_vocalised: unicode.
        @param resulted_data: the results found in the dictionary.
        @type resulted_data: list of dict.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        #print word_vocalised.encode('utf8')
        filtred_data = []
        inputword = araby.stripTashkeel(word_vocalised)
        for item in resulted_data:
            if 'vocalized' in item:
                outputword = araby.stripTashkeel(item['vocalized'])
                if inputword == outputword:
                    #item['tags'] += ':a'
                    filtred_data.append(item)
        return filtred_data

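    # Criterion sketch (illustrative): a result survives only when it equals
    # the input once diacritics are stripped:
    #
    #   araby.stripTashkeel(u'ذِئْبٌ') == u'ذئب'   # True: this analysis is kept
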
    def check_shadda(self, word_vocalised, resulted_data):
        """
        Filter the dictionary results against the entered word, to treat some
        normalization cases: the analyzer returns the words vocalized like the
        input. This function treats the shadda case.
        @param word_vocalised: the input word.
        @type word_vocalised: unicode.
        @param resulted_data: the results found in the dictionary.
        @type resulted_data: list of dict.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        #print word_vocalised.encode('utf8')
        filtred_data = []
        for item in resulted_data:
            if 'vocalized' in item and araby.shaddalike(word_vocalised, item['vocalized']):
                #item['tags'] += ':a'
                filtred_data.append(item)
        return filtred_data

    def check_partial_vocalized(self, word_vocalised, resulted_data):
        """
        If the entered word is partially or fully vocalized, the analyzer
        returns only the results vocalized like it. This function treats the
        partially vocalized case.
        @param word_vocalised: the input word.
        @type word_vocalised: unicode.
        @param resulted_data: the results found in the dictionary.
        @type resulted_data: list of dict.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        #print word_vocalised.encode('utf8')
        filtred_data = []
        if not araby.isVocalized(word_vocalised):
            return resulted_data
        else:
            # compare the vocalized output with the vocalized input
            #print ' is vocalized'
            for item in resulted_data:
                if 'vocalized' in item and araby.vocalizedlike(word_vocalised, item['vocalized']):
                    item['tags'] += ':v'
                    filtred_data.append(item)
            return filtred_data

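    # Compatibility sketch (illustrative; exact semantics belong to pyarabic):
    # a partially vocalized input keeps only the fully vocalized results that
    # do not contradict its harakat:
    #
    #   araby.vocalizedlike(u'كِتاب', u'كِتَابٌ')   # expected True: compatible
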
    def check_word_as_stopword(self, word):
        """
        Check if the word is a stopword.
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        if word in stopwords.STOPWORDS:
            detailed_result.append({
                'word': word,
                'procletic': stopwords.STOPWORDS[word]['procletic'],
                'encletic': stopwords.STOPWORDS[word]['encletic'],
                'prefix': '',
                'suffix': '',
                'stem': stopwords.STOPWORDS[word]['stem'],
                'original': stopwords.STOPWORDS[word]['original'],
                'vocalized': stopwords.STOPWORDS[word]['vocalized'],
                'tags': stopwords.STOPWORDS[word]['tags'],
                'type': 'STOPWORD',
                'root': '',
                'template': '',
            })
        return detailed_result

    def check_word_as_pounct(self, word):
        """
        Check if the word is a punctuation mark or a number.
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        if word.isnumeric():
            detailed_result.append({
                'word': word,
                'procletic': '',
                'encletic': '',
                'prefix': '',
                'suffix': '',
                'stem': '',
                'original': '',
                'vocalized': '',
                'tags': self.get_number_tags(word),
                'type': 'NUMBER',
                'root': '',
                'template': '',
            })
        if word in stem_pounct_const.POUNCTUATION:
            detailed_result.append({
                'word': word,
                'procletic': '',
                'encletic': '',
                'prefix': '',
                'suffix': '',
                'stem': '',
                'original': '',
                'vocalized': '',
                'tags': stem_pounct_const.POUNCTUATION[word]['tags'],
                'type': 'POUNCT',
                'root': '',
                'template': '',
            })
        return detailed_result

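    # Usage sketch (illustrative): numbers and listed punctuation marks each
    # yield one tagged entry; anything else yields an empty list:
    #
    #   analyzer.check_word_as_pounct(u'123')   # [{'type': 'NUMBER', ...}]
    #   analyzer.check_word_as_pounct(u'؟')     # [{'type': 'POUNCT', ...}], assuming ؟ is in POUNCTUATION
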
    def check_word_as_verb(self, verb):
        """
        Analyze the word as a verb.
        @param verb: the input word.
        @type verb: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = self.verbstemmer.stemming_verb(verb)
        return detailed_result

    def check_word_as_noun(self, noun):
        """
        Analyze the word as a noun.
        @param noun: the input word.
        @type noun: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = self.nounstemmer.stemming_noun(noun)
        return detailed_result

    def context_analyze(self, result):
        """
        Deprecated: analyze the context.
        @param result: analysis result.
        @type result: list of dict.
        @return: filtered result according to context.
        @rtype: list.
        """
        detailed_result = result
        return detailed_result

    def get_number_tags(self, word):
        """
        Check the numbers and return tags.
        @param word: the input word.
        @type word: unicode.
        @return: tags.
        @rtype: text.
        """
        return u"عدد"
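
# A minimal end-to-end sketch (added; not part of the original module, and it
# assumes the stemmers' data files are installed):
#
#   if __name__ == '__main__':
#       analyzer = analex()
#       for word_analyses in analyzer.check_text(u"ذهب الطالب إلى المدرسة"):
#           for analysis in word_analyses:
#               print analysis['word'].encode('utf8'), analysis['type']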