# -*- coding: utf-8 -*-
# Package: aranalex -- Module: analex
import re

import pyarabic.araby as araby

import stem_noun
import stem_verb
import stem_pounct_const
import core.stopwords as stopwords
import core.wordtag as wordtag
20
"""
Arabic text morphological analyzer.
Provides routines to analyze text.
Can treat text as verbs or as nouns.
"""
27
def __init__(self):
    """
    Create Analex instance.

    Builds the noun and verb stemmers, the word tagger, and the
    tokenization pattern used by tokenize().
    """
    self.nounstemmer = stem_noun.nounStemmer()  # stemmer for nouns
    self.verbstemmer = stem_verb.verbStemmer()  # stemmer for verbs
    self.tagger = wordtag.WordTagger()  # guesses whether a word can be a noun/verb
    self.debug = False  # if True, stemmers print internal analysis steps
    self.limit = 10000  # maximum number of words treated per text
    # Tashkeel (vocalization) marks must stay inside tokens, so they are
    # added to the word-character class of the tokenization pattern.
    marks = u"".join(araby.TASHKEEL)
    self.token_pat = re.compile(u"([\\w%s]+)" % marks, re.UNICODE)
    # if True, (partially) vocalized input is used to filter the analyses
    self.partial_vocalization_support = True
49
50
def text_treat(self, text):
    """
    Deprecated: pre-process text to eliminate punctuation.
    Currently a no-op that returns its input unchanged.
    @param text: input text.
    @type text: unicode.
    @return: treated text.
    @rtype: unicode.
    """
    return text
59
def tokenize(self, text=u""):
    """
    Tokenize text into words.
    @param text: the input text.
    @type text: unicode.
    @return: list of words.
    @rtype: list.
    """
    if not text:
        return []
    # split on the word pattern; the captured group keeps word tokens,
    # the remaining pieces hold separators (punctuation, spaces)
    pieces = self.token_pat.split(text)
    # strip whitespace inside every piece, then drop the empty ones
    # (single pass instead of the quadratic `while u'' in list: remove`)
    stripped = [re.sub(u"\\s", u"", piece) for piece in pieces]
    return [piece for piece in stripped if piece]
78
79
def text_tokenize(self, text):
    """
    Tokenize text into words, after treatment.
    @param text: the input text.
    @type text: unicode.
    @return: list of words.
    @rtype: list.
    """
    treated = self.text_treat(text)
    return self.tokenize(treated)
91
def set_debug(self, debug):
    """
    Set the debug attribute to allow printing internal analysis results.
    @param debug: the debug value.
    @type debug: True/False.
    """
    self.debug = debug
    # propagate the flag to the sub-stemmers
    self.nounstemmer.set_debug(debug)
    self.verbstemmer.set_debug(debug)
101
def set_limit(self, limit):
    """
    Set the number of words treated in a text.
    @param limit: the word number limit.
    @type limit: integer.
    """
    self.limit = limit
109
110
def check_text(self, text, mode='all'):
    """
    Analyze text morphologically.
    @param text: the input text.
    @type text: unicode.
    @param mode: the mode of analysis as 'verbs', 'nouns', or 'all'.
    @type mode: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    list_word = self.text_tokenize(text)
    resulted_data = []
    if mode == 'all':
        # cache per-word analyses so repeated words are analyzed once
        checked_words = {}
        for word in list_word[:self.limit]:
            if word not in checked_words:
                checked_words[word] = self.check_word(word)
            resulted_data.append(checked_words[word])
    elif mode == 'nouns':
        for word in list_word[:self.limit]:
            resulted_data.append(self.check_word_as_noun(word))
    elif mode == 'verbs':
        for word in list_word[:self.limit]:
            resulted_data.append(self.check_word_as_verb(word))
    return resulted_data
142
143
def check_text_as_nouns(self, text):
    """
    Analyze text morphologically, treating every word as a noun.
    @param text: the input text.
    @type text: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    return self.check_text(text, "nouns")
153
154
def check_text_as_verbs(self, text):
    """
    Analyze text morphologically, treating every word as a verb.
    @param text: the input text.
    @type text: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    return self.check_text(text, "verbs")
164
165
def check_word(self, word):
    """
    Analyze one word morphologically.
    @param word: the input word.
    @type word: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    word = araby.stripTatweel(word)
    word_vocalised = word
    word_nm = araby.stripTashkeel(word)
    resulted_data = []
    # punctuation and numbers
    resulted_data += self.check_word_as_pounct(word_nm)
    # stopwords
    resulted_data += self.check_word_as_stopword(word_nm)
    # try verb/noun stemming only when nothing matched above
    if not resulted_data:
        if self.tagger.is_verb(word_nm):
            resulted_data += self.check_word_as_verb(word_nm)
        if self.tagger.is_noun(word_nm):
            resulted_data += self.check_word_as_noun(word_nm)
    # keep only analyses whose unvocalized form matches the input word
    resulted_data = self.check_normalized(word_vocalised, resulted_data)
    # keep only analyses compatible with the input shadda positions
    resulted_data = self.check_shadda(word_vocalised, resulted_data)
    # if the input carries (partial) vocalization, use it as a filter
    if self.partial_vocalization_support:
        resulted_data = self.check_partial_vocalized(word_vocalised, resulted_data)
    # fall back to an 'unknown' entry so the caller always gets a result
    if not resulted_data:
        resulted_data.append({
            'word': word,
            'procletic': '',
            'encletic': '',
            'prefix': '',
            'suffix': '',
            'stem': '',
            'original': '',
            'vocalized': '',
            'tags': u'',
            'type': 'unknown',
            'root': '',
            'template': '',
        })
    return resulted_data
226
227
def check_normalized(self, word_vocalised, resulted_data):
    """
    Filter out analyses whose unvocalized (normalized) form differs
    from the input word.

    When the entered word matches a dictionary entry only after letter
    normalization, the analyzer may return several vocalized candidates;
    e.g. for ذئب, the normalized form ذءب can yield both ذئب and ذؤب from
    the dictionary.  This function keeps only the candidates whose
    unvocalized form equals the given word (ذئب).
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    filtred_data = []
    inputword = araby.stripTashkeel(word_vocalised)
    for item in resulted_data:
        # items without a 'vocalized' entry cannot be compared: drop them
        if 'vocalized' in item:
            outputword = araby.stripTashkeel(item['vocalized'])
            if inputword == outputword:
                filtred_data.append(item)
    return filtred_data
251
252
def check_shadda(self, word_vocalised, resulted_data):
    """
    Keep only the analyses whose shadda positions are compatible with
    the input word (treats the shadda case of normalized matching).
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    # items without a 'vocalized' entry cannot be compared: drop them
    return [item for item in resulted_data
            if 'vocalized' in item
            and araby.shaddalike(word_vocalised, item['vocalized'])]
272
273
def check_partial_vocalized(self, word_vocalised, resulted_data):
    """
    If the entered word is fully or partially vocalized, keep only the
    analyses whose vocalization is compatible with the input.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    # an unvocalized input gives no information to filter on
    if not araby.isVocalized(word_vocalised):
        return resulted_data
    filtred_data = []
    for item in resulted_data:
        if 'vocalized' in item and araby.vocalizedlike(word_vocalised, item['vocalized']):
            # mark the analysis as confirmed by the input vocalization
            item['tags'] += ':v'
            filtred_data.append(item)
    return filtred_data
298
299
def check_word_as_stopword(self, word):
    """
    Check if the word is a stopword and return its stored analysis.
    @param word: the input word.
    @type word: unicode.
    @return: list of dictionaries of analyzed words with tags
        (empty if the word is not a stopword).
    @rtype: list.
    """
    detailed_result = []
    if word in stopwords.STOPWORDS:
        # hoist the entry to avoid repeated dictionary lookups
        entry = stopwords.STOPWORDS[word]
        detailed_result.append({
            'word': word,
            'procletic': entry['procletic'],
            'encletic': entry['encletic'],
            'prefix': '',
            'suffix': '',
            'stem': entry['stem'],
            'original': entry['original'],
            'vocalized': entry['vocalized'],
            'tags': entry['tags'],
            'type': 'STOPWORD',
            'root': '',
            'template': '',
        })
    return detailed_result
325
326
327
def check_word_as_pounct(self, word):
    """
    Check if the word is punctuation or a number.
    @param word: the input word.
    @type word: unicode.
    @return: list of dictionaries of analyzed words with tags
        (empty if the word is neither).
    @rtype: list.
    """
    detailed_result = []
    if word.isnumeric():
        detailed_result.append({
            'word': word,
            'procletic': '',
            'encletic': '',
            'prefix': '',
            'suffix': '',
            'stem': '',
            'original': '',
            'vocalized': '',
            'tags': self.get_number_tags(word),
            'type': 'NUMBER',
            'root': '',
            'template': '',
        })
    if word in stem_pounct_const.POUNCTUATION:
        detailed_result.append({
            'word': word,
            'procletic': '',
            'encletic': '',
            'prefix': '',
            'suffix': '',
            'stem': '',
            'original': '',
            'vocalized': '',
            'tags': stem_pounct_const.POUNCTUATION[word]['tags'],
            'type': 'POUNCT',
            'root': '',
            'template': '',
        })
    return detailed_result
369
370
def check_word_as_verb(self, verb):
    """
    Analyze the word as a verb.
    @param verb: the input word.
    @type verb: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    return self.verbstemmer.stemming_verb(verb)
381
382
def check_word_as_noun(self, noun):
    """
    Analyze the word as a noun.
    @param noun: the input word.
    @type noun: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    return self.nounstemmer.stemming_noun(noun)
394
395
def context_analyze(self, result):
    """
    Deprecated: analyze the context.
    Currently a no-op returning the analysis result unchanged.
    @param result: analysis result.
    @type result: list of dict.
    @return: filtered result according to context.
    @rtype: list.
    """
    return result
415