Package aranalex :: Module stem_noun
[hide private]
[frames | no frames]

Source Code for Module aranalex.stem_noun

#!/usr/bin/python
# -*- coding=utf-8 -*-
#-------------------------------------------------------------------------------
# Name:        stem_noun
# Purpose:     Arabic lexical analyser; provides features for stemming an Arabic word as a noun
#
# Author:      Taha Zerrouki (taha.zerrouki[at]gmail.com)
#
# Created:     31-10-2011
# Copyright:   (c) Taha Zerrouki 2011
# Licence:     GPL
#-------------------------------------------------------------------------------

import re
import pyarabic.araby as araby
import tashaphyne.stemming
import tashaphyne.normalize
import stem_noun_const 
import arabicdictionary 
#import dictionaries.noun_dictionary  as noun_dictionary
#Todo:  remove all individual constants of arabic letters, Done
# Field names of a noun dictionary entry, listed in index order.
_NOUN_DICTIONARY_FIELDS=(
        u'id',
        u'vocalized',
        u'unvocalized',
        u'wordtype',
        u'root',
        u'normalized',
        u'stamped',
        u'original',
        u'mankous',
        u'feminable',
        u'number',
        u'dualable',
        u'masculin_plural',
        u'feminin_plural',
        u'broken_plural',
        u'mamnou3_sarf',
        u'relative',
        u'w_suffix',
        u'hm_suffix',
        u'kal_prefix',
        u'ha_suffix',
        u'k_suffix',
        u'annex',
        u'definition',
        u'note',
)
# Maps every field name to its integer index (0 for u'id' up to 24 for u'note').
# nounStemmer.__init__ hands this mapping to arabicdictionary.arabicDictionary.
NOUN_DICTIONARY_INDEX=dict((field_name,position) for position,field_name in enumerate(_NOUN_DICTIONARY_FIELDS))

class nounStemmer:
        """
        Arabic noun stemmer
        """

        def __init__(self, debug=False):
                """
                Build the two affix stemmers, open the noun dictionary and create
                the internal lookup tables.
                @param debug: when True, the analysis methods print internal results.
                @type debug: True/False.
                """
                # first level stemmer: splits off enclitics and procletics,
                # configured with the COMP_* constants
                self.compStemmer=tashaphyne.stemming.ArabicLightStemmer()
                self.compStemmer.set_infix_letters(stem_noun_const.COMP_INFIX_LETTERS)
                self.compStemmer.set_prefix_letters(stem_noun_const.COMP_PREFIX_LETTERS)
                self.compStemmer.set_suffix_letters(stem_noun_const.COMP_SUFFIX_LETTERS)
                self.compStemmer.set_max_prefix_length(stem_noun_const.COMP_MAX_PREFIX)
                self.compStemmer.set_max_suffix_length(stem_noun_const.COMP_MAX_SUFFIX)
                self.compStemmer.set_min_stem_length(stem_noun_const.COMP_MIN_STEM)
                self.compStemmer.set_prefix_list(stem_noun_const.COMP_PREFIX_LIST)
                self.compStemmer.set_suffix_list(stem_noun_const.COMP_SUFFIX_LIST)

                # second level stemmer: splits off the conjugation affixes of the
                # noun, configured with the CONJ_* constants
                self.conjStemmer=tashaphyne.stemming.ArabicLightStemmer()
                self.conjStemmer.set_infix_letters(stem_noun_const.CONJ_INFIX_LETTERS)
                self.conjStemmer.set_prefix_letters(stem_noun_const.CONJ_PREFIX_LETTERS)
                self.conjStemmer.set_suffix_letters(stem_noun_const.CONJ_SUFFIX_LETTERS)
                self.conjStemmer.set_max_prefix_length(stem_noun_const.CONJ_MAX_PREFIX)
                self.conjStemmer.set_max_suffix_length(stem_noun_const.CONJ_MAX_SUFFIX)
                self.conjStemmer.set_min_stem_length(stem_noun_const.CONJ_MIN_STEM)
                self.conjStemmer.set_prefix_list(stem_noun_const.CONJ_PREFIX_LIST)
                self.conjStemmer.set_suffix_list(stem_noun_const.CONJ_SUFFIX_LIST)

                # dictionary of nouns, looked up by steming_second_level
                self.nounDictionary=arabicdictionary.arabicDictionary("nouns", NOUN_DICTIONARY_INDEX)

                self.Table_affix_INDEX={}
                self.NOUN_DICTIONARY_STAMP={}
                # index read by find_broken_plural and filled by
                # create_index_broken_plural; it starts empty so that both methods
                # find the attribute even if the index was never built
                self.BROKENPLURAL_DICTIONARY_INDEX={}
                # allow to print internal results.
                self.debug=debug

        def stemming_noun(self,noun):
                """
                Analyze word morphologically as noun
                @param noun: the input noun.
                @type noun: unicode.
                @return: list of dictionaries of analyzed words with tags.
                @rtype: list.
                """     
                list_found=[];
                detailed_result=[];
                display_conj_result=False;
                noun=noun.strip();
                noun_list=[noun];
                if noun.find(araby.ALEF_MADDA)>=0:
                        noun_list.append(noun.replace(araby.ALEF_MADDA, araby.ALEF_HAMZA_ABOVE+araby.ALEF_HAMZA_ABOVE))
#                       noun_list.append(HAMZA+ALEF+noun[1:])

                for noun in noun_list:

                        list_seg_comp=self.compStemmer.segment(noun);
                        list_seg_comp=self.verify_affix(noun,list_seg_comp,stem_noun_const.COMP_NOUN_AFFIXES);



                        for seg in list_seg_comp:
                                procletic=noun[:seg[0]];
                                stem=noun[seg[0]:seg[1]]
                                encletic=noun[seg[1]:]
                                secondsuffix=u'';

                                proaffix=u'-'.join([procletic,encletic])
                                if self.debug: print "\t", "-".join([procletic,stem,encletic]).encode("utf8") ;


                                # ajusting nouns variant
                                list_stem=[stem];
                                if encletic!="":
                                        annexing=True;
                                        if stem.endswith(araby.YEH):
                                                list_stem.append(stem+araby.NOON);
                                        elif stem.endswith(araby.WAW):
                                                list_stem.append(stem+araby.NOON);
                                        elif stem.endswith(araby.ALEF):
                                                list_stem.append(stem[:-1]+araby.ALEF_MAKSURA);
                                        elif stem.endswith(araby.TEH):
                                                list_stem.append(stem[:-1]+araby.TEH_MARBUTA);

                                else: annexing=False;

                # stem reduced noun : level two
                                result=[];
                                for stem in list_stem:
                                        result+=self.steming_second_level(noun,stem,procletic,encletic);
                                if self.debug:print noun2.encode("utf8")+"\t"+str(len(result))+'\t['+(u'\t'.join(result)).encode("utf8")+"]";
                                detailed_result+=result;
##              for detailed in detailed_result:
##                      for key in detailed.keys():
##                              print key,detailed[key].encode('utf8'),
##
##                      print len(list_found)-len(detailed_result) ;
                return detailed_result#list_found;


        def steming_second_level(self,noun,noun2,procletic,encletic):
                """
                Analyze word morphologically by stemming the conjugation affixes.
                @param noun: the input noun.
                @type noun: unicode.
                @param noun2: the noun stemed from syntaxic affixes.
                @type noun2: unicode.
                @param procletic: the syntaxic prefixe extracted in the fisrt stage.
                @type procletic: unicode.
                @param encletic: the syntaxic suffixe extracted in the fisrt stage.
                @type encletic: unicode.
                @return: list of dictionaries of analyzed words with tags.
                @rtype: list.
                """     
                detailed_result=[];
                #segment the coinjugated verb
                list_seg_conj=self.conjStemmer.segment(noun2);
                # verify affix compatibility
                list_seg_conj=self.verify_affix(noun2,list_seg_conj,stem_noun_const.NOMINAL_CONJUGATION_AFFIX);


                # add vocalized forms of suffixes
                list_seg_conj_voc=[];
                for seg_conj in list_seg_conj:

                        prefix_conj=noun2[:seg_conj[0]];
                        stem_conj=noun2[seg_conj[0]:seg_conj[1]]
                        suffix_conj=noun2[seg_conj[1]:]
                        affix_conj=prefix_conj+'-'+suffix_conj;
                        # get all vocalized form of suffixes
                        for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['vocalized']:
                                seg_conj_voc={'prefix':'','suffix':vocalized_suffix,'stem':stem_conj}
                                # verify compatibility between procletics and afix
                                if (self.is_compatible_proaffix_affix(procletic, encletic, vocalized_suffix)):
                                # verify the existing of a noun stamp in the dictionary
                                # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                                        # list_seg_conj2.append(seg_conj)
                                        list_seg_conj_voc.append(seg_conj_voc)

                list_seg_conj=list_seg_conj_voc;
                for seg_conj in list_seg_conj:
                        prefix_conj=seg_conj['prefix'];
                        stem_conj=seg_conj['stem']
                        suffix_conj=seg_conj['suffix']
                        has_plural_suffix=((u"جمع" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']) or( u"مثنى" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']))
                        #print "has_plural", has_plural_suffix;
                        affix_conj='-'.join([prefix_conj,suffix_conj])
                        # noirmalize hamza before gessing  deffirents origines
                        stem_conj=tashaphyne.normalize.normalize_hamza(stem_conj)
                        if self.debug:
                                print "*\t", "-".join([str(len(stem_conj)),prefix_conj,stem_conj,suffix_conj]).encode("utf8") ;
                        # generate possible stems
                        # add stripped letters to the stem to constitute possible noun list
                        possible_noun_list=self.getStemVariants(stem_conj,prefix_conj,suffix_conj);
                        if self.debug:
                                print "\tpossible original nouns:  ","\t".join(possible_noun_list).encode('utf8');

                        # search the noun in the dictionary
                        # we can return the tashkeel
                        infnoun_form_list=[];
                        for infnoun in possible_noun_list:
                                # get the noun and get all its forms from the dict
                                # if the noun has plural suffix, don't look up in broken plural dictionary
                                infnoun_foundL=self.nounDictionary.lookup(infnoun);
                                #infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
##                                                      listsingle=self.find_broken_plural(infnoun);
##                                                      print ' *****','-'.join(listsingle).encode('utf8')
                                if len(infnoun_foundL)>0:
                                        if self.debug: print "\t in dict",infnoun.encode('utf8');
                                else:
                                        if self.debug: print infnoun.encode('utf8'),"not found in dictionary"
                                infnoun_form_list+=infnoun_foundL;

                        for id in infnoun_form_list:
                                noun_tuple=self.nounDictionary.getEntryById(id);
                                infnoun=noun_tuple['vocalized'];
                                noun_tags=()
                                original=noun_tuple['original'];
                                wordtype=noun_tuple['wordtype'];                                
                                detailed_result.append({
                                'word':noun,
                                'procletic':procletic,
                                'encletic':encletic,
                                'prefix':prefix_conj,
                                'suffix':suffix_conj,
                                'stem':stem_conj,
                                'original':original,
                                'vocalized':self.vocalize(infnoun,procletic,prefix_conj,suffix_conj,encletic),
                                'tags':u':'.join(noun_tags+stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags']+stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags']+stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']),
                                'type':u':'.join(['Noun',wordtype]),#'Noun',
                                'root':'',
                                'template':'',
                                });
                return detailed_result;

        def find_broken_plural(self,broken, VocalisedEntree=False):
                """
                Look up for the broken plural in dictionary.
                The word is normalized (hamza normalization) and searched in
                self.BROKENPLURAL_DICTIONARY_INDEX; when that index does not exist,
                nothing is found.
                @param broken: the input word.
                @type broken: unicode.
                @param VocalisedEntree: the entry is vocalized or not.
                @type VocalisedEntree: Boolean.
                @return: list of (vocalized plural, single) pairs.
                @rtype: list.
                """
                if VocalisedEntree:
                        # NOTE(review): ar_strip_marks_keepshadda is neither defined nor
                        # imported in this module; confirm where it should come from.
                        noun_nm=ar_strip_marks_keepshadda(broken)
                else:
                        noun_nm=broken
                # qualified name: only the tashaphyne.normalize module is imported here
                normalized=tashaphyne.normalize.normalize_hamza(noun_nm)
                # the index may not have been created on this instance
                index=getattr(self,'BROKENPLURAL_DICTIONARY_INDEX',{})
                # NOTE(review): BrokenPluralTable is not defined in this module;
                # confirm which dictionary module should provide it.
                return [(vocalized,BrokenPluralTable[vocalized]['single']) for vocalized in index.get(normalized,())]


        def verify_affix(self,word,list_seg,affix_list):
                """
                Filter the segments, keeping those whose affix is in the given affixes list.
                The affix of a segment is the text before the stem and the text after
                the stem, joined by '-'.
                @param word: the input word.
                @type word: unicode.
                @param list_seg: list of word segments indexes (numbers).
                @type list_seg: list of pairs.
                @param affix_list: the accepted affixes, written as prefix-suffix.
                @type affix_list: container of unicode.
                @return: set of accepted segments, without duplicates.
                @rtype: set of pairs.
                """
                def affix_of(seg):
                        # text before the stem, '-', text after the stem
                        return '-'.join((word[:seg[0]],word[seg[1]:]))
                # duplicated segments are examined only once
                unique_segments=set(list_seg)
                return set(seg for seg in unique_segments if affix_of(seg) in affix_list)


        def getStemVariants(self,stem,prefix,suffix):
                """
                Build the possible original nouns for a stem, according to its affixes.
                For example مدرستي=>مدرست+ي => مدرسة +ي.
                The stem itself is always one of the returned cases.
                @param stem: the input stem.
                @type stem: unicode.
                @param prefix: prefix.
                @type prefix: unicode.
                @param suffix: suffix.
                @type suffix: unicode.
                @return: the stem variants.
                @rtype: set of unicode.
                """
                # the first case is the stem without changes
                variants=set([stem])
                # the affixes are compared without their marks; no rule below
                # depends on the prefix
                prefix=araby.stripTashkeel(prefix)
                bare_suffix=araby.stripTashkeel(suffix)
                teh_marbuta_suffixes=(araby.ALEF+araby.TEH, araby.YEH+araby.TEH_MARBUTA,araby.YEH, araby.YEH+araby.ALEF+araby.TEH)
                yeh_suffixes=("", araby.YEH+araby.NOON, araby.WAW+araby.NOON)
                # suffixes after which a Teh Marbuta is appended to the stem
                if bare_suffix in teh_marbuta_suffixes:
                        variants.add(stem+araby.TEH_MARBUTA)
                # no suffix, or Yeh-Noon / Waw-Noon: a Yeh is appended to the stem
                if bare_suffix in yeh_suffixes:
                        variants.add(stem+araby.YEH)
                # a stem ending with Yeh is also tried with Alef Maksura instead
                if stem.endswith(araby.YEH):
                        variants.add(stem[:-1]+araby.ALEF_MAKSURA)
                #to be validated
                return variants


        def is_compatible_proaffix_affix(self,procletic, encletic, suffix):
                """
                Tell whether the proaffixes (syntactic affixes) can be combined with
                the conjugation suffix, by comparing their tags.
                @param procletic: first level prefix.
                @type procletic: unicode.
                @param encletic: first level suffix.
                @type encletic: unicode.
                @param suffix: second level suffix.
                @type suffix: unicode.
                @return: compatible.
                @rtype: True/False.
                """
                # without any proaffix there is nothing to conflict with
                if (procletic,encletic)==(u'',u''):
                        return True
                procletic_tags=stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags']
                encletic_tags=stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags']
                suffix_tags=stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']

                procletic_has_definition=u"تعريف" in procletic_tags
                procletic_has_jar=u"جر" in procletic_tags
                encletic_is_annexed=u"مضاف" in encletic_tags
                suffix_has_tanwin=u"تنوين" in suffix_tags
                # each entry is True when the combination is rejected
                conflicts=(
                        procletic_has_definition and u"مضاف" in suffix_tags and u'منسوب' not in suffix_tags,
                        procletic_has_definition and suffix_has_tanwin,
                        encletic_is_annexed and suffix_has_tanwin,
                        encletic_is_annexed and u"لايضاف" in suffix_tags,
                        procletic_has_jar and u"مجرور" not in suffix_tags,
                )
                return not any(conflicts)


        def create_index_broken_plural(self):
                """Deprecated: create index from the broken_plural dictionary
                to accelerate the search in the dictionary for broken_plural.
                Every key of BrokenPluralTable is stored in
                self.BROKENPLURAL_DICTIONARY_INDEX under its unvocalized,
                hamza-normalized form.
                """
                # create the index on first use: it may not exist on this instance
                index=getattr(self,'BROKENPLURAL_DICTIONARY_INDEX',None)
                if index is None:
                        index=self.BROKENPLURAL_DICTIONARY_INDEX={}
                # NOTE(review): BrokenPluralTable is not defined in this module;
                # confirm which dictionary module should provide it.
                for vocnoun in BrokenPluralTable:
                        unvnoun=araby.stripTashkeel(vocnoun)
                        # qualified name: only the tashaphyne.normalize module is imported here
                        normnoun=tashaphyne.normalize.normalize_hamza(unvnoun)
                        index.setdefault(normnoun,[]).append(vocnoun)


        def vocalize(self,noun, proclitic,prefix,suffix,enclitic):
                """
                Assemble the vocalized word from the dictionary noun and its affixes.
                @param noun: noun found in dictionary.
                @type noun: unicode.
                @param proclitic: first level prefix.
                @type proclitic: unicode.
                @param prefix: second level prefix.
                @type prefix: unicode.
                @param suffix: second level suffix.
                @type suffix: unicode.
                @param enclitic: first level suffix.
                @type enclitic: unicode.
                @return: vocalized word.
                @rtype: unicode.
                """
                # the first vocalized form listed for each clitic is used
                enclitic_voc=stem_noun_const.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
                proclitic_voc=stem_noun_const.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]

                # drop the last mark of the noun if it is a haraka or a tanwin
                if noun[-1:] in araby.HARAKAT:
                        noun=noun[:-1]
                # after the definition article, a sun letter takes a shadda
                # and the lam of the article loses its sukun
                has_definition_article=proclitic.endswith(araby.ALEF+araby.LAM)
                if has_definition_article and araby.isSun(noun[0]):
                        noun=u''.join([noun[0],araby.SHADDA,noun[1:]])
                        if proclitic_voc.endswith(araby.SUKUN):
                                proclitic_voc=proclitic_voc[:-1]
                # adapt the noun ending to the suffix, then to the enclitic
                noun=self.getWordVariant(self.getWordVariant(noun,suffix),enclitic)
                # adapt the suffix to the adapted noun and to the enclitic
                suffix_voc=self.getSuffixVariant(noun,suffix,enclitic)
                return ''.join([proclitic_voc,prefix,noun,suffix_voc,enclitic_voc])


        def getSuffixVariant(self,word, suffix,enclitic):
                """
                Adapt the suffix to the word it follows and to the enclitic after it.
                For example: word = مدرس, suffix=ة, enclitic=ي. The suffix is converted to Teh.
                @param word: word found in dictionary.
                @type word: unicode.
                @param suffix: second level suffix.
                @type suffix: unicode.
                @param enclitic: first level suffix.
                @type enclitic: unicode.
                @return: variant of suffix.
                @rtype: unicode.
                """
                bare_enclitic=araby.stripTashkeel(enclitic)
                if len(bare_enclitic)>0:
                        # before an enclitic, Teh Marbuta is written as Teh
                        if araby.TEH_MARBUTA in suffix:
                                suffix=suffix.replace(araby.TEH_MARBUTA,araby.TEH)
                elif word[-1:] in (araby.ALEF_MAKSURA, araby.YEH,araby.ALEF) and suffix in araby.HARAKAT:
                        # without enclitic, a suffix found in HARAKAT is dropped
                        # after Alef Maksura, Yeh or Alef
                        suffix=u""
                return suffix


        def getWordVariant(self, word,suffix):
                """
                Adapt the ending of the word to the suffix that follows it.
                For example: word = مدرسة, suffix=ها. The word is converted to مدرست.
                With suffix=ات the Teh Marbuta is dropped and the word becomes مدرس.
                @param word: word found in dictionary.
                @type word: unicode.
                @param suffix: suffix (first or second level).
                @type suffix: unicode.
                @return: variant of word.
                @rtype: unicode.
                """
                bare_suffix=araby.stripTashkeel(suffix)
                # a final haraka of the word is always dropped
                base=word
                if base[-1:] in araby.HARAKAT:
                        base=base[:-1]
                # nothing else changes when the suffix holds no letter
                if bare_suffix==u"":
                        return base

                if base.endswith(araby.TEH_MARBUTA):
                        # these suffixes replace the Teh Marbuta, the others open it into Teh
                        if bare_suffix in (araby.ALEF+araby.TEH, araby.YEH+araby.TEH_MARBUTA,araby.YEH, araby.YEH+araby.ALEF+araby.TEH):
                                return base[:-1]
                        return base[:-1]+araby.TEH
                if base.endswith(araby.ALEF_MAKSURA):
                        return base[:-1]+araby.YEH
                if base.endswith(araby.HAMZA):
                        # the seat of the hamza follows the first mark of the vocalized suffix
                        if suffix.startswith(araby.DAMMA):
                                return base[:-1]+araby.WAW_HAMZA
                        if suffix.startswith(araby.KASRA):
                                return base[:-1]+araby.YEH_HAMZA
                return base
                
        def set_debug(self,debug):
                """
                Set the debug attribute to allow printing internal analysis results.
                @param debug: the debug value.
                @type debug: True/False.
                """
                self.debug=debug;