tashaphyne.normalize

1 #!/usr/bin/python 2 # -*- coding=utf-8 -*- 3 """ 4 Utility functions used to prepare Arabic text for searching and indexing. 5 """ 6 import re 7 import tashaphyne.arabic_const as arabconst 8 9 10 ###################################################################### 11 #{ Individual Functions 12 ###################################################################### 13 14 #--------------------------------------

def strip_tashkeel(text):
    """Remove vowel marks (tashkeel) from a text and return the result.

    Every match of C{arabconst.HARAKAT_PAT} is replaced by the empty
    string. The pattern itself is defined in the constants module; per the
    original documentation of this function the stripped marks are:
        - FATHA, DAMMA, KASRA
        - SUKUN
        - SHADDA
        - FATHATAN, DAMMATAN, KASRATAN

    Example:
        >>> text = u"الْعَرَبِيّةُ"
        >>> print(strip_tashkeel(text))
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text with the vowel marks removed.
    @rtype: unicode.
    """
    harakat = arabconst.HARAKAT_PAT
    return harakat.sub('', text)

33 34 35 #strip tatweel from a text and return a result text 36 #--------------------------------------

def strip_tatweel(text):
    """Remove tatweel (elongation) characters from a text.

    Every character contained in C{arabconst.TATWEEL} is deleted from the
    input.

    Example:
        >>> text = u"العـــــربية"
        >>> print(strip_tatweel(text))
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text with tatweel removed.
    @rtype: unicode.
    """
    # The pattern is written with a plain u'' prefix: the former ur''
    # prefix is rejected by Python 3, and since the pattern contains no
    # backslash the raw flag made no difference on Python 2.
    tatweel_class = u'[%s]' % arabconst.TATWEEL
    return re.sub(tatweel_class, '', text)

52 53 54 #--------------------------------------

def normalize_hamza(text):
    """Unify the various Hamza forms and return the converted text.

    Two substitutions are applied, in this order:
        1. every match of C{arabconst.ALEFAT_PAT} becomes C{arabconst.ALEF};
        2. every match of C{arabconst.HAMZAT_PAT} becomes C{arabconst.HAMZA}.

    Per the original documentation of this function, the letters turned
    into HAMZA are WAW_HAMZA and YEH_HAMZA, and the letters turned into
    ALEF are ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, HAMZA_ABOVE
    and HAMZA_BELOW (the exact sets are defined by the two patterns in the
    constants module).

    Example:
        >>> text = u"أهؤلاء من أولئكُ"
        >>> print(normalize_hamza(text))
        اهءلاء من اولءكُ

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    alef_unified = arabconst.ALEFAT_PAT.sub(arabconst.ALEF, text)
    return arabconst.HAMZAT_PAT.sub(arabconst.HAMZA, alef_unified)

73 74 #--------------------------------------

def normalize_lamalef(text):
    """Expand Lam-Alef ligatures into the two letters LAM and ALEF.

    Some systems encode the Lam-Alef ligature as a single character; every
    match of C{arabconst.LAMALEFAT_PAT} is replaced by C{arabconst.LAM}
    followed by C{arabconst.ALEF}.

    Per the original documentation of this function, the converted
    ligatures are LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW and
    LAM_ALEF_MADDA_ABOVE (the exact set is defined by the pattern in the
    constants module).

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    two_letters = u''.join([arabconst.LAM, arabconst.ALEF])
    return arabconst.LAMALEFAT_PAT.sub(two_letters, text)

94 95 #--------------------------------------

def normalize_spellerrors(text):
    """Normalize two common spelling confusions and return the result.

    Users often do not distinguish TEH_MARBUTA from HEH, nor ALEF_MAKSURA
    from YEH. To make such spellings compare equal, this function applies:
        - TEH_MARBUTA into HEH
        - ALEF_MAKSURA into YEH

    Example:
        >>> text = u"اشترت سلمى دمية وحلوى"
        >>> print(normalize_spellerrors(text))
        اشترت سلمي دميه وحلوي

    @param text: arabic text.
    @type text: unicode.
    @return: the converted text.
    @rtype: unicode.
    """
    # Patterns use a plain u'' prefix: the former ur'' prefix is a syntax
    # error on Python 3, and with no backslash in the pattern the raw flag
    # had no effect on Python 2.
    text = re.sub(u'[%s]' % arabconst.TEH_MARBUTA, arabconst.HEH, text)
    return re.sub(u'[%s]' % arabconst.ALEF_MAKSURA, arabconst.YEH, text)

116 117 ###################################################################### 118 #{ Normalize One Function 119 ###################################################################### 120

def normalize_searchtext(text):
    """Run the full search normalization on a text and return the result.

    The individual normalizers are applied one after another, each one
    receiving the output of the previous one, in this order:
        1. strip tashkeel
        2. strip tatweel
        3. normalize Lam Alef
        4. normalize Hamza
        5. normalize Teh Marbuta and Alef Maksura

    Example:
        >>> text = u'أستشتري دمـــى آلية لأبنائك قبل الإغلاق'
        >>> print(normalize_searchtext(text))
        استشتري دمي اليه لابناءك قبل الاغلاق

    @param text: arabic text.
    @type text: unicode.
    @return: the normalized text.
    @rtype: unicode.
    """
    # Order matters: ligatures are expanded before the Hamza pass runs.
    steps = (
        strip_tashkeel,
        strip_tatweel,
        normalize_lamalef,
        normalize_hamza,
        normalize_spellerrors,
    )
    for step in steps:
        text = step(text)
    return text

145

Source Code for Module tashaphyne.normalize