Package tashaphyne ::
Module normalize
|
|
1
2
3 """
4 Utility functions used to prepare an Arabic text for searching and indexing.
5 """
6 import re
7 import tashaphyne.arabic_const as arabconst
8
9
10
11
12
13
14
16 """Strip vowel marks (tashkeel) from a text and return the resulting text.
17 The stripped marks are:
18 - FATHA, DAMMA, KASRA
19 - SUKUN
20 - SHADDA
21 - FATHATAN, DAMMATAN, KASRATAN.
22 Example:
23 >>> text=u"الْعَرَبِيّةُ"
24 >>> strip_tashkeel(text)
25 العربية
26
27 @param text: arabic text.
28 @type text: unicode.
29 @return: the stripped text.
30 @rtype: unicode.
31 """
32 return arabconst.HARAKAT_PAT.sub('', text)
33
34
35
36
38 """
39 Strip tatweel from a text and return a result text.
40
41 Example:
42 >>> text=u"العـــــربية"
43 >>> strip_tatweel(text)
44 العربية
45
46 @param text: arabic text.
47 @type text: unicode.
48 @return: the stripped text.
49 @rtype: unicode.
50 """
51 return re.sub(ur'[%s]' % arabconst.TATWEEL, '', text)
52
53
54
56 """Normalize Hamza forms into one form, and return the resulting text.
57 The converted letters are:
58 - Letters converted into HAMZA: WAW_HAMZA, YEH_HAMZA
59 - Letters converted into ALEF: ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, HAMZA_ABOVE, HAMZA_BELOW
60
61 Example:
62 >>> text=u"أهؤلاء من أولئكُ"
63 >>> normalize_hamza(text)
64 اهءلاء من اولءكُ
65
66 @param text: arabic text.
67 @type text: unicode.
68 @return: return a converted text.
69 @rtype: unicode.
70 """
71 text = arabconst.ALEFAT_PAT.sub(arabconst.ALEF, text)
72 return arabconst.HAMZAT_PAT.sub(arabconst.HAMZA, text)
73
74
76 """Normalize Lam Alef ligatures into two letters (LAM and ALEF),
77 and return the resulting text.
78 Some systems represent the Lam Alef ligature as a single letter; this function converts it into two letters.
79 The letters converted into LAM and ALEF are:
80 - LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE
81
82 Example:
83 >>> text=u"لانها لالئ الاسلام"
84 >>> normalize_lamalef(text)
85 لانها لالئ الاسلام
86
87 @param text: arabic text.
88 @type text: unicode.
89 @return: return a converted text.
90 @rtype: unicode.
91 """
92 return arabconst.LAMALEFAT_PAT.sub(\
93 u'%s%s'%(arabconst.LAM, arabconst.ALEF), text)
94
95
97 """Normalize some spelling errors, such as
98 TEH_MARBUTA into HEH and ALEF_MAKSURA into YEH, and return the resulting text.
99 In some contexts users ignore the difference between TEH_MARBUTA and HEH, and between ALEF_MAKSURA and YEH.
100 The conversions are:
101 - TEH_MARBUTA into HEH
102 - ALEF_MAKSURA into YEH
103
104 Example:
105 >>> text=u"اشترت سلمى دمية وحلوى"
106 >>> normalize_spellerrors(text)
107 اشترت سلمي دميه وحلوي
108
109 @param text: arabic text.
110 @type text: unicode.
111 @return: return a converted text.
112 @rtype: unicode.
113 """
114 text = re.sub(ur'[%s]' % arabconst.TEH_MARBUTA, arabconst.HEH, text)
115 return re.sub(ur'[%s]' % arabconst.ALEF_MAKSURA, arabconst.YEH, text)
116
117
118
119
120
122 """Normalize input text and return a result text.
123 Normalize a text by :
124 - strip tashkeel
125 - strip tatweel
126 - normalize Hamza
127 - normalize Lam Alef.
128 - normalize Teh Marbuta and Alef Maksura
129 Example:
130 >>> text=u'أستشتري دمـــى آلية لأبنائك قبل الإغلاق'
131 >>> normalize_searchtext(text)
132 استشتري دمي اليه لابناءك قبل الاغلاق
133
134 @param text: arabic text.
135 @type text: unicode.
136 @return: return a normalized text.
137 @rtype: unicode.
138 """
139 text = strip_tashkeel(text)
140 text = strip_tatweel(text)
141 text = normalize_lamalef(text)
142 text = normalize_hamza(text)
143 text = normalize_spellerrors(text)
144 return text
145