1 """
2 This module parses the text layer of a KAF or NAF file
3 """
4 from lxml import etree
5
6
8 """
9 This class represents a single token (NAF/KAF wf object)
10 """
def __init__(self, node=None, type='NAF'):
    """
    Wraps an existing C{wf} element, or creates a fresh one when none is given
    @type node: xml Element or None (to create an empty one)
    @param node: the element to wrap. If it is None a new C{wf} element is created
    @type type: string
    @param type: the type of the object (KAF or NAF)
    """
    # 'type' shadows the builtin, but it is part of the public signature
    self.node = etree.Element('wf') if node is None else node
    self.type = type
25
27 """
28 Returns the node of the element
29 @rtype: xml Element
30 @return: the node of the element
31 """
32 return self.node
33
35 """
36 Set the identifier for the token
37 @type this_id: string
38 @param this_id: the identifier
39 """
40 if self.type == 'NAF':
41 return self.node.set('id',this_id)
42 elif self.type == 'KAF':
43 return self.node.set('wid',this_id)
44
46 """
47 Returns the token identifier
48 @rtype: string
49 @return: the token identifier
50 """
51 if self.type == 'NAF':
52 return self.node.get('id')
53 elif self.type == 'KAF':
54 return self.node.get('wid')
55
def set_text(self, this_text):
    """
    Stores the given text in the token element, wrapped as a CDATA section
    @type this_text: string
    @param this_text: the text
    """
    cdata_text = etree.CDATA(this_text)
    self.node.text = cdata_text
63
65 """
66 Returns the text of the token
67 @rtype: string
68 @return: text of the token
69 """
70 return self.node.text
71
73 """
74 Set the sentence for the token
75 @type this_sent: string
76 @param this_sent: the sentence identifier
77 """
78 self.node.set('sent',this_sent)
79
81 """
82 Returns the sentence of the token
83 @rtype: string
84 @return: sentence of the token
85 """
86 return self.node.get('sent')
87
89 """
90 Returns the offset of the token
91 @rtype: string
92 @return: the offset
93 """
94 return self.node.get('offset')
95
96
98 """
99 Set the offset for the token
100 @type offset: string
101 @param offset: the offset
102 """
103 self.node.set('offset',offset)
104
106 """
107 Returns the length of the token
108 @rtype: string
109 @return: the length
110 """
111 return self.node.get('length')
112
113
115 """
116 Set the length for the token
117 @type length: string
118 @param length: the length
119 """
120 self.node.set('length',length)
121
123 """
124 Returns the paragraph for the token
125 @rtype: string
126 @return: the paragraph identifier
127 """
128 return self.node.get('para')
129
131 """
132 Set the paragraph for the token
133 @type p: string
134 @param p: the paragraph identifier
135 """
136 self.node.set('para',p)
137
138
140 """
141 This class encapsulates the text layer
142 """
def __init__(self, node=None, type='NAF'):
    """
    Constructor of the object. Besides wrapping the element, it builds C{self.idx},
    a dictionary from token identifier to C{wf} element for the tokens already present
    @type node: xml Element or None (to create an empty one)
    @param node: this is the node of the element. If it is None a new C{text} element is created
    @type type: string
    @param type: the type of the object (KAF or NAF)
    """
    self.idx = {}
    self.type = type
    self.node = etree.Element('text') if node is None else node

    # The identifier attribute depends only on the layer type, so it is resolved once
    # instead of per token. Anything that is not KAF falls back to the NAF attribute,
    # which avoids an unbound local when an unexpected type is given
    label_id = 'wid' if self.type == 'KAF' else 'id'
    for wf_node in self.__get_wf_nodes():
        self.idx[wf_node.get(label_id)] = wf_node
161
def get_node(self):
    """
    Gives access to the xml element wrapped by this object
    @rtype: xml Element
    @return: the node of the element
    """
    element = self.node
    return element
169
171 """
172 Converts the object to KAF (if it is NAF)
173 """
174 if self.type == 'NAF':
175 self.type = 'KAF'
176 for node in self.__get_wf_nodes():
177 node.set('wid',node.get('id'))
178 del node.attrib['id']
179
181 """
182 Converts the object to NAF (if it is KAF)
183 """
184 if self.type == 'KAF':
185 self.type = 'NAF'
186 for node in self.__get_wf_nodes():
187 node.set('id',node.get('wid'))
188 del node.attrib['wid']
189
def __get_wf_nodes(self):
    """
    Generator over the C{wf} elements found directly under the wrapped node
    @rtype: xml Element
    @return: one C{wf} element at a time
    """
    wf_children = self.node.findall('wf')
    for child in wf_children:
        yield child
193
def __iter__(self):
    """
    Iterates over the layer, wrapping every C{wf} element in a token object
    @rtype: L{Cwf}
    @return: single token objects
    """
    for element in self.__get_wf_nodes():
        token = Cwf(type=self.type, node=element)
        yield token
202
def get_wf(self, token_id):
    """
    Returns the token object for the given token identifier
    @type token_id: string
    @param token_id: the token identifier
    @rtype: L{Cwf}
    @return: the token object, or None if no token has that identifier
    """
    wf_node = self.idx.get(token_id)
    if wf_node is None:
        # Not in the index: fall back to scanning the wf elements of the layer.
        # The identifier attribute is resolved once; anything that is not KAF uses
        # the NAF attribute, which avoids an unbound local for unexpected types
        label_id = 'wid' if self.type == 'KAF' else 'id'
        for candidate in self.__get_wf_nodes():
            if candidate.get(label_id) == token_id:
                # Remember the hit so the next lookup does not scan again
                self.idx[token_id] = candidate
                wf_node = candidate
                break

    # Compare against None explicitly: an element without children is falsy
    if wf_node is None:
        return None
    return Cwf(node=wf_node, type=self.type)
221
def add_wf(self, wf_obj):
    """
    Adds a token object to the text layer and records it in the identifier index
    @type wf_obj: L{Cwf}
    @param wf_obj: token object
    """
    wf_node = wf_obj.get_node()
    self.node.append(wf_node)

    # Keep self.idx in step with the layer so index-based lookups see the new token.
    # A token that has no identifier yet is simply left out of the index
    label_id = 'wid' if self.type == 'KAF' else 'id'
    token_id = wf_node.get(label_id)
    if token_id is not None:
        self.idx[token_id] = wf_node
229
230
def remove_tokens_of_sentence(self, sentence_id):
    """
    Removes the tokens of the given sentence, both from the layer and from the identifier index
    @type sentence_id: string
    @param sentence_id: the sentence identifier
    """
    # Collect first, so the layer is not modified while it is being iterated
    nodes_to_remove = [wf.get_node() for wf in self if wf.get_sent() == sentence_id]

    label_id = 'wid' if self.type == 'KAF' else 'id'
    for wf_node in nodes_to_remove:
        self.node.remove(wf_node)
        # Drop the index entry too; otherwise lookups by identifier would keep
        # returning a token that is no longer part of the layer
        self.idx.pop(wf_node.get(label_id), None)
244