Module text_data
[hide private]
[frames | no frames]

Source Code for Module text_data

  1  """ 
  2  This module parses the text layer of a KAF or NAF file 
  3  """ 
  4  from lxml import etree 
  5   
  6   
class Cwf:
    """
    Wrapper around a single token of the text layer (a NAF/KAF C{wf} element).
    All values are read from and written to the attributes and text of the
    wrapped xml node.
    """

    def __init__(self, node=None, type='NAF'):
        """
        Builds the token wrapper
        @type node: xml Element or None (to create an empty one)
        @param node: the element to wrap. If it is None a new C{wf} element is created
        @type type: string
        @param type: the type of the object (KAF or NAF)
        """
        self.type = type
        self.node = etree.Element('wf') if node is None else node

    def _id_attribute_name(self):
        """
        Returns the name of the attribute that stores the token identifier
        @rtype: string or None
        @return: C{id} for NAF, C{wid} for KAF, None for any other type
        """
        if self.type == 'NAF':
            return 'id'
        if self.type == 'KAF':
            return 'wid'
        return None

    def get_node(self):
        """
        Gives access to the wrapped element
        @rtype: xml Element
        @return: the node of the element
        """
        return self.node

    def set_id(self, this_id):
        """
        Stores the token identifier (attribute C{id} in NAF, C{wid} in KAF).
        Nothing is stored when the type is neither NAF nor KAF.
        @type this_id: string
        @param this_id: the identifier
        """
        attribute = self._id_attribute_name()
        if attribute is None:
            return None
        return self.node.set(attribute, this_id)

    def get_id(self):
        """
        Reads the token identifier (attribute C{id} in NAF, C{wid} in KAF)
        @rtype: string
        @return: the token identifier, None if the attribute is missing or
        the type is neither NAF nor KAF
        """
        attribute = self._id_attribute_name()
        if attribute is None:
            return None
        return self.node.get(attribute)

    def set_text(self, this_text):
        """
        Stores the text of the token as a CDATA section
        @type this_text: string
        @param this_text: the text
        """
        cdata = etree.CDATA(this_text)
        self.node.text = cdata

    def get_text(self):
        """
        Reads the text of the token
        @rtype: string
        @return: text of the token
        """
        return self.node.text

    def set_sent(self, this_sent):
        """
        Stores the sentence identifier in the C{sent} attribute
        @type this_sent: string
        @param this_sent: the sentence identifier
        """
        self.node.set('sent', this_sent)

    def get_sent(self):
        """
        Reads the C{sent} attribute
        @rtype: string
        @return: sentence of the token
        """
        return self.node.get('sent')

    def get_offset(self):
        """
        Reads the C{offset} attribute
        @rtype: string
        @return: the offset
        """
        return self.node.get('offset')

    def set_offset(self, offset):
        """
        Stores the offset in the C{offset} attribute
        @type offset: string
        @param offset: the offset
        """
        self.node.set('offset', offset)

    def get_length(self):
        """
        Reads the C{length} attribute
        @rtype: string
        @return: the length
        """
        return self.node.get('length')

    def set_length(self, length):
        """
        Stores the length in the C{length} attribute
        @type length: string
        @param length: the length
        """
        self.node.set('length', length)

    def get_para(self):
        """
        Reads the C{para} attribute
        @rtype: string
        @return: the paragraph identifier
        """
        return self.node.get('para')

    def set_para(self, p):
        """
        Stores the paragraph identifier in the C{para} attribute
        @type p: string
        @param p: the paragraph identifier
        """
        self.node.set('para', p)
137 138
class Ctext:
    """
    This class encapsulates the text layer: a C{text} element whose C{wf}
    children are the tokens. C{self.idx} maps token identifiers to their
    C{wf} nodes so that L{get_wf} does not have to scan the whole layer.
    """

    def __init__(self, node=None, type='NAF'):
        """
        Constructor of the object
        @type node: xml Element or None (to create an empty one)
        @param node: this is the node of the element. If it is None it will create a new object
        @type type: string
        @param type: the type of the object (KAF or NAF)
        @raise ValueError: if node already contains tokens and type is neither NAF nor KAF
        """
        self.idx = {}
        self.type = type
        if node is None:
            self.node = etree.Element('text')
        else:
            self.node = node
            wf_nodes = list(self.__get_wf_nodes())
            if wf_nodes:
                # The attribute name is the same for every token, resolve it once
                label_id = self.__get_id_label()
                for wf_node in wf_nodes:
                    self.idx[wf_node.get(label_id)] = wf_node

    def __get_id_label(self, required=True):
        """
        Returns the name of the attribute that stores the token identifier
        @type required: boolean
        @param required: whether an unknown type must raise an error
        @rtype: string or None
        @return: C{id} for NAF, C{wid} for KAF, None for an unknown type when required is False
        @raise ValueError: if the type is neither NAF nor KAF and required is True
        """
        if self.type == 'NAF':
            return 'id'
        if self.type == 'KAF':
            return 'wid'
        if required:
            raise ValueError('Unknown type {0!r}: expected NAF or KAF'.format(self.type))
        return None

    def __rename_id_attribute(self, old_label, new_label):
        """
        Moves the token identifier of every token from one attribute to another.
        Tokens that do not have the old attribute are left untouched.
        @type old_label: string
        @param old_label: attribute that currently stores the identifier
        @type new_label: string
        @param new_label: attribute that must store the identifier
        """
        for node in self.__get_wf_nodes():
            value = node.attrib.pop(old_label, None)
            if value is not None:
                node.set(new_label, value)

    def get_node(self):
        """
        Returns the node of the element
        @rtype: xml Element
        @return: the node of the element
        """
        return self.node

    def to_kaf(self):
        """
        Converts the object to KAF (if it is NAF)
        """
        if self.type == 'NAF':
            self.type = 'KAF'
            # Only the attribute name changes, the identifier values used as
            # keys of self.idx stay the same
            self.__rename_id_attribute('id', 'wid')

    def to_naf(self):
        """
        Converts the object to NAF (if it is KAF)
        """
        if self.type == 'KAF':
            self.type = 'NAF'
            self.__rename_id_attribute('wid', 'id')

    def __get_wf_nodes(self):
        """
        Iterator over the C{wf} children of the text node
        """
        for wf_node in self.node.findall('wf'):
            yield wf_node

    def __iter__(self):
        """
        Iterator that returns all the tokens
        @rtype: L{Cwf}
        @return: single token objects
        """
        for wf_node in self.__get_wf_nodes():
            yield Cwf(node=wf_node, type=self.type)

    def get_wf(self, token_id):
        """
        Returns the token object for the given token identifier
        @type token_id: string
        @param token_id: the token identifier
        @rtype: L{Cwf}
        @return: the token object, or None if there is no such token
        @raise ValueError: if the layer has to be scanned and the type is neither NAF nor KAF
        """
        wf_node = self.idx.get(token_id)
        if wf_node is not None:
            return Cwf(node=wf_node, type=self.type)

        # Not indexed: the token may have been attached to the xml node
        # without going through add_wf, so scan the layer
        wf_nodes = list(self.__get_wf_nodes())
        if not wf_nodes:
            return None
        label_id = self.__get_id_label()
        for wf_node in wf_nodes:
            if wf_node.get(label_id) == token_id:
                return Cwf(node=wf_node, type=self.type)
        return None

    def add_wf(self, wf_obj):
        """
        Adds a token object to the text layer and registers it in the index
        @type wf_obj: L{Cwf}
        @param wf_obj: token object
        """
        wf_node = wf_obj.get_node()
        self.node.append(wf_node)
        label_id = self.__get_id_label(required=False)
        if label_id is not None:
            token_id = wf_node.get(label_id)
            if token_id is not None:
                # setdefault: an identifier that is already indexed keeps
                # resolving to the token it resolved to before
                self.idx.setdefault(token_id, wf_node)

    def remove_tokens_of_sentence(self, sentence_id):
        """
        Removes the tokens of the given sentence, both from the xml node and
        from the index, so that L{get_wf} no longer returns them
        @type sentence_id: string
        @param sentence_id: the sentence identifier
        """
        # Collect first: removing children while iterating over them is unsafe
        nodes_to_remove = [wf_node for wf_node in self.__get_wf_nodes()
                           if wf_node.get('sent') == sentence_id]
        label_id = self.__get_id_label(required=False)
        for wf_node in nodes_to_remove:
            self.node.remove(wf_node)
            if label_id is not None:
                token_id = wf_node.get(label_id)
                # Identity check: do not drop another token that shares the identifier
                if self.idx.get(token_id) is wf_node:
                    del self.idx[token_id]
244