text_data

"""
This module parses the text layer of a KAF or NAF file
"""
from lxml import etree

class Cwf:
    """
    This class represents a single token (NAF/KAF wf object)
    """

    def __init__(self, node=None, type='NAF'):
        """
        Constructor of the object
        @type node: xml Element or None (to create an empty one)
        @param node: this is the node of the element. If it is None it will create a new object
        @type type: string
        @param type: the type of the object (KAF or NAF)
        """
        self.type = type
        if node is None:
            self.node = etree.Element('wf')
        else:
            self.node = node

    def _id_attribute(self):
        """
        Returns the name of the XML attribute that stores the token identifier
        @rtype: string or None
        @return: 'id' for NAF, 'wid' for KAF, None for any other type
        """
        if self.type == 'NAF':
            return 'id'
        if self.type == 'KAF':
            return 'wid'
        return None

    def get_node(self):
        """
        Returns the node of the element
        @rtype: xml Element
        @return: the node of the element
        """
        return self.node

    def set_id(self, this_id):
        """
        Set the identifier for the token
        @type this_id: string
        @param this_id: the identifier
        """
        attribute = self._id_attribute()
        # For a type other than NAF/KAF nothing is written, as before.
        if attribute is not None:
            self.node.set(attribute, this_id)

    def get_id(self):
        """
        Returns the token identifier
        @rtype: string
        @return: the token identifier (None if it is not set or the type is
        neither NAF nor KAF)
        """
        attribute = self._id_attribute()
        if attribute is None:
            return None
        return self.node.get(attribute)

    def set_text(self, this_text):
        """
        Set the text for the token
        @type this_text: string
        @param this_text: the text
        """
        # The text is stored as a CDATA section of the wf element.
        self.node.text = etree.CDATA(this_text)

    def get_text(self):
        """
        Returns the text of the token
        @rtype: string
        @return: text of the token
        """
        return self.node.text

    def set_sent(self, this_sent):
        """
        Set the sentence for the token
        @type this_sent: string
        @param this_sent: the sentence identifier
        """
        self.node.set('sent', this_sent)

    def get_sent(self):
        """
        Returns the sentence of the token
        @rtype: string
        @return: sentence of the token
        """
        return self.node.get('sent')

    def get_offset(self):
        """
        Returns the offset of the token
        @rtype: string
        @return: the offset
        """
        return self.node.get('offset')

    def set_offset(self, offset):
        """
        Set the offset for the token
        @type offset: string
        @param offset: the offset
        """
        self.node.set('offset', offset)

    def get_length(self):
        """
        Returns the length of the token
        @rtype: string
        @return: the length
        """
        return self.node.get('length')

    def set_length(self, length):
        """
        Set the length for the token
        @type length: string
        @param length: the length
        """
        self.node.set('length', length)

    def get_para(self):
        """
        Returns the paragraph for the token
        @rtype: string
        @return: the paragraph identifier
        """
        return self.node.get('para')

    def set_para(self, p):
        """
        Set the paragraph for the token
        @type p: string
        @param p: the paragraph identifier
        """
        self.node.set('para', p)

137 138

class Ctext:
    """
    This class encapsulates the text layer
    """

    def __init__(self, node=None, type='NAF'):
        """
        Constructor of the object
        @type node: xml Element or None (to create an empty one)
        @param node: this is the node of the element. If it is None it will create a new object
        @type type: string
        @param type: the type of the object (KAF or NAF)
        """
        # Maps token identifier -> wf xml Element, for fast lookup in get_wf
        self.idx = {}
        self.type = type
        if node is None:
            self.node = etree.Element('text')
        else:
            self.node = node
            for wf_node in self.__get_wf_nodes():
                self.__index_node(wf_node)

    def __get_id_label(self):
        """
        Returns the name of the wf attribute that stores the token identifier
        @rtype: string or None
        @return: 'id' for NAF, 'wid' for KAF, None for any other type
        """
        if self.type == 'NAF':
            return 'id'
        if self.type == 'KAF':
            return 'wid'
        return None

    def __index_node(self, wf_node):
        """
        Registers a wf node in the identifier index (if it has an identifier)
        @type wf_node: xml Element
        @param wf_node: the wf node to register
        """
        label_id = self.__get_id_label()
        if label_id is None:
            return
        token_id = wf_node.get(label_id)
        if token_id is not None:
            self.idx[token_id] = wf_node

    def __rename_id_attribute(self, old_label, new_label):
        """
        Moves the identifier of every wf node from one attribute to another
        @type old_label: string
        @param old_label: attribute that currently holds the identifier
        @type new_label: string
        @param new_label: attribute that will hold the identifier
        """
        for wf_node in self.__get_wf_nodes():
            # Nodes without an identifier are left untouched instead of
            # aborting the conversion half-way.
            token_id = wf_node.attrib.pop(old_label, None)
            if token_id is not None:
                wf_node.set(new_label, token_id)

    def get_node(self):
        """
        Returns the node of the element
        @rtype: xml Element
        @return: the node of the element
        """
        return self.node

    def to_kaf(self):
        """
        Converts the object to KAF (if it is NAF)
        """
        if self.type == 'NAF':
            self.type = 'KAF'
            # Identifier values do not change, so self.idx stays valid.
            self.__rename_id_attribute('id', 'wid')

    def to_naf(self):
        """
        Converts the object to NAF (if it is KAF)
        """
        if self.type == 'KAF':
            self.type = 'NAF'
            # Identifier values do not change, so self.idx stays valid.
            self.__rename_id_attribute('wid', 'id')

    def __get_wf_nodes(self):
        """
        Iterator over the wf nodes that are direct children of the text node
        @rtype: xml Element
        @return: single wf nodes
        """
        for wf_node in self.node.findall('wf'):
            yield wf_node

    def __iter__(self):
        """
        Iterator that returns all the tokens
        @rtype: L{Cwf}
        @return: single token objects
        """
        for wf_node in self.__get_wf_nodes():
            yield Cwf(node=wf_node, type=self.type)

    def get_wf(self, token_id):
        """
        Returns the token object for the given token identifier
        @type token_id: string
        @param token_id: the token identifier
        @rtype: L{Cwf}
        @return: the token object (None if there is no such token)
        """
        wf_node = self.idx.get(token_id)
        if wf_node is None:
            # Not indexed (e.g. the identifier was set after the token was
            # added): fall back to scanning the wf nodes.
            label_id = self.__get_id_label()
            if label_id is not None:
                for candidate in self.__get_wf_nodes():
                    if candidate.get(label_id) == token_id:
                        wf_node = candidate
                        break
        if wf_node is None:
            return None
        return Cwf(node=wf_node, type=self.type)

    def add_wf(self, wf_obj):
        """
        Adds a token object to the text layer
        @type wf_obj: L{Cwf}
        @param wf_obj: token object
        """
        wf_node = wf_obj.get_node()
        self.node.append(wf_node)
        # Keep the index in sync so get_wf does not need the linear scan.
        self.__index_node(wf_node)

    def remove_tokens_of_sentence(self, sentence_id):
        """
        Removes the tokens of the given sentence
        @type sentence_id: string
        @param sentence_id: the sentence identifier
        """
        # Collect the matching nodes before touching the tree or the index.
        nodes_to_remove = [wf_node for wf_node in self.__get_wf_nodes()
                           if wf_node.get('sent') == sentence_id]
        label_id = self.__get_id_label()
        for wf_node in nodes_to_remove:
            self.node.remove(wf_node)
            # Drop the index entry too, otherwise get_wf would keep
            # returning tokens that are no longer part of the text layer.
            if label_id is not None:
                self.idx.pop(wf_node.get(label_id), None)

244

Source Code for Module text_data