Package feature_extractor ::
Module dependency
|
|
1 """
2 This module allows to extract elaborated information from the dependency layer of a KAF/NAF object
3 """
4
5 from operator import itemgetter
6 import sys
7
9 vect = my_dict.items()
10 if len(vect) !=0:
11 vect.sort(key=itemgetter(1),reverse=True)
12 return vect[0]
13 return None
14
16 """
17 This is the main class for the information extraction
18 """
20 """
21 The constructor of this class take one kaf/naf object
22 @type knaf_obj: kaf/naf object
23 @param knaf_obj: the kaf/naf object
24 """
25 self.naf = knaf_obj
26 self.relations_for_term = {}
27 self.reverse_relations_for_term = {}
28 self.prefix_for_reverse = ''
29
30
31 already_linked = {}
32 for dep in knaf_obj.get_dependencies():
33 term_from = dep.get_from()
34 term_to = dep.get_to()
35 rfunc = dep.get_function()
36
37
38
39
40
41
42
43
44
45
46
47 '''
48 if term_from in already_linked and term_to in already_linked[term_from]:
49 #There could be a cycle, skip this
50 print>>sys.stderr,'Skipped from',term_from,'to',term_to,'func',rfunc,' cycle detected'
51 continue
52 else:
53 #Include term_from as linked with term_to for future...
54 if term_to not in already_linked:
55 already_linked[term_to] = set()
56 already_linked[term_to].add(term_from)
57 '''
58
59
60
61
62 if term_from in self.relations_for_term:
63 self.relations_for_term[term_from].append((rfunc,term_to))
64 else:
65 self.relations_for_term[term_from] = [(rfunc,term_to)]
66
67 if term_to in self.reverse_relations_for_term:
68 self.reverse_relations_for_term[term_to].append((self.prefix_for_reverse+rfunc,term_from))
69 else:
70 self.reverse_relations_for_term[term_to] = [(self.prefix_for_reverse+rfunc,term_from)]
71
72
73 self.paths_for_termid={}
74 self.sentence_for_termid={}
75 self.top_relation_for_term = {}
76 self.root_for_sentence = {}
77
78 for term_obj in knaf_obj.get_terms():
79 termid = term_obj.get_id()
80
81
82 span_ids = term_obj.get_span().get_span_ids()
83 token_obj = knaf_obj.get_token(span_ids[0])
84 if token_obj is None:
85 continue
86
87 sentence = token_obj.get_sent()
88
89 self.sentence_for_termid[termid] = sentence
90
91
92
93
94
95
96 inversed = self.__propagate_node(termid,already_propagated=[])
97 paths = self.__reverse_propagate_node(termid,already_propagated=[])
98
99
100 if len(inversed) != 0:
101 for ip in inversed:
102 if len(ip)!=0:
103 self.top_relation_for_term[termid] = ip[-1]
104 root = ip[-1][1]
105 if sentence not in self.root_for_sentence:
106 self.root_for_sentence[sentence] = {}
107
108 if root not in self.root_for_sentence[sentence]:
109 self.root_for_sentence[sentence][root]=0
110 else:
111 self.root_for_sentence[sentence][root]+=1
112 break
113
114 self.paths_for_termid[termid] = paths + inversed
115
116 '''
117 print termid
118 print 'DIRECT RELS'
119 for p in paths:
120 print ' ',p
121
122 print 'INDIRECT RELS'
123 for p in inversed:
124 print ' ',p
125 '''
126
127
128 for sent_id, distr in self.root_for_sentence.items():
129
130 most_freq,c = get_max_distr_dict(distr)
131 self.root_for_sentence[sent_id] = most_freq
132
133
134
135
137 paths = []
138
139 relations = self.relations_for_term.get(node)
140
141 if relations is None:
142 paths = [[]]
143 elif node in already_propagated:
144 paths = [[]]
145
146 else:
147 already_propagated.append(node)
148 for func, target_node in relations:
149 new_paths = self.__propagate_node(target_node, already_propagated)
150 for new_path in new_paths:
151 new_path.insert(0,(func,target_node))
152 paths.append(new_path)
153 return paths
154
156 paths = []
157 relations = self.reverse_relations_for_term.get(node)
158
159 if relations is None:
160 paths = [[]]
161 elif node in already_propagated:
162 paths = [[]]
163 else:
164 already_propagated.append(node)
165 for func, target_node in relations:
166 new_paths = self.__reverse_propagate_node(target_node,already_propagated)
167 for new_path in new_paths:
168 new_path.insert(0,(func,target_node))
169 paths.append(new_path)
170 return paths
171
172
173
175 """
176 Returns the list of dependency labels of the shortest path between two terms
177 @type term1: string
178 @param term1: the term identifier for one term
179 @type term2: string
180 @param term2: the term identifier for the other term
181 @rtype: list
182 @return: list of dependency relations
183 """
184 dep_path = None
185 if term1 == term2: dep_path = []
186 else:
187 paths1 = self.paths_for_termid[term1]
188 paths2 = self.paths_for_termid[term2]
189
190
191 hits = []
192 for num1, p1 in enumerate(paths1):
193 ids1 = [ my_id for my_func, my_id in p1]
194 if term2 in ids1:
195 idx1 = ids1.index(term2)
196 hits.append((term2,idx1+0,idx1,0,num1,None))
197
198 for num2,p2 in enumerate(paths2):
199 ids2 = [ my_id for my_func, my_id in p2]
200 if term1 in ids2:
201 idx2=ids2.index(term1)
202 hits.append((term1,0+idx2,0,idx2,None,num2))
203
204
205 for num1, p1 in enumerate(paths1):
206
207 ids1 = [ my_id for my_func, my_id in p1]
208
209 for num2, p2 in enumerate(paths2):
210
211 ids2 = [ my_id for my_func, my_id in p2]
212
213 common_ids = set(ids1) & set(ids2)
214
215 for common_id in common_ids:
216 idx1 = ids1.index(common_id)
217 idx2 = ids2.index(common_id)
218 hits.append((common_id,idx1+idx2,idx1,idx2,num1,num2))
219
220
221 if len(hits) != 0:
222 dep_path = []
223 hits.sort(key=itemgetter(1))
224 best_hit = hits[0]
225 common_id, _, idx1, idx2, numpath1, numpath2 = best_hit
226
227 if numpath2 is None:
228 path1 = paths1[numpath1]
229 my_rels1 = path1[:idx1+1]
230
231
232 for func,node in my_rels1:
233 dep_path.append(func)
234
235
236
237
238
239
240
241
242 elif numpath1 is None:
243 path2 = paths2[numpath2]
244 my_rels2 = path2[:idx2+1]
245
246
247 for func,node in my_rels2:
248 dep_path.append(func)
249
250
251
252
253
254
255
256
257 else:
258 path1 = paths1[numpath1]
259 my_rels1 = path1[:idx1+1]
260
261 path2 = paths2[numpath2]
262 my_rels2 = path2[:idx2+1]
263
264
265
266 for func,node in my_rels1:
267 dep_path.append(func)
268
269
270
271 for func,node in my_rels2[-1::-1]:
272 dep_path.append(func)
273
274
275
276
277
278
279
280
281
282
283
284 return dep_path
285
286
288 """
289 Returns the list of dependency labels of the shortest path between two span of terms
290 @type span1: list
291 @param span1: list of term identifiers
292 @type span2: list
293 @param span2: list of term identifiers
294 @rtype: list
295 @return: list of dependency relations
296 """
297 shortest_path = None
298
299 for term1 in span1:
300 for term2 in span2:
301 this_path = self.get_shortest_path(term1, term2)
302
303 if shortest_path is None or (this_path is not None and len(this_path)<len(shortest_path)):
304 shortest_path = this_path
305 return shortest_path
306
307
309 """
310 Returns the dependency path from the term to the root
311 @type termid: string
312 @param termid: the term identifier
313 @rtype: list
314 @return: list of dependency relations
315 """
316
317 root = None
318 sentence = self.sentence_for_termid.get(termid)
319
320 if sentence is None:
321 top_node = self.top_relation_for_term.get(termid)
322 if top_node is not None:
323 root = top_node[1]
324 else:
325 return None
326 else:
327 if sentence in self.root_for_sentence:
328 root = self.root_for_sentence[sentence]
329 else:
330
331 return None
332
333 path = self.get_shortest_path(termid, root)
334 return path
335
336
337
339 """
340 Returns the dependency path from a span of terms to the root
341 @type span: list
342 @param span: list of term identifiers
343 @rtype: list
344 @return: list of dependency relations
345 """
346 shortest_path = None
347 for termid in span:
348 this_path = self.get_path_to_root(termid)
349
350 if shortest_path is None or (this_path is not None and len(this_path) < len(shortest_path)):
351 shortest_path = this_path
352 return shortest_path
353
354
356 """
357 Returns the complete list of dependents and embedded dependents of a certain term.
358 """
359 counter += 1
360 deps = self.relations_for_term
361 if term_id in deps and len(deps.get(term_id)) > 0:
362 for dep in deps.get(term_id):
363 if not dep[1] in relations:
364 relations.append(dep[1])
365 if dep[1] in deps:
366 deprelations = self.get_full_dependents(dep[1], relations, counter)
367 for deprel in deprelations:
368 if not deprel in relations:
369 relations.append(deprel)
370
371 return relations
372