"""
This module provides methods for extracting elaborated information from the constituency layer in a KAF/NAF file.
"""
6
from operator import itemgetter
from collections import defaultdict

import copy
11
"""
This is the main class that allows the extraction of information.
"""
def __init__(self, knaf_obj):
    """
    Constructor from a KafNafParser object.
    @type knaf_obj: KafNafParser
    @param knaf_obj: the KAF/NAF object
    """
    self.naf = knaf_obj

    # Indexes filled in by extract_info_from_naf:
    #   terminals:         terminal id -> list of term ids in its span
    #   terminal_for_term: term id -> terminal id covering it
    #   label_for_nonter:  non-terminal id -> phrase label (NP, VP, ...)
    #   reachable_from:    node id -> list of parent node ids (edge targets)
    self.terminals = {}
    self.terminal_for_term = {}
    self.label_for_nonter = {}
    self.reachable_from = {}

    self.extract_info_from_naf(knaf_obj)

    # For every terminal, precompute all upward paths to the root.
    # The terminal itself is excluded from each path (include_this_node=False),
    # so path[0] is the pre-terminal node.
    self.paths_for_terminal = {}
    for terminal_id in self.terminals:
        self.paths_for_terminal[terminal_id] = self.__expand_node(terminal_id, False)

    # For every non-terminal, the set of term ids it subsumes.
    self.terms_subsumed_by_nonter = {}
    for terminal_id, span_terms in self.terminals.items():
        for path in self.paths_for_terminal[terminal_id]:
            for nonter in path:
                # setdefault replaces the original membership-check-then-create pattern
                self.terms_subsumed_by_nonter.setdefault(nonter, set()).update(span_terms)
48
49
50
51
52
53
54
55
56
57
58
59
61 all_nonter = set()
62 for terminal in self.terminals.keys():
63 for path in self.paths_for_terminal[terminal]:
64 first_non_ter_phrase = path[1]
65 subsumed = self.terms_subsumed_by_nonter[first_non_ter_phrase]
66 this_type = self.label_for_nonter[first_non_ter_phrase]
67 print terminal, this_type, subsumed
68 return None
69
70 ter_for_nonter = {}
71 for nonter in all_nonter:
72 for terminal in self.terminals.keys():
73 for path in self.paths_for_terminal[terminal]:
74 if nonter in path:
75 if nonter in ter_for_nonter:
76 ter_for_nonter[nonter].append(terminal)
77 else:
78 ter_for_nonter[nonter] = [terminal]
79
80 visited = set()
81 for nonter, list_term in ter_for_nonter.items():
82 for ter in list_term:
83 print ter
84 visited.add(ter)
85
86
87
def get_deepest_phrase_for_termid(self, termid):
    """
    Returns the deepest phrase type for the term identifier and the list of
    terms subsumed by the same element.
    @type termid: string
    @param termid: term identifier
    @rtype: (string, list)
    @return: the phrase label and the sorted list of subsumed term ids
    """
    label, subsumed = None, []
    terminal_id = self.terminal_for_term.get(termid)
    if terminal_id is not None:
        # The deepest phrase is the second node of the first upward path
        # (index 0 is the pre-terminal).
        deepest_phrase = self.paths_for_terminal[terminal_id][0][1]
        label = self.label_for_nonter.get(deepest_phrase)
        subsumed = self.terms_subsumed_by_nonter.get(deepest_phrase, [])
    return label, sorted(subsumed)
105
106
def get_least_common_subsumer(self, from_tid, to_tid):
    """
    Returns the deepest common subsumer among two terms.
    @type from_tid: string
    @param from_tid: one term id
    @type to_tid: string
    @param to_tid: another term id
    @rtype: string
    @return: the node identifier of the deepest common subsumer, or None
    """
    termid_from = self.terminal_for_term.get(from_tid)
    termid_to = self.terminal_for_term.get(to_tid)
    # Fix: unknown term ids previously caused a KeyError on the None key;
    # an unknown term simply has no subsumer.
    if termid_from is None or termid_to is None:
        return None

    path_from = self.paths_for_terminal[termid_from][0]
    path_to = self.paths_for_terminal[termid_to][0]
    common_nodes = set(path_from) & set(path_to)
    if not common_nodes:
        return None
    # The deepest common node is the one with the smallest combined
    # depth-from-terminal over both paths.
    indexes = [(node, path_from.index(node) + path_to.index(node))
               for node in common_nodes]
    indexes.sort(key=itemgetter(1))
    return indexes[0][0]
134
135
def get_path_from_to(self, from_tid, to_tid):
    """
    This function returns the path (in terms of phrase types) from one term to another.
    @type from_tid: string
    @param from_tid: one term id
    @type to_tid: string
    @param to_tid: another term id
    @rtype: list
    @return: the path as a list of phrase labels, or None if there is no common subsumer
    """
    lowest_subsumer = self.get_least_common_subsumer(from_tid, to_tid)

    path_up = self.paths_for_terminal[self.terminal_for_term.get(from_tid)][0]
    path_down = self.paths_for_terminal[self.terminal_for_term.get(to_tid)][0]

    if lowest_subsumer is None:
        return None

    # Climb from the source up to (and including) the common subsumer...
    node_sequence = []
    for node in path_up:
        node_sequence.append(node)
        if node == lowest_subsumer:
            break

    # ...then descend from just below the subsumer down to the target.
    passed_subsumer = False
    for node in reversed(path_down):
        if passed_subsumer:
            node_sequence.append(node)
        if node == lowest_subsumer:
            passed_subsumer = True

    return [self.label_for_nonter[node] for node in node_sequence]
173
174
def get_path_for_termid(self, termid):
    """
    This function returns the path (in terms of phrase types) from one term to the root.
    @type termid: string
    @param termid: one term id
    @rtype: list
    @return: the path, as a list of phrase labels
    """
    terminal_id = self.terminal_for_term.get(termid)
    # Only the first (main) upward path is used.
    main_path = self.paths_for_terminal[terminal_id][0]
    return [self.label_for_nonter[node] for node in main_path]
187
189
190
def extract_info_from_naf(self, knaf_obj):
    """
    Populates the internal indexes (terminals, terminal_for_term,
    label_for_nonter and reachable_from) from the constituency trees
    of the given KAF/NAF object.
    @type knaf_obj: KafNafParser
    @param knaf_obj: the KAF/NAF object
    """
    self.paths_for_terminal = {}
    for tree in knaf_obj.get_trees():
        # Terminal nodes: remember which term ids each terminal spans.
        for terminal in tree.get_terminals():
            ter_id = terminal.get_id()
            span_ids = terminal.get_span().get_span_ids()
            self.terminals[ter_id] = span_ids
            for term_id in span_ids:
                self.terminal_for_term[term_id] = ter_id

        # Non-terminal nodes: remember their phrase label.
        for non_terminal in tree.get_non_terminals():
            self.label_for_nonter[non_terminal.get_id()] = non_terminal.get_label()

        # Edges go child -> parent; collect every parent for each node.
        for edge in tree.get_edges():
            self.reachable_from.setdefault(edge.get_from(), []).append(edge.get_to())
214
215
216
def get_deepest_subsumer(self, list_terms):
    '''
    Returns the identifier and label of the deepest node that subsumes all the
    terms in the list of term ids provided.
    @type list_terms: list
    @param list_terms: list of term identifiers
    @rtype: (string, string)
    @return: node identifier and its phrase label, or (None, None)
    '''
    # How many of the given terms each node dominates.
    count_per_no_terminal = defaultdict(int)
    # Sum of depths of each node over the first path of every term;
    # a smaller total means a deeper (more specific) node.
    total_deep_per_no_terminal = defaultdict(int)

    for term_id in list_terms:
        terminal_id = self.terminal_for_term.get(term_id)
        path = self.paths_for_terminal[terminal_id][0]
        # Fix: removed a leftover Python-2 debug print of (term_id, path).
        for depth, noter in enumerate(path):
            count_per_no_terminal[noter] += 1
            total_deep_per_no_terminal[noter] += depth

    deepest_and_common = None
    deepest = 10000
    for noterid, this_total in total_deep_per_no_terminal.items():
        # Only nodes that dominate every requested term qualify.
        if count_per_no_terminal.get(noterid, -1) == len(list_terms):
            if this_total < deepest:
                deepest = this_total
                deepest_and_common = noterid

    label = None
    if deepest_and_common is not None:
        label = self.label_for_nonter[deepest_and_common]
    return deepest_and_common, label
248
249
250
251
252
253
254
def __expand_node(self, node, include_this_node=True):
    """
    Recursively builds every upward path from the given node to a root.
    @type node: string
    @param node: identifier of the node to expand
    @type include_this_node: bool
    @param include_this_node: whether the starting node is prepended to each path
    @rtype: list
    @return: list of paths, each a list of node identifiers
    """
    parents = self.reachable_from.get(node, [])
    # A node with no outgoing edge is a root: the only path is itself.
    if not parents:
        return [[node]]
    paths = []
    for parent in parents:
        # Recursive calls include their own node; only the top-level
        # caller may exclude its starting node.
        for partial in self.__expand_node(parent):
            if include_this_node:
                partial.insert(0, node)
            paths.append(partial)
    return paths
268
def get_chunks(self, chunk_type):
    """
    Returns the chunks for a certain type.
    @type chunk_type: string
    @param chunk_type: type of the chunk
    @rtype: list
    @return: generator of sorted term-id lists, one per matching chunk
    """
    for nonter_id, label in self.label_for_nonter.items():
        if label != chunk_type:
            continue
        subsumed = self.terms_subsumed_by_nonter.get(nonter_id)
        if subsumed is not None:
            yield sorted(subsumed)
282
283
284
285
def get_all_chunks(self):
    """
    Generator over every chunk in the trees, yielding (phrase_label, set_of_term_ids)
    pairs.  Single-term chunks are kept only when the non-terminal is not the
    immediate parent of the term's terminal; a chunk whose span strictly contains
    another chunk's span is dropped (the deepest chunks win).
    """
    # NOTE(review): reconstructed from a mangled listing; the original name was
    # lost.  Two defects are fixed here: (1) a leftover debug "print ...; continue"
    # skipped the whole chunk-building body, so the generator never yielded
    # anything; (2) the counter n was never incremented, so every chunk got the
    # id 'chunk_0' and the id1 != id2 containment filter below could never fire.
    all_chunks = []
    n = 0
    for nonter, this_type in self.label_for_nonter.items():
        subsumed = self.terms_subsumed_by_nonter.get(nonter)
        if subsumed is None:
            continue
        if len(subsumed) > 1:
            all_chunks.append(('chunk_%d' % n, this_type, subsumed))
            n += 1
        else:
            # Single-term span: skip the node sitting directly above the terminal.
            tid = self.terminal_for_term.get(list(subsumed)[0])
            reachable = self.reachable_from[tid]
            if reachable[0] != nonter:
                all_chunks.append(('chunk_%d' % n, this_type, subsumed))
                n += 1

    # Iteratively drop any chunk whose span strictly contains another
    # chunk's span, until a fixed point is reached.
    while True:
        ids_to_be_removed = set()
        for id1, type1, span1 in all_chunks:
            for id2, type2, span2 in all_chunks:
                if id1 != id2 and len(span2) > len(span1):
                    if len(span1 & span2) == len(span1):
                        ids_to_be_removed.add(id2)

        all_chunks = [chunk for chunk in all_chunks
                      if chunk[0] not in ids_to_be_removed]
        if not ids_to_be_removed:
            break

    for this_id, this_type, this_span in all_chunks:
        yield this_type, this_span
328
329
def get_all_chunks_for_term(self, termid):
    """
    Returns all the chunks in which the term is contained.
    @type termid: string
    @param termid: the term identifier
    @rtype: list
    @return: generator of (phrase_label, sorted term-id list) pairs
    """
    terminal_id = self.terminal_for_term.get(termid)
    for path in self.paths_for_terminal[terminal_id]:
        for node in path:
            node_label = self.label_for_nonter[node]
            subsumed = self.terms_subsumed_by_nonter.get(node)
            if subsumed is not None:
                yield node_label, sorted(subsumed)
346