"""
This module provides methods for extracting elaborated information from the constituency layer in a KAF/NAF file.
"""
6
from operator import itemgetter
from collections import defaultdict

import copy
11
"""
This is the main class that allows the extraction of information.
"""
def __init__(self, knaf_obj):
    """
    Constructor from a KafNafParser object.
    @type knaf_obj: KafNafParser
    @param knaf_obj: the KAF/NAF object
    """
    self.naf = knaf_obj

    # Indexes filled in by extract_info_from_naf:
    #   terminals:         terminal id -> list of term ids in its span
    #   terminal_for_term: term id -> terminal id covering it
    #   label_for_nonter:  non-terminal id -> phrase label (NP, VP, ...)
    #   reachable_from:    node id -> list of parent node ids (edge targets)
    self.terminals = {}
    self.terminal_for_term = {}
    self.label_for_nonter = {}
    self.reachable_from = {}

    self.extract_info_from_naf(knaf_obj)

    # For every terminal, precompute all upward paths to the root.
    # The terminal itself is excluded from each path (include_this_node=False),
    # so path[0] is the pre-terminal node.
    self.paths_for_terminal = {}
    for terminal_id in self.terminals:
        self.paths_for_terminal[terminal_id] = self.__expand_node(terminal_id, False)

    # For every non-terminal, the set of term ids it subsumes.
    self.terms_subsumed_by_nonter = {}
    for terminal_id, span_terms in self.terminals.items():
        for path in self.paths_for_terminal[terminal_id]:
            for nonter in path:
                # setdefault replaces the original membership-check-then-create pattern
                self.terms_subsumed_by_nonter.setdefault(nonter, set()).update(span_terms)
48
49
50
51
52
53
54
55
56
57
58
59
61 all_nonter = set()
62 for terminal in self.terminals.keys():
63 for path in self.paths_for_terminal[terminal]:
64 first_non_ter_phrase = path[1]
65 subsumed = self.terms_subsumed_by_nonter[first_non_ter_phrase]
66 this_type = self.label_for_nonter[first_non_ter_phrase]
67 print terminal, this_type, subsumed
68 return None
69
70 ter_for_nonter = {}
71 for nonter in all_nonter:
72 for terminal in self.terminals.keys():
73 for path in self.paths_for_terminal[terminal]:
74 if nonter in path:
75 if nonter in ter_for_nonter:
76 ter_for_nonter[nonter].append(terminal)
77 else:
78 ter_for_nonter[nonter] = [terminal]
79
80 visited = set()
81 for nonter, list_term in ter_for_nonter.items():
82 for ter in list_term:
83 print ter
84 visited.add(ter)
85
86
87
def get_deepest_phrase_for_termid(self, termid):
    """
    Returns the deepest phrase type for the term identifier and the list of
    terms subsumed by the same element.
    @type termid: string
    @param termid: term identifier
    @rtype: (string, list)
    @return: the phrase label and the sorted list of subsumed term ids
    """
    label, subsumed = None, []
    terminal_id = self.terminal_for_term.get(termid)
    if terminal_id is not None:
        # The deepest phrase is the second node of the first upward path
        # (index 0 is the pre-terminal).
        deepest_phrase = self.paths_for_terminal[terminal_id][0][1]
        label = self.label_for_nonter.get(deepest_phrase)
        subsumed = self.terms_subsumed_by_nonter.get(deepest_phrase, [])
    return label, sorted(subsumed)
105
106
def get_least_common_subsumer(self, from_tid, to_tid):
    """
    Returns the deepest common subsumer among two terms.
    @type from_tid: string
    @param from_tid: one term id
    @type to_tid: string
    @param to_tid: another term id
    @rtype: string
    @return: the node identifier of the deepest common subsumer, or None
    """
    termid_from = self.terminal_for_term.get(from_tid)
    termid_to = self.terminal_for_term.get(to_tid)
    # Fix: unknown term ids previously caused a KeyError on the None key;
    # an unknown term simply has no subsumer.
    if termid_from is None or termid_to is None:
        return None

    path_from = self.paths_for_terminal[termid_from][0]
    path_to = self.paths_for_terminal[termid_to][0]
    common_nodes = set(path_from) & set(path_to)
    if not common_nodes:
        return None
    # The deepest common node is the one with the smallest combined
    # depth-from-terminal over both paths.
    indexes = [(node, path_from.index(node) + path_to.index(node))
               for node in common_nodes]
    indexes.sort(key=itemgetter(1))
    return indexes[0][0]
134
135
def get_path_from_to(self, from_tid, to_tid):
    """
    This function returns the path (in terms of phrase types) from one term to another.
    @type from_tid: string
    @param from_tid: one term id
    @type to_tid: string
    @param to_tid: another term id
    @rtype: list
    @return: the path as a list of phrase labels, or None if there is no common subsumer
    """
    lowest_subsumer = self.get_least_common_subsumer(from_tid, to_tid)

    path_up = self.paths_for_terminal[self.terminal_for_term.get(from_tid)][0]
    path_down = self.paths_for_terminal[self.terminal_for_term.get(to_tid)][0]

    if lowest_subsumer is None:
        return None

    # Climb from the source up to (and including) the common subsumer...
    node_sequence = []
    for node in path_up:
        node_sequence.append(node)
        if node == lowest_subsumer:
            break

    # ...then descend from just below the subsumer down to the target.
    passed_subsumer = False
    for node in reversed(path_down):
        if passed_subsumer:
            node_sequence.append(node)
        if node == lowest_subsumer:
            passed_subsumer = True

    return [self.label_for_nonter[node] for node in node_sequence]
173
174
def get_path_for_termid(self, termid):
    """
    This function returns the path (in terms of phrase types) from one term to the root.
    @type termid: string
    @param termid: one term id
    @rtype: list
    @return: the path, as a list of phrase labels
    """
    terminal_id = self.terminal_for_term.get(termid)
    # Only the first (main) upward path is used.
    main_path = self.paths_for_terminal[terminal_id][0]
    return [self.label_for_nonter[node] for node in main_path]
187
189
190
def extract_info_from_naf(self, knaf_obj):
    """
    Populates the internal indexes (terminals, terminal_for_term,
    label_for_nonter and reachable_from) from the constituency trees
    of the given KAF/NAF object.
    @type knaf_obj: KafNafParser
    @param knaf_obj: the KAF/NAF object
    """
    self.paths_for_terminal = {}
    for tree in knaf_obj.get_trees():
        # Terminal nodes: remember which term ids each terminal spans.
        for terminal in tree.get_terminals():
            ter_id = terminal.get_id()
            span_ids = terminal.get_span().get_span_ids()
            self.terminals[ter_id] = span_ids
            for term_id in span_ids:
                self.terminal_for_term[term_id] = ter_id

        # Non-terminal nodes: remember their phrase label.
        for non_terminal in tree.get_non_terminals():
            self.label_for_nonter[non_terminal.get_id()] = non_terminal.get_label()

        # Edges go child -> parent; collect every parent for each node.
        for edge in tree.get_edges():
            self.reachable_from.setdefault(edge.get_from(), []).append(edge.get_to())
214
215
216
def get_deepest_subsumer(self, list_terms):
    '''
    Returns the identifier and label of the deepest node that subsumes all the
    terms in the list of term ids provided.
    @type list_terms: list
    @param list_terms: list of term identifiers
    @rtype: (string, string)
    @return: node identifier and its phrase label, or (None, None)
    '''
    # How many of the given terms each node dominates.
    count_per_no_terminal = defaultdict(int)
    # Sum of depths of each node over the first path of every term;
    # a smaller total means a deeper (more specific) node.
    total_deep_per_no_terminal = defaultdict(int)

    for term_id in list_terms:
        terminal_id = self.terminal_for_term.get(term_id)
        path = self.paths_for_terminal[terminal_id][0]
        # Fix: removed a leftover Python-2 debug print of (term_id, path).
        for depth, noter in enumerate(path):
            count_per_no_terminal[noter] += 1
            total_deep_per_no_terminal[noter] += depth

    deepest_and_common = None
    deepest = 10000
    for noterid, this_total in total_deep_per_no_terminal.items():
        # Only nodes that dominate every requested term qualify.
        if count_per_no_terminal.get(noterid, -1) == len(list_terms):
            if this_total < deepest:
                deepest = this_total
                deepest_and_common = noterid

    label = None
    if deepest_and_common is not None:
        label = self.label_for_nonter[deepest_and_common]
    return deepest_and_common, label
248
249
250
251
252
253
254
def __expand_node(self, node, include_this_node=True):
    """
    Recursively builds every upward path from the given node to a root.
    @type node: string
    @param node: identifier of the node to expand
    @type include_this_node: bool
    @param include_this_node: whether the starting node is prepended to each path
    @rtype: list
    @return: list of paths, each a list of node identifiers
    """
    parents = self.reachable_from.get(node, [])
    # A node with no outgoing edge is a root: the only path is itself.
    if not parents:
        return [[node]]
    paths = []
    for parent in parents:
        # Recursive calls include their own node; only the top-level
        # caller may exclude its starting node.
        for partial in self.__expand_node(parent):
            if include_this_node:
                partial.insert(0, node)
            paths.append(partial)
    return paths
268
def get_chunks(self, chunk_type):
    """
    Returns the chunks for a certain type.
    @type chunk_type: string
    @param chunk_type: type of the chunk
    @rtype: list
    @return: generator of sorted term-id lists, one per matching chunk
    """
    for nonter_id, label in self.label_for_nonter.items():
        if label != chunk_type:
            continue
        subsumed = self.terms_subsumed_by_nonter.get(nonter_id)
        if subsumed is not None:
            yield sorted(subsumed)
282
283
284
285
def get_all_chunks(self):
    """
    Generator over every chunk in the trees, yielding (phrase_label, set_of_term_ids)
    pairs.  Single-term chunks are kept only when the non-terminal is not the
    immediate parent of the term's terminal; a chunk whose span strictly contains
    another chunk's span is dropped (the deepest chunks win).
    """
    # NOTE(review): reconstructed from a mangled listing; the original name was
    # lost.  Two defects are fixed here: (1) a leftover debug "print ...; continue"
    # skipped the whole chunk-building body, so the generator never yielded
    # anything; (2) the counter n was never incremented, so every chunk got the
    # id 'chunk_0' and the id1 != id2 containment filter below could never fire.
    all_chunks = []
    n = 0
    for nonter, this_type in self.label_for_nonter.items():
        subsumed = self.terms_subsumed_by_nonter.get(nonter)
        if subsumed is None:
            continue
        if len(subsumed) > 1:
            all_chunks.append(('chunk_%d' % n, this_type, subsumed))
            n += 1
        else:
            # Single-term span: skip the node sitting directly above the terminal.
            tid = self.terminal_for_term.get(list(subsumed)[0])
            reachable = self.reachable_from[tid]
            if reachable[0] != nonter:
                all_chunks.append(('chunk_%d' % n, this_type, subsumed))
                n += 1

    # Iteratively drop any chunk whose span strictly contains another
    # chunk's span, until a fixed point is reached.
    while True:
        ids_to_be_removed = set()
        for id1, type1, span1 in all_chunks:
            for id2, type2, span2 in all_chunks:
                if id1 != id2 and len(span2) > len(span1):
                    if len(span1 & span2) == len(span1):
                        ids_to_be_removed.add(id2)

        all_chunks = [chunk for chunk in all_chunks
                      if chunk[0] not in ids_to_be_removed]
        if not ids_to_be_removed:
            break

    for this_id, this_type, this_span in all_chunks:
        yield this_type, this_span
328
329
def get_all_chunks_for_term(self, termid):
    """
    Returns all the chunks in which the term is contained.
    @type termid: string
    @param termid: the term identifier
    @rtype: list
    @return: generator of (phrase_label, sorted term-id list) pairs
    """
    terminal_id = self.terminal_for_term.get(termid)
    for path in self.paths_for_terminal[terminal_id]:
        for node in path:
            node_label = self.label_for_nonter[node]
            subsumed = self.terms_subsumed_by_nonter.get(node)
            if subsumed is not None:
                yield node_label, sorted(subsumed)
346