feature_extractor.dependency

16 """ 17 This is the main class for the information extraction 18 """

19 - def __init__(self,knaf_obj):

20 """ 21 The constructor of this class take one kaf/naf object 22 @type knaf_obj: kaf/naf object 23 @param knaf_obj: the kaf/naf object 24 """ 25 self.naf = knaf_obj 26 self.relations_for_term = {} 27 self.reverse_relations_for_term = {} 28 self.prefix_for_reverse = '' 29 30 31 already_linked = {} 32 for dep in knaf_obj.get_dependencies(): 33 term_from = dep.get_from() 34 term_to = dep.get_to() 35 rfunc = dep.get_function() 36 37 # Dependencies reversed are skipped... 38 #if rfunc.startswith('rhd/') or rfunc.startswith('whd/'): 39 # continue 40 41 # For detecting cycles like: 42 #  43 # <dep from="t19" to="t15" rfunc="rhd/body"/> 44 #  45 # <dep from="t15" to="t19" rfunc="hd/su"/> 46 47 ''' 48 if term_from in already_linked and term_to in already_linked[term_from]: 49 #There could be a cycle, skip this 50 print>>sys.stderr,'Skipped from',term_from,'to',term_to,'func',rfunc,' cycle detected' 51 continue 52 else: 53 #Include term_from as linked with term_to for future... 54 if term_to not in already_linked: 55 already_linked[term_to] = set() 56 already_linked[term_to].add(term_from) 57 ''' 58 59 60 61 62 if term_from in self.relations_for_term: 63 self.relations_for_term[term_from].append((rfunc,term_to)) 64 else: 65 self.relations_for_term[term_from] = [(rfunc,term_to)] 66 67 if term_to in self.reverse_relations_for_term: 68 self.reverse_relations_for_term[term_to].append((self.prefix_for_reverse+rfunc,term_from)) 69 else: 70 self.reverse_relations_for_term[term_to] = [(self.prefix_for_reverse+rfunc,term_from)] 71 72 73 self.paths_for_termid={} 74 self.sentence_for_termid={} 75 self.top_relation_for_term = {} ## termid --> (relation,topnode) 76 self.root_for_sentence = {} ## sentenceid --> termid 77 78 for term_obj in knaf_obj.get_terms(): 79 termid = term_obj.get_id() 80 81 #Calculating the sentence id for the term id 82 span_ids = term_obj.get_span().get_span_ids() 83 token_obj = knaf_obj.get_token(span_ids[0]) 84 if token_obj is None: 85 continue 86 87 sentence = token_obj.get_sent() 88 89 self.sentence_for_termid[termid] = sentence 90 ########################################### 91 92 #paths = self.__propagate_node(termid,[]) 93 #inversed = self.__reverse_propagate_node(termid) 94 95 ## Due to the change on direction of dependencies... 96 inversed = self.__propagate_node(termid,already_propagated=[]) 97 paths = self.__reverse_propagate_node(termid,already_propagated=[]) 98 99 ##Calculate the top relation for the node, the relation with the main root of the tree 100 if len(inversed) != 0: 101 for ip in inversed: 102 if len(ip)!=0: 103 self.top_relation_for_term[termid] = ip[-1] ## ex. ('NMOD', 't2') 104 root = ip[-1][1] 105 if sentence not in self.root_for_sentence: 106 self.root_for_sentence[sentence] = {} 107 108 if root not in self.root_for_sentence[sentence]: 109 self.root_for_sentence[sentence][root]=0 110 else: 111 self.root_for_sentence[sentence][root]+=1 112 break 113 114 self.paths_for_termid[termid] = paths + inversed 115 116 ''' 117 print termid 118 print 'DIRECT RELS' 119 for p in paths: 120 print ' ',p 121 122 print 'INDIRECT RELS' 123 for p in inversed: 124 print ' ',p 125 ''' 126 #### 127 128 for sent_id, distr in self.root_for_sentence.items(): 129 ## get_max_distr_dict imported from VUA_pylib.common 130 most_freq,c = get_max_distr_dict(distr) 131 self.root_for_sentence[sent_id] = most_freq

132 133 134 135

136 - def __propagate_node(self,node,already_propagated=[]):

137 paths = [] 138 139 relations = self.relations_for_term.get(node) 140 #print 'Propagate ',node,relations 141 if relations is None: ##Case base 142 paths = [[]] 143 elif node in already_propagated: 144 paths = [[]] 145 146 else: 147 already_propagated.append(node) 148 for func, target_node in relations: 149 new_paths = self.__propagate_node(target_node, already_propagated) 150 for new_path in new_paths: 151 new_path.insert(0,(func,target_node)) 152 paths.append(new_path) 153 return paths

154

155 - def __reverse_propagate_node(self,node,already_propagated=[]):

156 paths = [] 157 relations = self.reverse_relations_for_term.get(node) 158 #print 'Propagate reverse',node,relations,already_propagated 159 if relations is None: ##Case base 160 paths = [[]] 161 elif node in already_propagated: 162 paths = [[]] 163 else: 164 already_propagated.append(node) 165 for func, target_node in relations: 166 new_paths = self.__reverse_propagate_node(target_node,already_propagated) 167 for new_path in new_paths: 168 new_path.insert(0,(func,target_node)) 169 paths.append(new_path) 170 return paths

171 172 173 # Get the shortest path between 2 term ids

174 - def get_shortest_path(self,term1,term2):

175 """ 176 Returns the list of dependency labels of the shortest path between two terms 177 @type term1: string 178 @param term1: the term identifier for one term 179 @type term2: string 180 @param term2: the term identifier for the other term 181 @rtype: list 182 @return: list of dependency relations 183 """ 184 dep_path = None 185 if term1 == term2: dep_path = [] 186 else: 187 paths1 = self.paths_for_termid[term1] 188 paths2 = self.paths_for_termid[term2] 189 190 ##Check if term2 is on paths1 191 hits = [] ## list of (common_id,idx1,idx2,numpath1,numpath2) 192 for num1, p1 in enumerate(paths1): 193 ids1 = [ my_id for my_func, my_id in p1] 194 if term2 in ids1: 195 idx1 = ids1.index(term2) 196 hits.append((term2,idx1+0,idx1,0,num1,None)) 197 198 for num2,p2 in enumerate(paths2): 199 ids2 = [ my_id for my_func, my_id in p2] 200 if term1 in ids2: 201 idx2=ids2.index(term1) 202 hits.append((term1,0+idx2,0,idx2,None,num2)) 203 204 #Pair by pair 205 for num1, p1 in enumerate(paths1): 206 #print 'Path1',term1, p1 207 ids1 = [ my_id for my_func, my_id in p1] 208 #print 'IDS1',ids1 209 for num2, p2 in enumerate(paths2): 210 #print '\t',term2,p2 211 ids2 = [ my_id for my_func, my_id in p2] 212 #print ' IDS2',ids2 213 common_ids = set(ids1) & set(ids2) 214 #print ' cmmon',common_ids 215 for common_id in common_ids: 216 idx1 = ids1.index(common_id) 217 idx2 = ids2.index(common_id) 218 hits.append((common_id,idx1+idx2,idx1,idx2,num1,num2)) 219 220 221 if len(hits) != 0: 222 dep_path = [] 223 hits.sort(key=itemgetter(1)) 224 best_hit = hits[0] 225 common_id, _, idx1, idx2, numpath1, numpath2 = best_hit 226 227 if numpath2 is None: #term2 is in one of the paths of t1 228 path1 = paths1[numpath1] 229 my_rels1 = path1[:idx1+1] 230 ##complete_path = '' 231 ##complete_path_ids = '' 232 for func,node in my_rels1: 233 dep_path.append(func) 234 ##complete_path+=func+'#' 235 ##complete_path_ids+=node+'#' 236 237 #=========================================================== 238 # print 'CASE1',best_hit 239 # print complete_path 240 # print complete_path_ids 241 #=========================================================== 242 elif numpath1 is None: #term1 is in one of the paths of t2 243 path2 = paths2[numpath2] 244 my_rels2 = path2[:idx2+1] 245 ##complete_path = '' 246 ##complete_path_ids = '' 247 for func,node in my_rels2: 248 dep_path.append(func) 249 #complete_path+=func+'#' 250 #complete_path_ids+=node+'#' 251 252 #=========================================================== 253 # print 'CASE2',best_hit 254 # print complete_path 255 # print complete_path_ids 256 #=========================================================== 257 else: #There is a common node linking both 258 path1 = paths1[numpath1] 259 my_rels1 = path1[:idx1+1] 260 261 path2 = paths2[numpath2] 262 my_rels2 = path2[:idx2+1] 263 264 ##complete_path = '' 265 #complete_path_ids = '' 266 for func,node in my_rels1: 267 dep_path.append(func) 268 ##complete_path+=func+'#' 269 #complete_path_ids+=func+'->'+self.naf.get_term(node).get_lemma()+'->' 270 271 for func,node in my_rels2[-1::-1]: 272 dep_path.append(func) 273 ##complete_path+=func+'#' 274 #complete_path_ids+=func+'->'+self.naf.get_term(node).get_lemma()+'->' 275 #=========================================================== 276 # 277 # print complete_path 278 # print complete_path_ids 279 # print path2 280 # print my_rels1 281 # print my_rels2 282 # print 'CASE3',best_hit 283 #=========================================================== 284 return dep_path

285 286 ## Get the shortest dependency path between 2 sets of spans

287 - def get_shortest_path_spans(self,span1,span2):

288 """ 289 Returns the list of dependency labels of the shortest path between two span of terms 290 @type span1: list 291 @param span1: list of term identifiers 292 @type span2: list 293 @param span2: list of term identifiers 294 @rtype: list 295 @return: list of dependency relations 296 """ 297 shortest_path = None 298 299 for term1 in span1: 300 for term2 in span2: 301 this_path = self.get_shortest_path(term1, term2) 302 #print term1,term2, this_path 303 if shortest_path is None or (this_path is not None and len(this_path)<len(shortest_path)): 304 shortest_path = this_path 305 return shortest_path

306 307 # Get the dependency path to the sentence root for a term id

308 - def get_path_to_root(self,termid):

309 """ 310 Returns the dependency path from the term to the root 311 @type termid: string 312 @param termid: the term identifier 313 @rtype: list 314 @return: list of dependency relations 315 """ 316 # Get the sentence for the term 317 root = None 318 sentence = self.sentence_for_termid.get(termid) 319 320 if sentence is None: #try with the top node 321 top_node = self.top_relation_for_term.get(termid) 322 if top_node is not None: 323 root = top_node[1] 324 else: 325 return None 326 else: 327 if sentence in self.root_for_sentence: 328 root = self.root_for_sentence[sentence] 329 else: 330 ##There is no root for this sentence 331 return None 332 # In this point top_node should be properly set 333 path = self.get_shortest_path(termid, root) 334 return path

335 336 # Get the shortest dependency path to the sentence root for a span of ids 337 # extractor.get_shortest_path_to_root_span(['t444','t445','t446'])

338 - def get_shortest_path_to_root_span(self,span):

339 """ 340 Returns the dependency path from a span of terms to the root 341 @type span: list 342 @param span: list of term identifiers 343 @rtype: list 344 @return: list of dependency relations 345 """ 346 shortest_path = None 347 for termid in span: 348 this_path = self.get_path_to_root(termid) 349 ## In case of , or . or whatever, the path to the root usually is None, there are no dependencies... 350 if shortest_path is None or (this_path is not None and len(this_path) < len(shortest_path)): 351 shortest_path = this_path 352 return shortest_path

353 354 # Get all terms that are embedded under a given head (its dependents and dependents of its dependents

355 - def get_full_dependents(self, term_id, relations, counter = 0):

356 """ 357 Returns the complete list of dependents and embedded dependents of a certain term. 358 """ 359 counter += 1 360 deps = self.relations_for_term 361 if term_id in deps and len(deps.get(term_id)) > 0: 362 for dep in deps.get(term_id): 363 if not dep[1] in relations: 364 relations.append(dep[1]) 365 if dep[1] in deps: 366 deprelations = self.get_full_dependents(dep[1], relations, counter) 367 for deprel in deprelations: 368 if not deprel in relations: 369 relations.append(deprel) 370 371 return relations

Source Code for Module feature_extractor.dependency