#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <discoursegraphs.programming@arne.cl>

"""
The ``tiger`` module converts a ``TigerXML`` file into a networkx-based
document graph.
"""

import sys
import os
import re
from lxml import etree, objectify
from networkx import write_gpickle

from discoursegraphs import DiscourseDocumentGraph
from discoursegraphs.util import natural_sort_key, ensure_unicode


class TigerDocumentGraph(DiscourseDocumentGraph):
    """
    A directed graph with multiple edges (based on networkx.MultiDiGraph)
    that represents all the sentences contained in a TigerXML file. A
    ``TigerDocumentGraph`` contains a document root node (whose ID is
    stored in ``self.root``), which has an outgoing edge to the sentence
    root node of each sentence.

    Attributes
    ----------
    corpus_id : str
        ID of the TigerXML document specified in the 'id' attribute
        of the <corpus> element
    root : str
        the ID of the root node of the document graph
    sentences : list of str
        sorted list of all sentence root node IDs (of sentences contained
        in this document graph)

    The attribute dict of each sentence root node contains a key
    ``tokens``, which maps to a sorted list of token node IDs (str).
    To print all tokens of a Tiger document, just do::

        tdg = TigerDocumentGraph('/path/to/tiger.file')
        for sentence_root_node in tdg.sentences:
            for token_node_id in tdg.node[sentence_root_node]['tokens']:
                print tdg.node[token_node_id]['tiger:word']
    """
    def __init__(self, tiger_filepath, name=None):
        """
        Creates a directed graph that represents all syntax annotated
        sentences in the given TigerXML file.

        Parameters
        ----------
        tiger_filepath : str
            absolute or relative path to the TigerXML file to be parsed
        name : str or None
            the name or ID of the graph to be generated. If no name is
            given, the basename of the input file is used.
        """
        # super calls __init__() of base class DiscourseDocumentGraph
        super(TigerDocumentGraph, self).__init__()

        utf8_parser = etree.XMLParser(encoding="utf-8")
        tigerxml_tree = etree.parse(tiger_filepath, utf8_parser)
        tigerxml_root = tigerxml_tree.getroot()

        if name is not None:
            self.name = name
        else:  # if no name is given, use the basename of the input file
            self.name = os.path.basename(tiger_filepath)
        self.corpus_id = tigerxml_root.attrib['id']

        # add root node of TigerDocumentGraph
        self.root = 'tiger:root_node'
        self.add_node(self.root, layers={'tiger'})

        self.sentences = []
        for sentence in tigerxml_root.iterfind('./body/s'):
            self.__add_sentence_to_document(sentence)
        self.sentences = sorted(self.sentences, key=natural_sort_key)

    def __add_sentence_to_document(self, sentence):
        """
        Converts a sentence into a TigerSentenceGraph and adds all its
        nodes, edges (and their features) to this graph.

        This also adds an edge from the root node of this document graph
        to the root node of the sentence and appends the sentence root
        node ID to ``self.sentences``.

        Parameters
        ----------
        sentence : lxml.etree._Element
            a sentence from a TigerXML file in etree element format
        """
        sentence_graph = TigerSentenceGraph(sentence)
        sentence_root_node_id = sentence_graph.root

        self.add_nodes_from(sentence_graph.nodes(data=True))
        self.add_edges_from(sentence_graph.edges(data=True))

        self.add_edge(self.root, sentence_root_node_id,
                      layers={'tiger', 'tiger:sentence'})
        self.sentences.append(sentence_root_node_id)

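# Usage sketch (the file path is hypothetical): build a document graph from
# a TigerXML file and inspect it, mirroring the class docstring above.
#
#     tdg = TigerDocumentGraph('/path/to/tiger.xml')
#     tdg.corpus_id                        # 'id' attribute of <corpus>
#     len(tdg.sentences)                   # number of sentence root nodes
#     first_sent = tdg.sentences[0]
#     tdg.node[first_sent]['tokens'][:5]   # first five token node IDs

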
class TigerSentenceGraph(DiscourseDocumentGraph):
    """
    A directed graph (based on a networkx.MultiDiGraph) that represents
    one syntax annotated sentence extracted from a TigerXML file.

    Attributes
    ----------
    root : str
        node ID of the root node of the sentence
    tokens : list of str
        a sorted list of terminal node IDs (i.e. token nodes)
    """
    def __init__(self, sentence):
        """
        Creates a directed graph from a syntax annotated sentence (i.e.
        a <s> element from a TigerXML file parsed into an lxml etree
        Element).

        For performance reasons, a sorted list of terminals (i.e. nodes
        representing tokens) is stored under ``self.tokens``.

        Parameters
        ----------
        sentence : lxml.etree._Element
            a sentence from a TigerXML file in etree element format
        """
        # super calls __init__() of base class DiscourseDocumentGraph
        super(TigerSentenceGraph, self).__init__()

        graph_element = sentence.find('./graph')
        sentence_root_id = graph_element.attrib['root']

        # sentence.attrib is an lxml.etree._Attrib, which is 'dict-like'
        # but doesn't behave exactly like a dict (i.e. it threw an error
        # when I tried to update it)
        sentence_attributes = add_prefix(sentence.attrib, 'tiger:')

        # some sentences in the Tiger corpus are marked as discontinuous
        if 'discontinuous' in graph_element.attrib:
            sentence_attributes.update(
                {'tiger:discontinuous':
                 graph_element.attrib['discontinuous']})

        self.__add_vroot(sentence_root_id, sentence_attributes)
        self.__tigersentence2graph(sentence)
        self.__repair_unconnected_nodes()

    def __tigersentence2graph(self, sentence):
        """
        Reads a sentence with syntax annotation (parsed from a TigerXML
        file) into this directed graph. Adds an attribute named 'tokens'
        to the sentence root node containing a sorted list of token node
        IDs.

        Parameters
        ----------
        sentence : lxml.etree._Element
            a sentence from a TigerXML file in etree element format
        """
        token_ids = []
        for t in sentence.iterfind('./graph/terminals/t'):
            terminal_id = t.attrib['id']
            token_ids.append(terminal_id)
            terminal_features = add_prefix(t.attrib, 'tiger:')
            # convert tokens to unicode
            terminal_features['tiger:word'] = ensure_unicode(
                terminal_features['tiger:word'])
            self.add_node(terminal_id, layers={'tiger', 'tiger:token'},
                          attr_dict=terminal_features)
            for secedge in t.iterfind('./secedge'):
                to_id = secedge.attrib['idref']
                secedge_attribs = add_prefix(secedge.attrib, 'tiger:')
                if to_id not in self:  # graph doesn't contain to-node, yet
                    self.add_node(to_id, layers={'tiger', 'tiger:secedge'})
                self.add_edge(terminal_id, to_id,
                              layers={'tiger', 'tiger:secedge'},
                              attr_dict=secedge_attribs)

        # add sorted list of all token node IDs to sentence root node
        # for performance reasons
        sorted_token_ids = sorted(token_ids, key=natural_sort_key)
        self.node[self.root].update({'tokens': sorted_token_ids})

        for nt in sentence.iterfind('./graph/nonterminals/nt'):
            from_id = nt.attrib['id']
            nt_feats = add_prefix(nt.attrib, 'tiger:')
            if from_id in self:  # root node already exists,
                                 # but doesn't have a cat value
                self.node[from_id].update(nt_feats)
            else:
                self.add_node(from_id, layers={'tiger', 'tiger:syntax'},
                              attr_dict=nt_feats)
            for edge in nt.iterfind('./edge'):
                to_id = edge.attrib['idref']
                if to_id not in self:  # graph doesn't contain to-node, yet
                    self.add_node(to_id, layers={'tiger', 'tiger:secedge'})
                edge_attribs = add_prefix(edge.attrib, 'tiger:')
                self.add_edge(from_id, to_id,
                              layers={'tiger', 'tiger:edge'},
                              attr_dict=edge_attribs)
            for secedge in nt.iterfind('./secedge'):
                to_id = secedge.attrib['idref']
                if to_id not in self:  # graph doesn't contain to-node, yet
                    self.add_node(to_id, layers={'tiger', 'tiger:secedge'})
                secedge_attribs = add_prefix(secedge.attrib, 'tiger:')
                self.add_edge(from_id, to_id,
                              layers={'tiger', 'tiger:secedge'},
                              attr_dict=secedge_attribs)

    def __add_vroot(self, sentence_root_id, sentence_attributes):
        """
        Adds a new virtual root node (with the ID 'VROOT-<sentence id>')
        to this sentence graph. The 'VROOT' node will have an outgoing
        edge to the node that has previously been considered the root
        node of the sentence and will have the attributes extracted from
        the <s> element of the corresponding sentence in the TigerXML
        file.

        The ``TigerSentenceGraph.root`` attribute will be set as well.

        Why do we do this? 'VROOT' (virtual root) nodes are commonly used
        in the Tiger corpus (version 2.1). They are useful whenever a
        sentence does not have any nonterminals (e.g. if there is no full
        syntax structure annotation in the case of a three word headline
        'sentence').

        Parameters
        ----------
        sentence_root_id : str
            the ID of the root node of the sentence, extracted from the
            ``root`` attribute of the ``<graph>`` element of the
            corresponding sentence in the TigerXML file.
        sentence_attributes : dict of (str, str)
            a dictionary of sentence attributes extracted from the <s>
            element (corresponding to this sentence) of a TigerXML file.
            Contains the attributes ``tiger:id``, ``tiger:art_id`` and
            ``tiger:orig_id``.
        """
        old_root_node_id = sentence_root_id
        sentence_id = sentence_attributes['tiger:id']
        new_root_node_id = 'VROOT-{0}'.format(sentence_id)

        self.add_node(old_root_node_id,
                      layers={'tiger', 'tiger:sentence',
                              'tiger:sentence:root'})
        self.add_node(new_root_node_id,
                      layers={'tiger', 'tiger:sentence',
                              'tiger:sentence:vroot'},
                      attr_dict=sentence_attributes)
        self.add_edge(new_root_node_id, old_root_node_id,
                      layers={'tiger', 'tiger:sentence',
                              'tiger:sentence:vroot'})
        self.root = new_root_node_id

    def __repair_unconnected_nodes(self):
        """
        Adds an edge from the 'VROOT' node to all previously unconnected
        nodes (token nodes that either represent a punctuation mark or
        are part of a headline 'sentence' that has no full syntax
        structure annotation).
        """
        unconnected_node_ids = get_unconnected_nodes(self)
        for unconnected_node_id in unconnected_node_ids:
            self.add_edge(self.root, unconnected_node_id,
                          layers={'tiger', 'tiger:sentence'})

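# Illustration of the VROOT scheme implemented above (element and node IDs
# are invented): for a sentence <s id="s1"> whose <graph root="s1_500">,
# ``__add_vroot`` adds a node 'VROOT-s1' carrying the <s> attributes plus a
# 'tiger:sentence:vroot' edge from 'VROOT-s1' to 's1_500', and sets
# ``TigerSentenceGraph.root`` to 'VROOT-s1'. Token nodes that remain
# unconnected (e.g. punctuation, or headline 'sentences' without syntax
# annotation) are then attached directly to 'VROOT-s1' by
# ``__repair_unconnected_nodes()``:
#
#     sg = TigerSentenceGraph(s_element)   # s_element: an lxml <s> element
#     sg.root                              # -> 'VROOT-s1'
#     sg.node[sg.root]['tiger:id']         # -> 's1'

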
def _get_terminals_and_nonterminals(sentence_graph):
    """
    Given a TigerSentenceGraph, returns a sorted list of terminal node
    IDs, as well as a sorted list of nonterminal node IDs.

    Parameters
    ----------
    sentence_graph : TigerSentenceGraph
        a directed graph representing one syntax annotated sentence from
        a TigerXML file

    Returns
    -------
    terminals, nonterminals : list of str
        a sorted list of terminal node IDs and a sorted list of
        nonterminal node IDs
    """
    terminals = set()
    nonterminals = set()
    for node_id in sentence_graph.nodes_iter():
        if sentence_graph.out_degree(node_id) > 0:
            # all nonterminals (incl. root)
            nonterminals.add(node_id)
        else:  # terminals
            terminals.add(node_id)
    return sorted(list(terminals), key=natural_sort_key), \
        sorted(list(nonterminals), key=natural_sort_key)

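# Example (node IDs are invented): nodes with outgoing edges are classified
# as nonterminals, leaf nodes as terminals.
#
#     terminals, nonterminals = _get_terminals_and_nonterminals(sent_graph)
#     # terminals    -> e.g. ['s1_1', 's1_2', 's1_3', ...]
#     # nonterminals -> e.g. ['VROOT-s1', 's1_500', 's1_501', ...]

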
def get_unconnected_nodes(sentence_graph):
    """
    Takes a TigerSentenceGraph and returns a list of node IDs of
    unconnected nodes. A node is unconnected if it doesn't have any in-
    or outgoing edges. A node is NOT considered unconnected if the graph
    only consists of that particular node.

    Parameters
    ----------
    sentence_graph : TigerSentenceGraph
        a directed graph representing one syntax annotated sentence from
        a TigerXML file

    Returns
    -------
    unconnected_node_ids : list of str
        a list of node IDs of unconnected nodes
    """
    return [node for node in sentence_graph.nodes_iter()
            if sentence_graph.degree(node) == 0
            and sentence_graph.number_of_nodes() > 1]

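# Example (sketch, using a plain networkx 1.x MultiDiGraph as a stand-in for
# a TigerSentenceGraph; both expose ``nodes_iter()`` and ``degree()``):
#
#     import networkx
#     g = networkx.MultiDiGraph()
#     g.add_edge('VROOT-s1', 's1_500')
#     g.add_node('s1_3')          # e.g. an unattached punctuation token
#     get_unconnected_nodes(g)    # -> ['s1_3']

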
def add_prefix(dict_like, prefix):
    """
    Takes a dict (or dict-like object, e.g. etree._Attrib) and adds the
    given prefix to each key. Always returns a dict (via a typecast).

    Parameters
    ----------
    dict_like : dict (or similar)
        a dictionary or a container that implements .items()
    prefix : str
        the prefix string to be prepended to each key in the input dict

    Returns
    -------
    prefixed_dict : dict
        A dict, in which each key begins with the given prefix.
    """
    if not isinstance(dict_like, dict):
        try:
            dict_like = dict(dict_like)
        except Exception as e:
            raise ValueError("{0}\nCan't convert container to dict: "
                             "{1}".format(e, dict_like))
    return {prefix + k: v for (k, v) in dict_like.items()}

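# Example (illustrative attribute values): ``add_prefix`` is what turns raw
# TigerXML attributes into the 'tiger:'-prefixed feature names used
# throughout this module.
#
#     add_prefix({'id': 's1_1', 'word': 'Veruntreute', 'pos': 'VVFIN'},
#                'tiger:')
#     # -> {'tiger:id': 's1_1', 'tiger:word': 'Veruntreute',
#     #     'tiger:pos': 'VVFIN'}

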
def tiger_tokenlist(tigerdoc_graph):
    """
    Extracts all tokens from a TigerDocumentGraph.

    Parameters
    ----------
    tigerdoc_graph : TigerDocumentGraph
        a directed graph representing a TigerXML file and all the
        annotated sentences found in it.

    Returns
    -------
    all_tiger_tokens : list of (unicode, str, str) tuples
        a list of (unicode, str, str) tuples, where the first element is
        the token, the second is the sentence root node ID of the
        corresponding sentence and the third is the token node ID.
    """
    all_tiger_tokens = []
    for sent_id in tigerdoc_graph.sentences:
        tiger_sent_tokens = [(tigerdoc_graph.node[token_id]['tiger:word'],
                              sent_id,
                              token_id)
                             for token_id
                             in tigerdoc_graph.node[sent_id]['tokens']]
        all_tiger_tokens.extend(tiger_sent_tokens)
    return all_tiger_tokens


#~ def graph2tigersentence(sentence_graph):
    #~ """
    #~ @param sentence_graph: a directed graph containing a Tiger format
    #~     sentence structure and annotations (syntax and morphology)
    #~ @type sentence_graph: DiscourseDocumentGraph
    #~ """
    #~ terminals, nonterminals = _get_terminals_and_nonterminals(sentence_graph)
    #~ sentence_root = objectify.Element('s', sentence_graph.metadata)
    #~ graph_root = objectify.SubElement(sentence_root, 'graph', root=sentence_graph.root)
    #~ terms_node = objectify.SubElement(graph_root, 'terminals')
    #~ for term_id in terminals:
        #~ terms_node.append(objectify.Element('t', sentence_graph.node[term_id], id=term_id))
    #~
    #~ nonterms_node = objectify.SubElement(graph_root, 'nonterminals')
    #~ for nt_id in nonterminals:
        #~ nt_node = objectify.Element('nt', sentence_graph.node[nt_id], id=nt_id)
        #~ # edges are grouped by their source node in TigerXML
        #~ for out_edge in sentence_graph.out_edges(nt_id, data=True):
            #~ from_id, to_id, edge_attribs = out_edge
            #~ nt_node.append(objectify.Element('edge', edge_attribs))
        #~ nonterms_node.append(nt_node)
    #~
    #~ objectify.deannotate(sentence_root, cleanup_namespaces=True)
    #~ return sentence_root

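# Example return value of ``tiger_tokenlist`` (tokens and node IDs are
# invented); the sentence root node IDs are the 'VROOT-*' IDs introduced by
# TigerSentenceGraph:
#
#     tiger_tokenlist(tiger_docgraph)
#     # -> [(u'Veruntreute', 'VROOT-s1', 's1_1'),
#     #     (u'die', 'VROOT-s1', 's1_2'),
#     #     ...]

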
if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.stderr.write(
            'Usage: {0} TigerXML_input_file '
            'networkx_pickle_output_file\n'.format(sys.argv[0]))
        sys.exit(1)
    else:
        tiger_filepath = sys.argv[1]
        pickle_filepath = sys.argv[2]
        assert os.path.isfile(tiger_filepath)
        tiger_docgraph = TigerDocumentGraph(tiger_filepath)
        write_gpickle(tiger_docgraph, pickle_filepath)
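
# Example invocation of this module as a script (file names are
# hypothetical); the resulting pickle can be reloaded with
# ``networkx.read_gpickle``:
#
#     python discoursegraphs/readwrite/tiger.py corpus.xml corpus.gpickle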