#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <discoursegraphs.programming@arne.cl>
"""
The ``tiger`` module converts a ``TigerXML`` file into a networkx-based
document graph.
"""
import sys
import os
import re
from lxml import etree, objectify
from networkx import write_gpickle
from discoursegraphs import DiscourseDocumentGraph
from discoursegraphs.util import natural_sort_key, ensure_unicode
class TigerDocumentGraph(DiscourseDocumentGraph):
"""
A directed graph with multiple edges (based on
networkx.MultiDiGraph) that represents all the
sentences contained in a TigerXML file. A ``TigerDocumentGraph``
contains a document root node (whose ID is stored in ``self.root``),
which has an outgoing edge to the sentence root nodes of each
sentence.
Attributes
----------
corpus_id : str
ID of the TigerXML document specified in the 'id' attribute
of the <corpus> element
root : str
the ID of the root node of the document graph
sentences : list of str
sorted list of all sentence root node IDs (of sentences
contained in this document graph)
The attribute dict of each sentence root node contains a key
``tokens``, which maps to a sorted list of token node IDs (str). To
print all tokens of a Tiger document, just do::
tdg = TigerDocumentGraph('/path/to/tiger.file')
for sentence_root_node in tdg.sentences:
for token_node_id in tdg.node[sentence_root_node]['tokens']:
print tdg.node[token_node_id]['tiger:word']
"""
def __init__(self, tiger_filepath, name=None):
"""
Creates a directed graph that represents all syntax annotated
sentences in the given TigerXML file.
Parameters
----------
tiger_filepath : str
absolute or relative path to the TigerXML file to be parsed
name : str or None
the name or ID of the graph to be generated. If no name is
given, the basename of the input file is used.
"""
# super calls __init__() of base class DiscourseDocumentGraph
super(TigerDocumentGraph, self).__init__()
utf8_parser = etree.XMLParser(encoding="utf-8")
tigerxml_tree = etree.parse(tiger_filepath, utf8_parser)
tigerxml_root = tigerxml_tree.getroot()
        self.name = name if name else os.path.basename(tiger_filepath)
self.corpus_id = tigerxml_root.attrib['id']
# add root node of TigerDocumentGraph
self.root = 'tiger:root_node'
self.add_node(self.root, layers={'tiger'})
self.sentences = []
for sentence in tigerxml_root.iterfind('./body/s'):
self.__add_sentence_to_document(sentence)
self.sentences = sorted(self.sentences, key=natural_sort_key)
def __add_sentence_to_document(self, sentence):
"""
Converts a sentence into a TigerSentenceGraph and adds all
its nodes, edges (and their features) to this graph.
This also adds an edge from the root node of this document
graph to the root node of the sentence and appends the
sentence root node ID to ``self.sentences``.
Parameters
----------
sentence : lxml.etree._Element
a sentence from a TigerXML file in etree element format
"""
sentence_graph = TigerSentenceGraph(sentence)
sentence_root_node_id = sentence_graph.root
self.add_nodes_from(sentence_graph.nodes(data=True))
self.add_edges_from(sentence_graph.edges(data=True))
self.add_edge(self.root, sentence_root_node_id,
layers={'tiger', 'tiger:sentence'})
self.sentences.append(sentence_root_node_id)
class TigerSentenceGraph(DiscourseDocumentGraph):
"""
A directed graph (based on a networkx.MultiDiGraph) that represents
one syntax annotated sentence extracted from a TigerXML file.
Attributes
----------
root : str
node ID of the root node of the sentence
    tokens : list of str
        a sorted list of terminal node IDs (i.e. token nodes), stored in
        the attribute dict of the sentence root node
"""
def __init__(self, sentence):
"""
Creates a directed graph from a syntax annotated sentence (i.e.
a <s> element from a TigerXML file parsed into an lxml etree
        Element). For performance reasons, a sorted list of terminal
        node IDs (i.e. nodes representing tokens) is stored in the
        attribute dict of the sentence root node under the key ``tokens``.
Parameters
----------
sentence : lxml.etree._Element
a sentence from a TigerXML file in etree element format
"""
# super calls __init__() of base class DiscourseDocumentGraph
super(TigerSentenceGraph, self).__init__()
graph_element = sentence.find('./graph')
sentence_root_id = graph_element.attrib['root']
        # sentence.attrib is an lxml.etree._Attrib, which is 'dict-like'
        # but doesn't behave exactly like a dict (e.g. calling .update()
        # on it raises an error)
sentence_attributes = add_prefix(sentence.attrib, 'tiger:')
# some sentences in the Tiger corpus are marked as discontinuous
if 'discontinuous' in graph_element.attrib:
sentence_attributes.update(
{'tiger:discontinuous': graph_element.attrib['discontinuous']})
self.__add_vroot(sentence_root_id, sentence_attributes)
self.__tigersentence2graph(sentence)
self.__repair_unconnected_nodes()
def __tigersentence2graph(self, sentence):
"""
Reads a sentence with syntax annotation (parsed from a TigerXML
file) into this directed graph. Adds an attribute named 'tokens'
to the sentence root node containing a sorted list of token node
IDs.
Parameters
----------
sentence : lxml.etree._Element
a sentence from a TigerXML file in etree element format
"""
token_ids = []
for t in sentence.iterfind('./graph/terminals/t'):
terminal_id = t.attrib['id']
token_ids.append(terminal_id)
terminal_features = add_prefix(t.attrib, 'tiger:')
# convert tokens to unicode
terminal_features['tiger:word'] = ensure_unicode(
terminal_features['tiger:word'])
self.add_node(terminal_id, layers={'tiger', 'tiger:token'},
attr_dict=terminal_features)
for secedge in t.iterfind('./secedge'):
to_id = secedge.attrib['idref']
secedge_attribs = add_prefix(secedge.attrib, 'tiger:')
                if to_id not in self: # if graph doesn't contain to-node, yet
self.add_node(to_id, layers={'tiger', 'tiger:secedge'})
self.add_edge(terminal_id, to_id,
layers={'tiger', 'tiger:secedge'},
attr_dict=secedge_attribs)
# add sorted list of all token node IDs to sentence root node
# for performance reasons
sorted_token_ids = sorted(token_ids, key=natural_sort_key)
self.node[self.root].update({'tokens': sorted_token_ids})
for nt in sentence.iterfind('./graph/nonterminals/nt'):
from_id = nt.attrib['id']
nt_feats = add_prefix(nt.attrib, 'tiger:')
if from_id in self: # root node already exists,
# but doesn't have a cat value
self.node[from_id].update(nt_feats)
else:
self.add_node(from_id, layers={'tiger', 'tiger:syntax'},
attr_dict=nt_feats)
for edge in nt.iterfind('./edge'):
to_id = edge.attrib['idref']
if to_id not in self: # if graph doesn't contain to-node, yet
self.add_node(to_id, layers={'tiger', 'tiger:secedge'})
edge_attribs = add_prefix(edge.attrib, 'tiger:')
self.add_edge(from_id, to_id,
layers={'tiger', 'tiger:edge'},
attr_dict=edge_attribs)
for secedge in nt.iterfind('./secedge'):
to_id = secedge.attrib['idref']
if to_id not in self: # if graph doesn't contain to-node, yet
self.add_node(to_id, layers={'tiger', 'tiger:secedge'})
secedge_attribs = add_prefix(secedge.attrib, 'tiger:')
self.add_edge(from_id, to_id,
layers={'tiger', 'tiger:secedge'},
attr_dict=secedge_attribs)
def __add_vroot(self, sentence_root_id, sentence_attributes):
"""
Adds a new node with the ID 'VROOT' to this sentence graph.
The 'VROOT' node will have an outgoing edge to the node that has
previously been considered the root node of the sentence and
will have the attributes extracted from the <s> element of the
corresponding sentence in the TigerXML file.
The ``TigerSentenceGraph.root`` attribute will be set as well.
Why do we do this?
'VROOT' (virtual root) nodes are commonly used in the Tiger
corpus (version 2.1). They are useful whenever a sentence does
        not have any nonterminals (e.g. when there is no full syntax
        structure annotation, as in the case of a three-word headline
        'sentence').
Parameters
----------
sentence_root_id : str
the ID of the root node of the sentence, extracted from the
``root`` attribute of the ``<graph>`` element of the
corresponding sentence in the TigerXML file.
sentence_attributes : dict of (str, str)
a dictionary of sentence attributes extracted from the <s>
element (corresponding to this sentence) of a TigerXML file.
            It contains the attributes ``tiger:id``, ``tiger:art_id`` and
``tiger:orig_id``.
"""
old_root_node_id = sentence_root_id
sentence_id = sentence_attributes['tiger:id']
new_root_node_id = 'VROOT-{0}'.format(sentence_id)
self.add_node(old_root_node_id,
layers={'tiger', 'tiger:sentence', 'tiger:sentence:root'})
self.add_node(new_root_node_id,
layers={
'tiger', 'tiger:sentence', 'tiger:sentence:vroot'},
attr_dict=sentence_attributes)
self.add_edge(new_root_node_id, old_root_node_id,
layers={'tiger', 'tiger:sentence', 'tiger:sentence:vroot'})
self.root = new_root_node_id
def __repair_unconnected_nodes(self):
"""
Adds an edge from the 'VROOT' node to all previously unconnected
        nodes (token nodes that either represent a punctuation mark or
are part of a headline 'sentence' that has no full syntax
structure annotation).
"""
unconnected_node_ids = get_unconnected_nodes(self)
for unconnected_node_id in unconnected_node_ids:
self.add_edge(self.root, unconnected_node_id,
layers={'tiger', 'tiger:sentence'})
def _get_terminals_and_nonterminals(sentence_graph):
"""
Given a TigerSentenceGraph, returns a sorted list of terminal node
IDs, as well as a sorted list of nonterminal node IDs.
Parameters
----------
sentence_graph : TigerSentenceGraph
a directed graph representing one syntax annotated sentence from
a TigerXML file
Returns
-------
terminals, nonterminals : list of str
a sorted list of terminal node IDs and a sorted list of
nonterminal node IDs
"""
terminals = set()
nonterminals = set()
for node_id in sentence_graph.nodes_iter():
if sentence_graph.out_degree(node_id) > 0:
# all nonterminals (incl. root)
nonterminals.add(node_id)
else: # terminals
terminals.add(node_id)
return sorted(list(terminals), key=natural_sort_key), \
sorted(list(nonterminals), key=natural_sort_key)
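# Example (a sketch with hypothetical node IDs): in a sentence graph with
# the edges 'VROOT-s1' -> 's1_500', 's1_500' -> 's1_1' and
# 's1_500' -> 's1_2', the terminals are 's1_1' and 's1_2' (out-degree 0)
# and the nonterminals are 's1_500' and 'VROOT-s1'; both lists are
# returned in natural sort order.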
def get_unconnected_nodes(sentence_graph):
"""
Takes a TigerSentenceGraph and returns a list of node IDs of
unconnected nodes.
    A node is unconnected if it has neither incoming nor outgoing edges.
    A node is NOT considered unconnected if it is the only node in the
    graph.
Parameters
----------
sentence_graph : TigerSentenceGraph
a directed graph representing one syntax annotated sentence from
a TigerXML file
Returns
-------
unconnected_node_ids : list of str
a list of node IDs of unconnected nodes
"""
return [node for node in sentence_graph.nodes_iter()
if sentence_graph.degree(node) == 0 and
sentence_graph.number_of_nodes() > 1]
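# Example (a sketch with hypothetical node IDs): in a sentence graph that
# contains the edge 'VROOT-s1' -> 's1_1' plus an isolated punctuation
# token 's1_2', get_unconnected_nodes() returns ['s1_2'].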
def add_prefix(dict_like, prefix):
"""
    Takes a dict (or dict-like object, e.g. etree._Attrib) and prepends
    the given prefix to each key. Always returns a new dict (the input
    is converted via dict() if necessary).
Parameters
----------
dict_like : dict (or similar)
a dictionary or a container that implements .items()
prefix : str
the prefix string to be prepended to each key in the input dict
Returns
-------
prefixed_dict : dict
A dict, in which each key begins with the given prefix.
"""
if not isinstance(dict_like, dict):
try:
dict_like = dict(dict_like)
        except (TypeError, ValueError) as e:
raise ValueError("{0}\nCan't convert container to dict: "
"{1}".format(e, dict_like))
return {prefix + k: v for (k, v) in dict_like.items()}
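# Example: add_prefix({'word': 'Haus', 'pos': 'NN'}, 'tiger:') returns
# {'tiger:word': 'Haus', 'tiger:pos': 'NN'}; an etree._Attrib is handled
# the same way, since it is converted to a plain dict first.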
def tiger_tokenlist(tigerdoc_graph):
"""
    Extracts all tokens from a TigerDocumentGraph.
Parameters
----------
tigerdoc_graph : TigerDocumentGraph
a directed graph representing a TigerXML file and all the
annotated sentences found in it.
Returns
-------
    all_tiger_tokens : list of (unicode, str, str) tuples
        a list of (unicode, str, str) tuples, where the first element
        is the token, the second is the root node ID of the
        corresponding sentence and the third is the token node ID.
"""
all_tiger_tokens = []
for sent_id in tigerdoc_graph.sentences:
tiger_sent_tokens = [(tigerdoc_graph.node[token_id]['tiger:word'], sent_id, token_id)
for token_id in tigerdoc_graph.node[sent_id]['tokens']]
all_tiger_tokens.extend(tiger_sent_tokens)
return all_tiger_tokens
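# Example of the returned structure (a sketch with hypothetical IDs):
# [(u'Die', 'VROOT-s1', 's1_1'), (u'Sonne', 'VROOT-s1', 's1_2'), ...],
# i.e. one (token, sentence root node ID, token node ID) tuple per token.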
#~ def graph2tigersentence(sentence_graph):
#~ """
#~ @param sentence_graph: a directed graph containing a Tiger format
#~ sentence structure and annotations (syntax and morphology)
#~ @type sentence_graph: DiscourseDocumentGraph
#~ """
#~ terminals, nonterminals = _get_terminals_and_nonterminals(sentence_graph)
#~ sentence_root = objectify.Element('s', sentence_graph.metadata)
#~ graph_root = objectify.SubElement(sentence_root, 'graph', root=sentence_graph.root)
#~ terms_node = objectify.SubElement(graph_root, 'terminals')
#~ for term_id in terminals:
#~ terms_node.append(objectify.Element('t', sentence_graph.node[term_id], id=term_id))
#~
#~ nonterms_node = objectify.SubElement(graph_root, 'nonterminals')
#~ for nt_id in nonterminals:
#~ nt_node = objectify.Element('nt', sentence_graph.node[nt_id], id=nt_id)
# ~ # edges are grouped by their source node in TigerXML
#~ for out_edge in sentence_graph.out_edges(nt_id, data=True):
#~ from_id, to_id, edge_attribs = out_edge
#~ nt_node.append(objectify.Element('edge', edge_attribs))
#~ nonterms_node.append(nt_node)
#~
#~ objectify.deannotate(sentence_root, cleanup_namespaces=True)
#~ return sentence_root
if __name__ == '__main__':
if len(sys.argv) != 3:
sys.stderr.write(
'Usage: {0} TigerXML_input_file networkx_pickle_output_file\n'.format(sys.argv[0]))
sys.exit(1)
else:
tiger_filepath = sys.argv[1]
pickle_filepath = sys.argv[2]
assert os.path.isfile(tiger_filepath)
tiger_docgraph = TigerDocumentGraph(tiger_filepath)
write_gpickle(tiger_docgraph, pickle_filepath)
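# A minimal sketch for loading the pickled document graph back into memory
# (the file name is hypothetical):
#
#     from networkx import read_gpickle
#     tiger_docgraph = read_gpickle('tiger_corpus.pickle')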