Source code for discoursegraphs.merging
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <discoursegraphs.programming@arne.cl>
"""
The ``merging`` module combines several document graphs into one.
So far, it is able to merge rhetorical structure theory (RS3), syntax
(TigerXML) and anaphora (ad-hoc format) annotations of the same document.
"""
import os
import sys
import re
from networkx import write_dot
from discoursegraphs import DiscourseDocumentGraph
from discoursegraphs.relabel import relabel_nodes
from discoursegraphs.util import ensure_unicode
from discoursegraphs.readwrite.anaphoricity import AnaphoraDocumentGraph
from discoursegraphs.readwrite.rst import RSTGraph, rst_tokenlist
from discoursegraphs.readwrite.tiger import TigerDocumentGraph, tiger_tokenlist
def add_rst_to_tiger(tiger_docgraph, rst_graph):
    """
    adds an RSTGraph to a TigerDocumentGraph, thereby adding edges from
    each RST segment to the (Tiger) tokens they represent.

    All nodes and edges of ``rst_graph`` are first copied into
    ``tiger_docgraph`` (which is modified in place). Afterwards, the
    token lists of both graphs are compared position by position: each
    Tiger token node is annotated with the RST token and receives an
    edge from the RST segment node that the token belongs to.

    Note that the RST nodes/edges are added before the tokens are
    compared, i.e. ``tiger_docgraph`` is already partially modified when
    a ``ValueError`` is raised.

    Parameters
    ----------
    tiger_docgraph : TigerDocumentGraph
        multidigraph representing a syntax annotated (TigerXML) document
    rst_graph : RSTGraph
        multidigraph representing a RST annotated (RS3) document

    Raises
    ------
    ValueError
        if the RST token list ends before the Tiger token list does, or
        if the tokens at the same position differ
    """
    tiger_tokens = tiger_tokenlist(tiger_docgraph)
    rst_tokens = rst_tokenlist(rst_graph)

    tiger_docgraph.add_nodes_from(rst_graph.nodes(data=True))
    tiger_docgraph.add_edges_from(rst_graph.edges(data=True))

    # NOTE: surplus RST tokens (RST list longer than the Tiger list) are
    # not detected here; only the positions covered by Tiger are compared.
    for i, (tiger_tok, _sent_id, tiger_tok_id) in enumerate(tiger_tokens):
        try:
            rst_token, rst_segment_node_id = rst_tokens[i]
        except IndexError:
            # report a short RST token list as a tokenization problem
            # instead of leaking a bare IndexError to the caller
            raise ValueError(
                u"Tokenization mismatch: the RST graph has no token at "
                u"position {0} (tiger token {1!r}, node {2!r})".format(
                    i, tiger_tok, tiger_tok_id))

        if tiger_tok != rst_token:
            # the file paths of the input documents are not known in
            # this function, so the offending tokens are reported instead
            raise ValueError(
                u"Tokenization mismatch at position {0}: {1!r} (tiger, "
                u"node {2!r}) vs. {3!r} (rst)".format(
                    i, tiger_tok, tiger_tok_id, rst_token))

        tiger_docgraph.add_node(tiger_tok_id, layers={'rst', 'rst:token'},
                                attr_dict={'rst:token': rst_token})
        # the segment node ID is converted to int before it is used as
        # the source of the segment -> token edge
        tiger_docgraph.add_edge(int(rst_segment_node_id), tiger_tok_id,
                                layers={'rst', 'rst:token'})
def map_anaphoricity_tokens_to_tiger(tiger_docgraph, anaphora_graph):
    """
    builds a dictionary that translates the token node IDs of an
    anaphoricity graph into the corresponding tiger token node IDs.

    Tokens are paired purely by position. The tiger token list is
    indexed directly, so it has to contain at least as many tokens as
    the anaphoricity graph.

    Parameters
    ----------
    tiger_docgraph : TigerDocumentGraph
        multidigraph representing a syntax annotated (TigerXML) document
    anaphora_graph : AnaphoraDocumentGraph
        multidigraph representing an anaphoricity annotated document
        (ad-hoc format used in Christian Dittrich's diploma thesis)

    Returns
    -------
    anaphora2tiger : dict
        map from anaphoricity token node IDs (int) to tiger token node
        IDs (str, e.g. 's23_5')

    Raises
    ------
    ValueError
        if the tokens found at the same position are not equal
    """
    # each entry: (token unicode, tiger_sent_id str, tiger_token_id str)
    tiger_entries = tiger_tokenlist(tiger_docgraph)

    node_id_map = {}
    for position, anaphora_id in enumerate(anaphora_graph.tokens):
        node_attrs = anaphora_graph.node[anaphora_id]
        anaphora_word = node_attrs['anaphoricity:token']
        tiger_word, _sent_id, tiger_id = tiger_entries[position]

        # bail out on the first pair of tokens that disagree
        if anaphora_word != tiger_word:
            message = u"tokens don't match: {0} (anaphoricity) vs. {1} (tiger)"
            raise ValueError(message.format(anaphora_word, tiger_word))

        node_id_map[anaphora_id] = tiger_id
    return node_id_map
def add_anaphoricity_to_tiger(tiger_docgraph, anaphora_graph):
    """
    adds an AnaphoraDocumentGraph to a TigerDocumentGraph, thereby
    adding information about the anaphoricity of words
    (e.g. 'das', 'es') to the respective (Tiger) tokens.

    The nodes of ``anaphora_graph`` are relabeled with the matching
    tiger token node IDs (``relabel_nodes`` is called with
    ``copy=False``, so presumably ``anaphora_graph`` itself is changed)
    and then added, together with their attributes, to
    ``tiger_docgraph``. Edges of the anaphora graph are not copied.

    Parameters
    ----------
    tiger_docgraph : TigerDocumentGraph
        multidigraph representing a syntax annotated (TigerXML) document
    anaphora_graph : AnaphoraDocumentGraph
        multidigraph representing an anaphoricity annotated document
        (ad-hoc format used in Christian Dittrich's diploma thesis)

    Raises
    ------
    ValueError
        propagated from ``map_anaphoricity_tokens_to_tiger`` if the
        tokens of both graphs don't match
    """
    anaphora2tiger = map_anaphoricity_tokens_to_tiger(
        tiger_docgraph, anaphora_graph)
    relabel_nodes(anaphora_graph, anaphora2tiger, copy=False)
    tiger_docgraph.add_nodes_from(anaphora_graph.nodes(data=True))

    # the anaphora doc graph only contains trivial edges from its root
    # node. we won't add them and will remove the root.
    # An explicit membership test keeps the "remove it if it is there"
    # behaviour without hiding unrelated errors behind a bare except.
    root_node_id = 'anaphoricity:root_node'
    if root_node_id in tiger_docgraph:
        tiger_docgraph.remove_node(root_node_id)
def merging_cli():
    """
    simple commandline interface of the merging module.

    This function is called when you use the ``discoursegraphs`` application
    directly on the command line.

    It expects exactly four command line arguments (TigerXML file, RS3
    file, anaphoricity file, dot output file), merges the three
    annotation layers into the Tiger document graph and writes the
    result to the output file with ``write_dot``.

    Exits with status 1 (after printing a message to stderr) if the
    number of arguments is wrong or one of the input files doesn't exist.
    """
    if len(sys.argv) != 5:
        sys.stderr.write(
            'Usage: {0} tiger_file rst_file anaphoricity_file '
            'dot_output_file\n'.format(sys.argv[0]))
        sys.exit(1)

    tiger_filepath, rst_filepath, anaphora_filepath, dot_filepath = \
        sys.argv[1:5]

    # validate user input explicitly: ``assert`` statements are removed
    # when Python runs with -O and would show a traceback otherwise
    for filepath in (tiger_filepath, rst_filepath, anaphora_filepath):
        if not os.path.isfile(filepath):
            sys.stderr.write("{0} doesn't exist\n".format(filepath))
            sys.exit(1)

    tiger_docgraph = TigerDocumentGraph(tiger_filepath)
    rst_graph = RSTGraph(rst_filepath)
    anaphora_graph = AnaphoraDocumentGraph(anaphora_filepath)

    # both calls modify tiger_docgraph in place
    add_rst_to_tiger(tiger_docgraph, rst_graph)
    add_anaphoricity_to_tiger(tiger_docgraph, anaphora_graph)
    write_dot(tiger_docgraph, dot_filepath)
# run the command line interface only when this module is executed as a
# script, not when it is imported
if __name__ == '__main__':
    merging_cli()