Source code for discoursegraphs.readwrite.anaphoricity
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <>
"""
The ``anaphoricity`` module parses Christian Dittrich's anaphoricity
annotation ad-hoc format into a document graph.
"""
import sys
import os
import re
from itertools import chain
from networkx import write_gpickle
from discoursegraphs import DiscourseDocumentGraph
from discoursegraphs.util import ensure_unicode
# The words 'das' and 'es' were annotated in the Potsdam Commentary
# Corpus (PCC). Annotation options: '/n' (nominal), '/a' (abstract),
# '/r' (relative pronoun) or '/p' (pleonastic). If the annotator was
# uncertain, the annotation is marked with a question mark.
# Examples: 'Das/a', 'es/p?'
# Maps each one-letter annotation code to its full category name.
ANNOTATION_TYPES = dict(
    n='nominal',
    a='abstract',
    r='relative',
    p='pleonastic'
)
class AnaphoraDocumentGraph(DiscourseDocumentGraph):
    """
    Represents a text in which abstract anaphora were annotated
    as a graph.

    Attributes
    ----------
    tokens : list of int
        a list of node IDs (int) which represent the tokens in the
        order they occur in the text
    root : str
        name of the document root node ID
        (default: 'anaphoricity:root_node')
    """
    # Recognizes an annotated token such as 'Das/a' or 'es/p?': the plain
    # token, a slash, one annotation letter and an optional question mark.
    # NOTE(review): this pattern was reconstructed from the documented
    # annotation format -- confirm it against the pattern originally used.
    _ANNOTATED_TOKEN_REGEX = re.compile(
        r'^(?P<token>.+)/(?P<annotation>[anpr])(?P<uncertain>\?)?$')

    def __init__(self, anaphora_filepath, name=None):
        """
        Reads an abstract anaphora annotation file, creates a directed
        graph and adds a node for each token, as well as an edge from
        the root node to each token.

        If a token is annotated, its node carries the attributes
        'anaphoricity:annotation' (str) and 'anaphoricity:certainty'
        (float). 'anaphoricity:annotation' is one of the following:
        'abstract', 'nominal', 'pleonastic' or 'relative'.

        Parameters
        ----------
        anaphora_filepath : str
            relative or absolute path to an anaphora annotation file.
            The format of the file was created ad-hoc by one of our
            students for his diploma thesis. It consists of tokenized
            plain text (one sentence per line with spaces between
            tokens).
            A token is annotated by appending '/' and one of the letters
            'a' (abstract), 'n' (nominal), 'p' (pleonastic),
            'r' (relative pronoun) and optionally a question mark to
            signal uncertainty.
        name : str or None
            the name or ID of the graph to be generated. If no name is
            given, the basename of the input file is used.
        """
        # super calls __init__() of base class DiscourseDocumentGraph
        super(AnaphoraDocumentGraph, self).__init__()
        # Fall back to the file's basename only when no name was supplied.
        if name is not None:
            self.name = name
        else:
            self.name = os.path.basename(anaphora_filepath)
        self.root = 'anaphoricity:root_node'
        self.add_node(self.root, layers={'anaphoricity'})
        self.tokens = []

        with open(anaphora_filepath, 'r') as anno_file:
            # Flatten the whitespace-separated tokens of all lines into one
            # stream; a token's position in that stream is its node ID.
            tokens = chain.from_iterable(line.split() for line in anno_file)
            for token_id, token in enumerate(tokens):
                self.__add_token_to_document(token, token_id)

    def __add_token_to_document(self, token, token_id):
        """
        Adds a token to the document graph as a node with the given ID,
        connects it to the root node and records its ID in ``self.tokens``.

        Parameters
        ----------
        token : str
            the token to be added to the document graph
        token_id : int
            the node ID of the token to be added, which must not yet
            exist in the document graph
        """
        token_layers = {'anaphoricity', 'anaphoricity:token'}
        regex_match = self._ANNOTATED_TOKEN_REGEX.search(token)
        if regex_match:  # token is annotated
            unannotated_token = regex_match.group('token')
            annotation = regex_match.group('annotation')
            # A trailing '?' marks an annotation the annotator was unsure of.
            certainty = 1.0 if not regex_match.group('uncertain') else 0.5
            self.add_node(
                token_id, layers=token_layers,
                attr_dict={
                    'anaphoricity:annotation': ANNOTATION_TYPES[annotation],
                    'anaphoricity:certainty': certainty,
                    'anaphoricity:token': ensure_unicode(unannotated_token)})
        else:  # token is not annotated
            self.add_node(
                token_id, layers=token_layers,
                attr_dict={'anaphoricity:token': ensure_unicode(token)})

        self.tokens.append(token_id)
        self.add_edge(self.root, token_id,
                      layers={'anaphoricity', 'anaphoricity:token'})
if __name__ == '__main__':
    # Command-line entry point: converts an anaphoricity annotation file
    # into a document graph and stores it as a networkx pickle.
    if len(sys.argv) != 3:
        sys.stderr.write(
            'Usage: {0} anaphoricity_input_file networkx_pickle_output_file\n'.format(sys.argv[0]))
        sys.exit(1)

    anaphora_filepath = sys.argv[1]
    pickle_filepath = sys.argv[2]
    # Validate explicitly instead of with ``assert``, which is removed
    # when Python runs with -O.
    if not os.path.isfile(anaphora_filepath):
        sys.stderr.write(
            'Input file not found: {0}\n'.format(anaphora_filepath))
        sys.exit(1)

    anaphora_docgraph = AnaphoraDocumentGraph(anaphora_filepath)
    write_gpickle(anaphora_docgraph, pickle_filepath)