Source code for discoursegraphs.readwrite.anaphoricity
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <discoursegraphs.programming@arne.cl>
"""
The ``anaphoricity`` module parses Christian Dittrich's anaphoricity
annotation ad-hoc format into a document graph.
"""
import sys
import os
import re
from itertools import chain
from networkx import write_gpickle
from discoursegraphs import DiscourseDocumentGraph
from discoursegraphs.util import ensure_unicode
# The words 'das' and 'es' were annotated in the Potsdam Commentary
# Corpus (PCC). Annotation options: '/n' (nominal), '/a' (abstract),
# '/r' (relative pronoun) or '/p' (pleonastic). If the annotator was
# uncertain, the annotation is marked with a question mark.
#
# Examples: 'Das/a', 'es/p?'
# Matches an annotated anaphor: the token 'das'/'Das'/'es'/'Es', a slash,
# one annotation letter and an optional '?' marking annotator uncertainty.
# A raw string is used so that '\?' reaches the regex engine verbatim
# instead of being an invalid string escape sequence.
ANNOTATED_ANAPHORA_REGEX = re.compile(
    r'(?P<token>([Dd]a|[Ee])s)/(?P<annotation>[anpr])(?P<uncertain>\??)')

# Maps the one-letter annotation code captured by the regex above to the
# human-readable anaphoricity type stored in the document graph.
ANNOTATION_TYPES = {
    'n': 'nominal',
    'a': 'abstract',
    'r': 'relative',
    'p': 'pleonastic',
}
class AnaphoraDocumentGraph(DiscourseDocumentGraph):
    """
    represents a text in which abstract anaphora were annotated
    as a graph.

    Attributes
    ----------
    tokens : list of int
        a list of node IDs (int) which represent the tokens in the
        order they occur in the text
    root : str
        name of the document root node ID
        (default: 'anaphoricity:root_node')
    """
    def __init__(self, anaphora_filepath, name=None):
        """
        Reads an abstract anaphora annotation file, creates a directed
        graph and adds a node for each token, as well as an edge from
        the root node to each token.

        Every token node has the attribute 'anaphoricity:token' (the
        token string without its annotation suffix). If a token is
        annotated, its node additionally has the attributes
        'anaphoricity:annotation' (str) and 'anaphoricity:certainty'
        (float; 1.0, or 0.5 if the annotation was marked as uncertain).

        'anaphoricity:annotation' is one of the following: 'abstract',
        'nominal', 'pleonastic' or 'relative'.

        Parameters
        ----------
        anaphora_filepath : str
            relative or absolute path to an anaphora annotation file.
            The format of the file was created ad-hoc by one of our
            students for his diploma thesis. It consists of tokenized
            plain text (one sentence per line with spaces between
            tokens).
            A token is annotated by appending '/' and one of the letters
            'a' (abstract), 'n' (nominal), 'p' (pleonastic),
            'r' (relative pronoun) and optionally a question mark to
            signal uncertainty.
        name : str or None
            the name or ID of the graph to be generated. If no name is
            given, the basename of the input file is used.
        """
        # super calls __init__() of base class DiscourseDocumentGraph
        super(AnaphoraDocumentGraph, self).__init__()

        # Honour an explicitly given name; only fall back to the file's
        # basename when the caller did not supply one.
        if name is not None:
            self.name = name
        else:
            self.name = os.path.basename(anaphora_filepath)

        self.root = 'anaphoricity:root_node'
        self.add_node(self.root, layers={'anaphoricity'})
        self.tokens = []

        # The file is whitespace-tokenized, so splitting every line yields
        # the tokens of the whole document in text order.
        with open(anaphora_filepath, 'r') as anno_file:
            tokens = list(chain.from_iterable(
                line.split() for line in anno_file))

        # The position of a token in the document doubles as its node ID.
        for i, token in enumerate(tokens):
            self.__add_token_to_document(token, i)
            self.tokens.append(i)

    def __add_token_to_document(self, token, token_id):
        """
        adds a token to the document graph as a node with the given ID
        and connects the root node to it.

        Parameters
        ----------
        token : str
            the token to be added to the document graph
        token_id : int
            the node ID of the token to be added, which must not yet
            exist in the document graph
        """
        token_layers = {'anaphoricity', 'anaphoricity:token'}
        regex_match = ANNOTATED_ANAPHORA_REGEX.search(token)
        if regex_match:  # token is annotated
            annotation = regex_match.group('annotation')
            # a trailing '?' signals that the annotator was uncertain
            certainty = 0.5 if regex_match.group('uncertain') else 1.0
            node_attrs = {
                'anaphoricity:annotation': ANNOTATION_TYPES[annotation],
                'anaphoricity:certainty': certainty,
                # store the bare token, without the '/x' annotation suffix
                'anaphoricity:token': ensure_unicode(
                    regex_match.group('token')),
            }
        else:  # token is not annotated
            node_attrs = {'anaphoricity:token': ensure_unicode(token)}

        self.add_node(token_id, layers=token_layers, attr_dict=node_attrs)
        self.add_edge(self.root, token_id,
                      layers={'anaphoricity', 'anaphoricity:token'})
# Command-line entry point: converts an anaphoricity annotation file into a
# pickled networkx graph.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.stderr.write(
            'Usage: {0} anaphoricity_input_file networkx_pickle_output_file\n'.format(sys.argv[0]))
        sys.exit(1)

    anaphora_filepath = sys.argv[1]
    pickle_filepath = sys.argv[2]

    # Validate the input path explicitly: an `assert` would be stripped when
    # Python runs with -O and gives the user no hint about what went wrong.
    if not os.path.isfile(anaphora_filepath):
        sys.stderr.write(
            'Error: input file not found: {0}\n'.format(anaphora_filepath))
        sys.exit(1)

    anaphora_docgraph = AnaphoraDocumentGraph(anaphora_filepath)
    write_gpickle(anaphora_docgraph, pickle_filepath)