# Source code for discoursegraphs.readwrite.anaphoricity

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <discoursegraphs.programming@arne.cl>

"""
The ``anaphoricity`` module parses Christian Dittrich's anaphoricity
annotation ad-hoc format into a document graph.
"""

import sys
import os
import re
from itertools import chain
from networkx import write_gpickle

from discoursegraphs import DiscourseDocumentGraph
from discoursegraphs.util import ensure_unicode

# The words 'das' and 'es' were annotated in the Potsdam Commentary
# Corpus (PCC). Annotation options: '/n' (nominal), '/a' (abstract),
# '/r' (relative pronoun) or '/p' (pleonastic). If the annotator was
# uncertain, the annotation is marked with a question mark.
#
# Examples: 'Das/a', 'es/p?'
# Matches an annotated token and exposes three named groups:
#   token      -- the bare word ('das', 'Das', 'es' or 'Es')
#   annotation -- one of the letters a, n, p, r
#   uncertain  -- '?' if the annotator was unsure, otherwise ''
# A raw string is used so that '\?' is passed to the regex engine as-is
# instead of being an invalid string escape sequence.
ANNOTATED_ANAPHORA_REGEX = re.compile(
    r'(?P<token>([Dd]a|[Ee])s)/(?P<annotation>[anpr])(?P<uncertain>\??)')

# Maps the one-letter annotation code to its human-readable label.
ANNOTATION_TYPES = {'n': 'nominal',
                    'a': 'abstract',
                    'r': 'relative',
                    'p': 'pleonastic'}


class AnaphoraDocumentGraph(DiscourseDocumentGraph):
    """
    represents a text in which abstract anaphora were annotated
    as a graph.

    Attributes
    ----------
    name : str
        the name or ID of the graph
    tokens : list of int
        a list of node IDs (int) which represent the tokens in the
        order they occur in the text
    root : str
        name of the document root node ID
        (default: 'anaphoricity:root_node')
    """
    def __init__(self, anaphora_filepath, name=None):
        """
        Reads an abstract anaphora annotation file, creates a directed
        graph and adds a node for each token, as well as an edge from
        the root node to each token.

        Every token node has the attribute 'anaphoricity:token' (the
        token string without its annotation suffix). If a token is
        annotated, its node additionally has the attributes
        'anaphoricity:annotation' (str) and 'anaphoricity:certainty'
        (float: 1.0, or 0.5 if the annotation was marked with '?').
        'anaphoricity:annotation' is one of the following: 'abstract',
        'nominal', 'pleonastic' or 'relative'.

        Parameters
        ----------
        anaphora_filepath : str
            relative or absolute path to an anaphora annotation file.
            The format of the file was created ad-hoc by one of our
            students for his diploma thesis. It consists of tokenized
            plain text (one sentence per line with spaces between
            tokens).
            A token is annotated by appending '/' and one of the letters
            'a' (abstract), 'n' (nominal), 'p' (pleonastic),
            'r' (relative pronoun) and optionally a question mark to
            signal uncertainty.
        name : str or None
            the name or ID of the graph to be generated. If no name is
            given, the basename of the input file is used.
        """
        # super calls __init__() of base class DiscourseDocumentGraph
        super(AnaphoraDocumentGraph, self).__init__()

        # Honour an explicitly given name; only fall back to the file's
        # basename when the caller did not provide one.
        if name is not None:
            self.name = name
        else:
            self.name = os.path.basename(anaphora_filepath)

        self.root = 'anaphoricity:root_node'
        self.add_node(self.root, layers={'anaphoricity'})
        self.tokens = []

        # Sentence boundaries are not represented in the graph: the
        # whitespace-separated tokens of all lines are flattened into
        # one sequence.
        with open(anaphora_filepath, 'r') as anno_file:
            tokens = list(chain.from_iterable(
                line.split() for line in anno_file))

        # The position of a token in the document doubles as its node ID.
        for i, token in enumerate(tokens):
            self.__add_token_to_document(token, i)
            self.tokens.append(i)

    def __add_token_to_document(self, token, token_id):
        """
        adds a token to the document graph as a node with the given ID
        and connects the root node to it.

        Parameters
        ----------
        token : str
            the token to be added to the document graph
        token_id : int
            the node ID of the token to be added, which must not yet
            exist in the document graph
        """
        token_layers = {'anaphoricity', 'anaphoricity:token'}
        regex_match = ANNOTATED_ANAPHORA_REGEX.search(token)
        if regex_match:  # token is annotated
            unannotated_token = regex_match.group('token')
            annotation = regex_match.group('annotation')
            # a trailing '?' halves the certainty of the annotation
            certainty = 1.0 if not regex_match.group('uncertain') else 0.5
            self.add_node(
                token_id, layers=token_layers,
                attr_dict={
                    'anaphoricity:annotation': ANNOTATION_TYPES[annotation],
                    'anaphoricity:certainty': certainty,
                    'anaphoricity:token': ensure_unicode(unannotated_token)})
        else:  # token is not annotated
            self.add_node(
                token_id, layers=token_layers,
                attr_dict={'anaphoricity:token': ensure_unicode(token)})

        self.add_edge(self.root, token_id,
                      layers={'anaphoricity', 'anaphoricity:token'})
if __name__ == '__main__':
    # Command-line entry point: parse an anaphoricity annotation file
    # into a document graph and store the graph as a networkx pickle.
    if len(sys.argv) != 3:
        sys.stderr.write(
            'Usage: {0} anaphoricity_input_file networkx_pickle_output_file\n'.format(sys.argv[0]))
        sys.exit(1)

    # sys.argv has exactly three elements here (script, input, output).
    _, anaphora_filepath, pickle_filepath = sys.argv
    assert os.path.isfile(anaphora_filepath)
    anaphora_docgraph = AnaphoraDocumentGraph(anaphora_filepath)
    write_gpickle(anaphora_docgraph, pickle_filepath)