Source code for discoursegraphs.dg

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <discoursegraphs.programming@arne.cl>

"""
The ``dg`` module specifies a ``DiscourseDocumentGraph``, the fundamental data
structure used in this package. It is a slightly modified
``networkx.MultiDiGraph``, which requires every node and edge to have a
``layers`` attribute (which maps to the set of layers (str) it belongs to).
"""

from networkx import MultiDiGraph


class DiscourseDocumentGraph(MultiDiGraph):
    """
    Base class for representing annotated documents as directed graphs
    with multiple edges.

    Every node and every edge carries a ``layers`` attribute (a set of
    str). Layers of an existing node/edge can only be extended, never
    overwritten or removed. All other attributes are updated as usual.

    TODO list:

    - allow layers to be a single str or set of str
    - allow adding a layer by including it in ``**attr``
    - add consistency check that would allow adding a node that already
      exists in the graph, but only if the new graph has different
      attributes (layers can be the same though)
    - outsource layer assertions to method?
    """
    def __init__(self):
        """
        Initialize an empty directed graph which allows multiple edges.
        """
        # super calls __init__() of base class MultiDiGraph
        super(DiscourseDocumentGraph, self).__init__()

    @staticmethod
    def _merge_attrs(attr_dict, attr):
        """Return a NEW dict containing ``attr_dict`` updated with ``attr``.

        The caller's ``attr_dict`` is copied instead of being updated in
        place, so it is neither mutated nor aliased by the graph.

        Parameters
        ----------
        attr_dict : dictionary or None
            Explicitly given attribute dictionary.
        attr : dictionary
            Attributes collected from keyword arguments. They take
            precedence over the entries of ``attr_dict``.

        Raises
        ------
        AttributeError
            If ``attr_dict`` is not dictionary-like.
        """
        if attr_dict is None:
            return dict(attr)
        try:
            merged = attr_dict.copy()
            merged.update(attr)
        except AttributeError as e:
            # '{0}' makes the underlying error part of the message
            raise AttributeError("The attr_dict argument must be "
                                 "a dictionary: {0}".format(e))
        return merged

    def add_node(self, n, layers, attr_dict=None, **attr):
        """Add a single node n and update node attributes.

        If the node already exists, its attributes are updated. Its
        ``layers`` attribute becomes the union of the existing and the
        given layers.

        Parameters
        ----------
        n : node
            A node can be any hashable Python object except None.
        layers : set of str
            the set of layers the node belongs to,
            e.g. {'tiger:token', 'anaphoricity:annotation'}
        attr_dict : dictionary, optional (default= no attributes)
            Dictionary of node attributes. Key/value pairs will
            update existing data associated with the node.
            The dictionary itself is not modified.
        attr : keyword arguments, optional
            Set or change attributes using key=value.

        Raises
        ------
        AssertionError
            If ``layers`` is not a set of str.
        AttributeError
            If ``attr_dict`` is not a dictionary.

        See Also
        --------
        add_nodes_from

        Examples
        --------
        >>> from discoursegraphs import DiscourseDocumentGraph
        >>> d = DiscourseDocumentGraph()
        >>> d.add_node(1, {'node'})

        # adding the same node with a different layer
        >>> d.add_node(1, {'number'})
        >>> d.nodes(data=True)
        [(1, {'layers': {'node', 'number'}})]

        Use keywords set/change node attributes:

        >>> d.add_node(1, {'node'}, size=10)
        >>> d.add_node(3, layers={'num'}, weight=0.4, UTM=('13S',382))
        >>> d.nodes(data=True)
        [(1, {'layers': {'node', 'number'}, 'size': 10}),
         (3, {'UTM': ('13S', 382), 'layers': {'num'}, 'weight': 0.4})]

        Notes
        -----
        A hashable object is one that can be used as a key in a Python
        dictionary. This includes strings, numbers, tuples of strings
        and numbers, etc.

        On many platforms hashable items also include mutables such as
        NetworkX Graphs, though one should be careful that the hash
        doesn't change on mutables.
        """
        assert isinstance(layers, set), \
            "'layers' parameter must be given as a set of strings."
        assert all((isinstance(layer, str) for layer in layers)), \
            "All elements of the 'layers' set must be strings."

        node_attrs = self._merge_attrs(attr_dict, attr)

        if n not in self.succ:
            # there's no node with this ID in the graph, yet
            self.succ[n] = {}
            self.pred[n] = {}
            # store a copy, so later changes to the caller's set
            # don't leak into the graph
            node_attrs['layers'] = set(layers)
            self.node[n] = node_attrs
        else:
            # the node exists: update its attributes, but only ever
            # extend (never replace) its set of layers
            node_attrs['layers'] = self.node[n]['layers'].union(layers)
            self.node[n].update(node_attrs)

    def add_nodes_from(self, nodes, **attr):
        """Add multiple nodes.

        Parameters
        ----------
        nodes : iterable container of (node, attribute dict) tuples.
            Node attributes are updated using the attribute dict.
            Every attribute dict must contain a 'layers' key, which maps
            to a set of str.
        attr : keyword arguments, optional (default= no attributes)
            Update attributes for all nodes in nodes.
            Node attributes specified in nodes as a tuple
            take precedence over attributes specified generally.

        Raises
        ------
        AssertionError
            If a node's attribute dict has no 'layers' entry or if that
            entry is not a set of str.

        See Also
        --------
        add_node

        Examples
        --------
        >>> d.add_nodes_from([(1, {'layers':{'token'}, 'word':'hello'}),
        ...                   (2, {'layers':{'token'}, 'word':'world'})])
        >>> d.nodes(data=True)
        [(1, {'layers': {'token'}, 'word': 'hello'}),
         (2, {'layers': {'token'}, 'word': 'world'})]

        Use keywords to update specific node attributes for every node.

        >>> d.add_nodes_from(d.nodes(data=True), weight=1.0)
        >>> d.nodes(data=True)
        [(1, {'layers': {'token'}, 'weight': 1.0, 'word': 'hello'}),
         (2, {'layers': {'token'}, 'weight': 1.0, 'word': 'world'})]

        Use (node, attrdict) tuples to update attributes for specific
        nodes.

        >>> d.add_nodes_from([(1, {'layers': {'tiger'}})], size=10)
        >>> d.nodes(data=True)
        [(1, {'layers': {'tiger', 'token'}, 'size': 10, 'weight': 1.0,
              'word': 'hello'}),
         (2, {'layers': {'token'}, 'weight': 1.0, 'word': 'world'})]
        """
        for node_id, ndict in nodes:
            assert 'layers' in ndict, \
                "Every node must have a 'layers' attribute."
            layers = ndict['layers']
            assert isinstance(layers, set), \
                "'layers' must be specified as a set of strings."
            assert all((isinstance(layer, str) for layer in layers)), \
                "All elements of the 'layers' set must be strings."

            if node_id not in self.succ:  # node doesn't exist, yet
                self.succ[node_id] = {}
                self.pred[node_id] = {}
                new_attrs = attr.copy()  # attributes shared by all nodes
                new_attrs.update(ndict)  # node-specific ones win
                new_attrs['layers'] = set(layers)  # don't alias caller's set
                self.node[node_id] = new_attrs
            else:  # node already exists
                existing = self.node[node_id]
                # compute the union first, since the update below would
                # overwrite the existing 'layers' entry
                all_layers = existing['layers'].union(layers)
                # shared attributes first, node-specific ones last, so the
                # precedence is the same as for newly created nodes
                existing.update(attr)
                existing.update(ndict)
                existing['layers'] = all_layers

    def add_edge(self, u, v, layers, key=None, attr_dict=None, **attr):
        """Add an edge between u and v.

        An edge can only be added if the nodes u and v already exist.
        This decision was taken to ensure that all nodes are associated
        with at least one (meaningful) layer.

        Edge attributes can be specified with keywords or by providing
        a dictionary with key/value pairs. In contrast to other
        edge attributes, layers can only be added not overwritten or
        deleted.

        Parameters
        ----------
        u,v : nodes
            Nodes can be, for example, strings or numbers.
            Nodes must be hashable (and not None) Python objects.
        layers : set of str
            the set of layers the edge belongs to,
            e.g. {'tiger:token', 'anaphoricity:annotation'}
        key : hashable identifier, optional (default=lowest unused integer)
            Used to distinguish multiedges between a pair of nodes.
        attr_dict : dictionary, optional (default= no attributes)
            Dictionary of edge attributes. Key/value pairs will
            update existing data associated with the edge.
            The dictionary itself is not modified.
        attr : keyword arguments, optional
            Edge data (or labels or objects) can be assigned using
            keyword arguments.

        Raises
        ------
        AssertionError
            If ``layers`` is not a set of str or if u or v is not
            a node of the graph.
        AttributeError
            If ``attr_dict`` is not a dictionary.

        See Also
        --------
        add_edges_from : add a collection of edges

        Notes
        -----
        To replace/update edge data, use the optional key argument
        to identify a unique edge. Otherwise a new edge will be created.

        NetworkX algorithms designed for weighted graphs cannot use
        multigraphs directly because it is not clear how to handle
        multiedge weights. Convert to Graph using edge attribute
        'weight' to enable weighted graph algorithms.

        Examples
        --------
        >>> from discoursegraphs import DiscourseDocumentGraph
        >>> d = DiscourseDocumentGraph()
        >>> d.add_nodes_from([(1, {'layers':{'token'}, 'word':'hello'}),
        ...                   (2, {'layers':{'token'}, 'word':'world'})])

        >>> d.edges(data=True)
        []

        >>> d.add_edge(1, 2, layers={'generic'})

        >>> d.add_edge(1, 2, layers={'tokens'}, weight=0.7)

        >>> d.edges(data=True)
        [(1, 2, {'layers': {'generic'}}),
         (1, 2, {'layers': {'tokens'}, 'weight': 0.7})]

        >>> d.edge[1][2]
        {0: {'layers': {'generic'}}, 1: {'layers': {'tokens'}, 'weight': 0.7}}

        >>> d.add_edge(1, 2, layers={'tokens'}, key=1, weight=1.0)
        >>> d.edges(data=True)
        [(1, 2, {'layers': {'generic'}}),
         (1, 2, {'layers': {'tokens'}, 'weight': 1.0})]

        >>> d.add_edge(1, 2, layers={'foo'}, key=1, weight=1.0)
        >>> d.edges(data=True)
        [(1, 2, {'layers': {'generic'}}),
         (1, 2, {'layers': {'foo', 'tokens'}, 'weight': 1.0})]
        """
        assert isinstance(layers, set), \
            "'layers' parameter must be given as a set of strings."
        assert all((isinstance(layer, str) for layer in layers)), \
            "All elements of the 'layers' set must be strings."

        edge_attrs = self._merge_attrs(attr_dict, attr)

        assert u in self.succ, "from-node doesn't exist, yet"
        assert v in self.succ, "to-node doesn't exist, yet"

        if v in self.succ[u]:  # if there's already an edge from u to v
            keydict = self.adj[u][v]
            if key is None:  # creating additional edge
                # find a unique integer key
                # other methods might be better here?
                key = len(keydict)
                while key in keydict:
                    key += 1
            datadict = keydict.get(key, {})  # works for existing & new edge
            # read the existing layers before update() can overwrite them
            existing_layers = datadict.get('layers', set())
            datadict.update(edge_attrs)
            datadict['layers'] = existing_layers.union(layers)
            keydict[key] = datadict
        else:  # there's no edge between u and v, yet
            # selfloops work this way without special treatment
            if key is None:
                key = 0
            # store a copy, so later changes to the caller's set
            # don't leak into the graph
            edge_attrs['layers'] = set(layers)
            keydict = {key: edge_attrs}
            # successor and predecessor views share the same key dict
            self.succ[u][v] = keydict
            self.pred[v][u] = keydict

    def add_edges_from(self, ebunch, attr_dict=None, **attr):
        """Add all the edges in ebunch.

        Parameters
        ----------
        ebunch : container of edges
            Each edge given in the container will be added to the
            graph. The edges can be:

                - 3-tuples (u,v,d) for an edge attribute dict d, or
                - 4-tuples (u,v,k,d) for an edge identified by key k

            Each edge must have a layers attribute (set of str).
        attr_dict : dictionary, optional  (default= no attributes)
            Dictionary of edge attributes. Key/value pairs will
            update existing data associated with each edge.
            The dictionary itself is not modified.
        attr : keyword arguments, optional
            Edge data (or labels or objects) can be assigned using
            keyword arguments.

        Raises
        ------
        AssertionError
            If an edge has no valid 'layers' attribute (set of str), if
            the 'layers' given via attr_dict/attr are not a set of str
            or if a source/target node doesn't exist.
        AttributeError
            If ``attr_dict`` is not a dictionary or if an edge is neither
            a 3-tuple nor a 4-tuple.

        See Also
        --------
        add_edge : add a single edge

        Notes
        -----
        An edge given as a 3-tuple (i.e. without a key) is always added
        as a new, additional edge between its nodes. To update an
        existing edge, give its key in a 4-tuple.

        An edge can only be added if the source and target nodes are
        already present in the graph. This decision was taken to ensure
        that all edges are associated with at least one (meaningful)
        layer.

        Edge attributes specified in edges as a tuple (in ebunch) take
        precedence over attributes specified otherwise (in attr_dict or
        attr). Layers can only be added (via a 'layers' edge attribute),
        but not overwritten.

        Examples
        --------
        >>> d = DiscourseDocumentGraph()
        >>> d.add_node(1, {'int'})
        >>> d.add_node(2, {'int'})

        >>> d.add_edges_from([(1, 2, {'layers': {'int'}, 'weight': 23})])
        >>> d.add_edges_from([(1, 2, {'layers': {'int'}, 'weight': 42})])

        >>> d.edges(data=True) # multiple edges between the same nodes
        [(1, 2, {'layers': {'int'}, 'weight': 23}),
         (1, 2, {'layers': {'int'}, 'weight': 42})]

        Associate data to edges

        We update the existing edge (key=0) and overwrite its 'weight'
        value. Note that we can't overwrite the 'layers' value, though.
        Instead, they are added to the set of existing layers

        >>> d.add_edges_from([(1, 2, 0, {'layers':{'number'}, 'weight':66})])
        >>> d.edges(data=True)
        [(1, 2, {'layers': {'int', 'number'}, 'weight': 66}),
         (1, 2, {'layers': {'int'}, 'weight': 42})]
        """
        shared_attrs = self._merge_attrs(attr_dict, attr)

        # layers given via attr_dict / **attr apply to every edge, so they
        # are validated only once
        additional_layers = shared_attrs.get('layers', set())
        if additional_layers:
            assert isinstance(additional_layers, set), \
                "'layers' must be specified as a set of strings."
            assert all((isinstance(layer, str)
                        for layer in additional_layers)), \
                "'layers' set must only contain strings."

        # process ebunch
        for e in ebunch:
            ne = len(e)
            if ne == 4:
                u, v, key, dd = e
            elif ne == 3:
                u, v, dd = e
                key = None
            else:
                raise AttributeError(
                    "Edge tuple %s must be a 3-tuple (u,v,attribs) "
                    "or 4-tuple (u,v,key,attribs)." % (e,))

            assert 'layers' in dd, \
                "Every edge must have a 'layers' attribute."
            layers = dd['layers']
            assert isinstance(layers, set), \
                "'layers' must be specified as a set of strings."
            assert all((isinstance(layer, str) for layer in layers)), \
                "All elements of the 'layers' set must be strings."

            # edge-specific attributes take precedence over shared ones
            edge_attrs = dict(shared_attrs)
            edge_attrs.update(dd)
            # layers are handed over separately
            edge_attrs.pop('layers', None)

            # add_edge() checks if u and v exist, picks an unused key
            # if none is given and merges the new layers with those of
            # an already existing edge, so we don't need to
            self.add_edge(u, v, layers=layers.union(additional_layers),
                          key=key, attr_dict=edge_attrs)