Source code for discoursegraphs.dg

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Arne Neumann <discoursegraphs.programming@arne.cl>

"""
The ``dg`` module specifies a ``DiscourseDocumentGraph``, the fundamental data
structure used in this package. It is a slightly modified
``networkx.MultiDiGraph``, which enforces every node and edge to have a
``layers`` attribute (which maps to the set of layers (str) it belongs to).
"""

from networkx import MultiDiGraph


class DiscourseDocumentGraph(MultiDiGraph):
    """
    Base class for representing annotated documents as directed graphs
    with multiple edges.

    Every node and every edge stores a ``layers`` attribute (a set of
    str). Layers can only be added to a node or edge; adding an element
    again merges the new layers into the existing set instead of
    replacing it.

    The ``add_*`` methods write directly to the ``succ``, ``pred`` and
    ``node`` dictionaries of the graph (and read ``adj``).

    TODO list:

    - allow layers to be a single str or set of str
    - allow adding a layer by including it in ``**attr``
    - add consistency check that would allow adding a node that
      already exists in the graph, but only if the new graph has
      different attributes (layers can be the same though)
    - outsource layer assertions to method?
    """
    def __init__(self):
        """
        Initialize an empty directed graph which allows multiple edges.
        """
        # super calls __init__() of base class MultiDiGraph
        super(DiscourseDocumentGraph, self).__init__()

    @staticmethod
    def _merge_attr_dict(attr_dict, attr):
        """Combine an optional attribute dict with keyword attributes.

        Parameters
        ----------
        attr_dict : dictionary or None
            If None, ``attr`` itself is returned. Otherwise ``attr_dict``
            is updated in place with ``attr`` and returned.
        attr : dictionary
            Keyword attributes; they override entries of ``attr_dict``.

        Returns
        -------
        dictionary
            The merged attributes.

        Raises
        ------
        AttributeError
            If ``attr_dict`` has no ``update`` method, i.e. is not
            dictionary-like.
        """
        if attr_dict is None:
            return attr
        try:
            attr_dict.update(attr)
        except AttributeError as e:
            # '{0}' keeps the underlying error text in the message
            raise AttributeError("The attr_dict argument must be "
                                 "a dictionary: {0}".format(e))
        return attr_dict

    def add_node(self, n, layers, attr_dict=None, **attr):
        """Add a single node n and update node attributes.

        If the node already exists, its attributes are updated and the
        given layers are added to its existing set of layers.

        Parameters
        ----------
        n : node
            A node can be any hashable Python object except None.
        layers : set of str
            the set of layers the node belongs to,
            e.g. {'tiger:token', 'anaphoricity:annotation'}
        attr_dict : dictionary, optional (default= no attributes)
            Dictionary of node attributes.  Key/value pairs will
            update existing data associated with the node.
            Note: the given dictionary is updated in place with
            ``attr`` and the ``layers`` entry.
        attr : keyword arguments, optional
            Set or change attributes using key=value.

        Raises
        ------
        AssertionError
            If ``layers`` is not a set or contains non-str elements.
        AttributeError
            If ``attr_dict`` is not a dictionary.

        See Also
        --------
        add_nodes_from

        Examples
        --------
        >>> from discoursegraphs import DiscourseDocumentGraph
        >>> d = DiscourseDocumentGraph()
        >>> d.add_node(1, {'node'})

        # adding the same node with a different layer
        >>> d.add_node(1, {'number'})
        >>> d.nodes(data=True)
        [(1, {'layers': {'node', 'number'}})]

        Use keywords set/change node attributes:

        >>> d.add_node(1, {'node'}, size=10)
        >>> d.add_node(3, layers={'num'}, weight=0.4, UTM=('13S',382))
        >>> d.nodes(data=True)
        [(1, {'layers': {'node', 'number'}, 'size': 10}),
         (3, {'UTM': ('13S', 382), 'layers': {'num'}, 'weight': 0.4})]

        Notes
        -----
        A hashable object is one that can be used as a key in a Python
        dictionary. This includes strings, numbers, tuples of strings
        and numbers, etc.

        On many platforms hashable items also include mutables such as
        NetworkX Graphs, though one should be careful that the hash
        doesn't change on mutables.
        """
        assert isinstance(layers, set), \
            "'layers' parameter must be given as a set of strings."
        assert all((isinstance(layer, str) for layer in layers)), \
            "All elements of the 'layers' set must be strings."
        # add layers to keyword arguments dict
        attr.update({'layers': layers})
        attr_dict = self._merge_attr_dict(attr_dict, attr)

        # if there's no node with this ID in the graph, yet
        if n not in self.succ:
            self.succ[n] = {}
            self.pred[n] = {}
            self.node[n] = attr_dict
        else:
            # the node exists: update its attributes, but never replace
            # its layers. the new 'layers' value is the union of the
            # existing layers and the given ones (computed before the
            # update below overwrites the 'layers' entry).
            all_layers = self.node[n]['layers'].union(layers)
            self.node[n].update(attr_dict)
            self.node[n]['layers'] = all_layers

    def add_nodes_from(self, nodes, **attr):
        """Add multiple nodes.

        Parameters
        ----------
        nodes : iterable container of (node, attribute dict) tuples.
            Node attributes are updated using the attribute dict.
            Each attribute dict must contain a 'layers' entry
            (set of str).
        attr : keyword arguments, optional (default= no attributes)
            Update attributes for all nodes in nodes.
            Node attributes specified in nodes as a tuple
            take precedence over attributes specified generally.

        Raises
        ------
        AssertionError
            If a node's attribute dict has no 'layers' entry, or the
            entry is not a set of str.

        See Also
        --------
        add_node

        Examples
        --------
        >>> d.add_nodes_from([(1, {'layers':{'token'}, 'word':'hello'}),
        ...                   (2, {'layers':{'token'}, 'word':'world'})])
        >>> d.nodes(data=True)
        [(1, {'layers': {'token'}, 'word': 'hello'}),
         (2, {'layers': {'token'}, 'word': 'world'})]

        Use keywords to update specific node attributes for every node.

        >>> d.add_nodes_from(d.nodes(data=True), weight=1.0)
        >>> d.nodes(data=True)
        [(1, {'layers': {'token'}, 'weight': 1.0, 'word': 'hello'}),
         (2, {'layers': {'token'}, 'weight': 1.0, 'word': 'world'})]

        Use (node, attrdict) tuples to update attributes for specific
        nodes.

        >>> d.add_nodes_from([(1, {'layers': {'tiger'}})], size=10)
        >>> d.nodes(data=True)
        [(1, {'layers': {'tiger', 'token'}, 'size': 10, 'weight': 1.0, 'word': 'hello'}),
         (2, {'layers': {'token'}, 'weight': 1.0, 'word': 'world'})]
        """
        additional_attribs = attr  # will be added to each node
        for n in nodes:
            node_id, ndict = n
            assert 'layers' in ndict, \
                "Every node must have a 'layers' attribute."
            layers = ndict['layers']
            assert isinstance(layers, set), \
                "'layers' must be specified as a set of strings."
            assert all((isinstance(layer, str) for layer in layers)), \
                "All elements of the 'layers' set must be strings."

            if node_id not in self.succ:  # node doesn't exist, yet
                self.succ[node_id] = {}
                self.pred[node_id] = {}
                newdict = additional_attribs.copy()
                newdict.update(ndict)  # all given attribs incl. layers
                self.node[node_id] = newdict
            else:  # node already exists
                node_attrs = self.node[node_id]
                # compute the union first; the updates below may
                # overwrite the 'layers' entry
                all_layers = node_attrs['layers'].union(layers)

                # general attributes first, node-specific ones last, so
                # that the per-node dict takes precedence (same order as
                # in the new-node branch above)
                node_attrs.update(additional_attribs)
                node_attrs.update(ndict)
                node_attrs['layers'] = all_layers

    def add_edge(self, u, v, layers, key=None, attr_dict=None, **attr):
        """Add an edge between u and v.

        An edge can only be added if the nodes u and v already exist.
        This decision was taken to ensure that all nodes are associated
        with at least one (meaningful) layer.

        Edge attributes can be specified with keywords or by providing
        a dictionary with key/value pairs. In contrast to other
        edge attributes, layers can only be added, not overwritten or
        deleted.

        Parameters
        ----------
        u,v : nodes
            Nodes can be, for example, strings or numbers.
            Nodes must be hashable (and not None) Python objects.
        layers : set of str
            the set of layers the edge belongs to,
            e.g. {'tiger:token', 'anaphoricity:annotation'}
        key : hashable identifier, optional (default=lowest unused integer)
            Used to distinguish multiedges between a pair of nodes.
        attr_dict : dictionary, optional (default= no attributes)
            Dictionary of edge attributes.  Key/value pairs will
            update existing data associated with the edge.
            Note: the given dictionary is updated in place with
            ``attr`` and the ``layers`` entry.
        attr : keyword arguments, optional
            Edge data (or labels or objects) can be assigned using
            keyword arguments.

        Raises
        ------
        AssertionError
            If ``layers`` is not a set of str, or if ``u`` or ``v`` is
            not a node of the graph.
        AttributeError
            If ``attr_dict`` is not a dictionary.

        See Also
        --------
        add_edges_from : add a collection of edges

        Notes
        -----
        To replace/update edge data, use the optional key argument
        to identify a unique edge.  Otherwise a new edge will be created.

        NetworkX algorithms designed for weighted graphs cannot use
        multigraphs directly because it is not clear how to handle
        multiedge weights.  Convert to Graph using edge attribute
        'weight' to enable weighted graph algorithms.

        Examples
        --------
        >>> from discoursegraphs import  DiscourseDocumentGraph
        >>> d = DiscourseDocumentGraph()
        >>> d.add_nodes_from([(1, {'layers':{'token'}, 'word':'hello'}),
        ...                   (2, {'layers':{'token'}, 'word':'world'})])

        >>> d.edges(data=True)
        []

        >>> d.add_edge(1, 2, layers={'generic'})

        >>> d.add_edge(1, 2, layers={'tokens'}, weight=0.7)

        >>> d.edges(data=True)
        [(1, 2, {'layers': {'generic'}}),
         (1, 2, {'layers': {'tokens'}, 'weight': 0.7})]

        >>> d.edge[1][2]
        {0: {'layers': {'generic'}}, 1: {'layers': {'tokens'}, 'weight': 0.7}}

        >>> d.add_edge(1, 2, layers={'tokens'}, key=1, weight=1.0)
        >>> d.edges(data=True)
        [(1, 2, {'layers': {'generic'}}),
         (1, 2, {'layers': {'tokens'}, 'weight': 1.0})]

        >>> d.add_edge(1, 2, layers={'foo'}, key=1, weight=1.0)
        >>> d.edges(data=True)
        [(1, 2, {'layers': {'generic'}}),
         (1, 2, {'layers': {'foo', 'tokens'}, 'weight': 1.0})]
        """
        assert isinstance(layers, set), \
            "'layers' parameter must be given as a set of strings."
        assert all((isinstance(layer, str) for layer in layers)), \
            "All elements of the 'layers' set must be strings."
        # add layers to keyword arguments dict
        attr.update({'layers': layers})
        attr_dict = self._merge_attr_dict(attr_dict, attr)

        assert u in self.succ, "from-node doesn't exist, yet"
        assert v in self.succ, "to-node doesn't exist, yet"

        if v in self.succ[u]:  # if there's already an edge from u to v
            keydict = self.adj[u][v]
            if key is None:  # creating additional edge
                # find a unique integer key
                # other methods might be better here?
                key = len(keydict)
                while key in keydict:
                    key += 1
            datadict = keydict.get(key, {})  # works for existing & new edge
            # compute the union before update() overwrites 'layers'
            all_layers = datadict.get('layers', set()).union(layers)

            datadict.update(attr_dict)
            datadict['layers'] = all_layers
            keydict[key] = datadict

        else:  # there's no edge between u and v, yet
            # selfloops work this way without special treatment
            if key is None:
                key = 0
            datadict = {}
            datadict.update(attr_dict)  # includes layers
            keydict = {key: datadict}
            # both directions share the same key dict
            self.succ[u][v] = keydict
            self.pred[v][u] = keydict

    def add_edges_from(self, ebunch, attr_dict=None, **attr):
        """Add all the edges in ebunch.

        Parameters
        ----------
        ebunch : container of edges
            Each edge given in the container will be added to the
            graph. The edges can be:

                - 3-tuples (u,v,d) for an edge attribute dict d, or
                - 4-tuples (u,v,k,d) for an edge identified by key k

            Each edge must have a layers attribute (set of str).
        attr_dict : dictionary, optional  (default= no attributes)
            Dictionary of edge attributes.  Key/value pairs will
            update existing data associated with each edge.
            Note: the given dictionary is updated in place with
            ``attr``.
        attr : keyword arguments, optional
            Edge data (or labels or objects) can be assigned using
            keyword arguments.

        Raises
        ------
        AttributeError
            If ``attr_dict`` is not a dictionary, or if an edge tuple
            is neither a 3-tuple nor a 4-tuple.
        AssertionError
            If an edge has no valid 'layers' attribute, if the 'layers'
            given in ``attr_dict``/``attr`` are not a set of str, or if
            the source or target node is not in the graph.

        See Also
        --------
        add_edge : add a single edge

        Notes
        -----
        A 3-tuple always creates a new (parallel) edge. A 4-tuple whose
        key identifies an existing edge updates the data of that edge.

        An edge can only be added if the source and target nodes are
        already present in the graph. This decision was taken to ensure
        that all edges are associated with at least one (meaningful)
        layer.

        Edge attributes specified in edges as a tuple (in ebunch) take
        precedence over attributes specified otherwise (in attr_dict or
        attr). Layers can only be added (via a 'layers' edge attribute),
        but not overwritten.

        Examples
        --------
        >>> d = DiscourseDocumentGraph()
        >>> d.add_node(1, {'int'})
        >>> d.add_node(2, {'int'})

        >>> d.add_edges_from([(1, 2, {'layers': {'int'}, 'weight': 23})])
        >>> d.add_edges_from([(1, 2, {'layers': {'int'}, 'weight': 42})])

        >>> d.edges(data=True) # multiple edges between the same nodes
        [(1, 2, {'layers': {'int'}, 'weight': 23}),
         (1, 2, {'layers': {'int'}, 'weight': 42})]

        Associate data to edges

        We update the existing edge (key=0) and overwrite its 'weight'
        value. Note that we can't overwrite the 'layers' value, though.
        Instead, they are added to the set of existing layers

        >>> d.add_edges_from([(1, 2, 0, {'layers':{'number'}, 'weight':66})])
        >>> d.edges(data=True)
        [(1, 2, {'layers': {'int', 'number'}, 'weight': 66}),
         (1, 2, {'layers': {'int'}, 'weight': 42})]
        """
        attr_dict = self._merge_attr_dict(attr_dict, attr)

        # attributes (other than layers) shared by all edges in ebunch
        shared_attrs = {k: v for (k, v) in attr_dict.items()
                        if k != 'layers'}
        # layers given via attr_dict / **attr are added to every edge
        additional_layers = attr_dict.get('layers', set())

        # process ebunch
        for e in ebunch:
            ne = len(e)
            if ne == 4:
                u, v, key, dd = e
            elif ne == 3:
                u, v, dd = e
                key = None
            else:
                raise AttributeError(
                    "Edge tuple %s must be a 3-tuple (u,v,attribs) "
                    "or 4-tuple (u,v,key,attribs)." % (e,))

            assert 'layers' in dd, \
                "Every edge must have a 'layers' attribute."
            layers = dd['layers']
            assert isinstance(layers, set), \
                "'layers' must be specified as a set of strings."
            assert all((isinstance(layer, str)
                        for layer in layers)), \
                "All elements of the 'layers' set must be strings."
            if additional_layers:
                assert isinstance(additional_layers, set), \
                    "'layers' must be specified as a set of strings."
                assert all((isinstance(layer, str)
                            for layer in additional_layers)), \
                    "'layers' set must only contain strings."
            # union of layers specified in ebunch tuples,
            # attr_dict and **attr
            new_layers = layers.union(additional_layers)

            # shared attributes first, then the edge's own attributes,
            # so that the attributes from the ebunch tuple win
            edge_attrs = dict(shared_attrs)
            edge_attrs.update((k, val) for (k, val) in dd.items()
                              if k != 'layers')

            # add_edge() checks that u and v exist, picks an unused key
            # if none is given, keeps the attributes an existing edge
            # already has and merges new_layers into its layers.
            self.add_edge(u, v, layers=new_layers, key=key,
                          attr_dict=edge_attrs)
Navigation

Source code for discoursegraphs.dg

Quick search

Navigation