Source code for libpgm.tablecpdfactorization

# Copyright (c) 2012, CyberPoint International, LLC
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the CyberPoint International, LLC nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL CYBERPOINT INTERNATIONAL, LLC BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
This module provides tools for creating and using factorized representations of Bayesian networks. A factorized representation is a discrete CPD whose values have been flattened into a single array, while the cardinality and stride of each variable in its scope are tracked separately. With the proper setup, these flattened structures can be multiplied together, reduced, and otherwise operated on efficiently. For more information on factors, cf. Koller et al., Ch. 4.

'''
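
# An illustrative note (not part of the library): in the flattened layout the
# docstring describes, the first variable in a factor's scope has stride 1 and
# each later variable's stride is the product of the cardinalities of the
# variables before it. For a factor over A (cardinality 2) and B (cardinality
# 3), with stride = {"A": 1, "B": 2}, the entry for the assignment
# (A = a_i, B = b_j) sits at flat index i * stride["A"] + j * stride["B"], so
# vals = [P(a0,b0), P(a1,b0), P(a0,b1), P(a1,b1), P(a0,b2), P(a1,b2)].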

from tablecpdfactor import TableCPDFactor
from discretebayesiannetwork import DiscreteBayesianNetwork

import random
import copy

class TableCPDFactorization():
    '''
    This class represents a factorized Bayesian network with discrete CPD tables. It contains the attributes *bn*, *originalfactorlist*, and *factorlist*, and the methods *refresh*, *sumproductve*, *sumproducteliminatevar*, *condprobve*, *specificquery*, and *gibbssample*.

    '''
    def __init__(self, bn):
        '''
        This class is constructed with a :doc:`DiscreteBayesianNetwork <discretebayesiannetwork>` instance as its argument. First, it stores the input in the *bn* attribute. Then, it transforms each node's information from standard discrete CPD form into a :doc:`TableCPDFactor <tablecpdfactor>` instance and stores the instances in a list in the attribute *originalfactorlist*. Finally, it makes a working copy of this list and stores it in *factorlist*.

        '''
        assert isinstance(bn, DiscreteBayesianNetwork), "Input must be a DiscreteBayesianNetwork instance."

        self.bn = bn
        '''The Bayesian network used as argument at instantiation.'''

        self.originalfactorlist = []
        '''A list of :doc:`TableCPDFactor <tablecpdfactor>` instances, one per node.'''
        for vertex in bn.V:
            factor = TableCPDFactor(vertex, bn)
            self.originalfactorlist.append(factor)

        self.factorlist = copy.copy(self.originalfactorlist)
        '''A working copy of *originalfactorlist*.'''

        assert self.factorlist, "Factor list not properly loaded, check for an incomplete class instance as input."
    def refresh(self):
        '''
        Refresh the *factorlist* attribute to equate with *originalfactorlist*. This is in effect a reset of the system, erasing any changes to *factorlist* that the program has executed.

        '''
        self.factorlist = copy.copy(self.originalfactorlist)
    def sumproducteliminatevar(self, vertex):
        '''
        Multiply together all the factors in *factorlist* that have *vertex* in their scope, then sum out *vertex* from the resulting product factor. Replace all factors that were multiplied together with the resulting summed-out product.

        Arguments:
            1. *vertex* -- The name of the variable to eliminate.

        Attributes modified:
            1. *factorlist* -- Modified to reflect the eliminated variable.

        For more information on this algorithm cf. Koller et al. 298.

        '''
        factors1 = []
        factors2 = []
        for factor in self.factorlist:
            try:
                factor.scope.index(vertex)
                factors1.append(factor)
            except ValueError:
                factors2.append(factor)

        # if no factor mentions the vertex, there is nothing to eliminate
        if not factors1:
            self.factorlist = factors2
            return

        # multiply the factors in factors1 together
        for i in range(1, len(factors1)):
            factors1[0].multiplyfactor(factors1[i])

        # sum out the vertex from the product factor
        factors1[0].sumout(vertex)

        # add the result to the rest of the factors and store
        if factors1[0] is not None:
            factors2.append(factors1[0])
        self.factorlist = factors2
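
    # An illustrative walk-through (hypothetical factors, in the style of the
    # student network used in the docstring examples below): suppose
    # factorlist holds factors with scopes ["Intelligence"],
    # ["Grade", "Intelligence"], and ["Letter", "Grade"]. Calling
    # sumproducteliminatevar("Grade") multiplies the last two factors into one
    # over {Grade, Intelligence, Letter}, sums "Grade" out of the product, and
    # leaves factorlist holding two factors: one over {Intelligence} and one
    # over {Intelligence, Letter}.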
    def sumproductve(self, vertices):
        '''
        Eliminate each vertex in *vertices* from *factorlist* using *sumproducteliminatevar*.

        Arguments:
            1. *vertices* -- A list of the names of the vertices to be eliminated.

        Attributes modified:
            1. *factorlist* -- Modified to become a single factor representing the remaining variables.

        '''
        # eliminate the vertices one by one
        for vertex in vertices:
            self.sumproducteliminatevar(vertex)

        # multiply together any factors that remain
        for i in range(1, len(self.factorlist)):
            self.factorlist[0].multiplyfactor(self.factorlist[i])
        self.factorlist = self.factorlist[0]
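
    # A note on cost (illustrative): the running time of this procedure is
    # dominated by the size of the largest intermediate factor created by
    # sumproducteliminatevar, which depends on the order of *vertices*;
    # cf. Koller et al. Ch. 9 on choosing elimination orderings.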
    def condprobve(self, query, evidence):
        '''
        Eliminate all variables in *factorlist* except for the ones queried. Adjust all distributions for the evidence given. Return the probability distribution over the set of variables given by the keys of *query*, conditioned on *evidence*.

        Arguments:
            1. *query* -- A dict containing (variable: value) pairs that specify what outcome to calculate the probability of.
            2. *evidence* -- A dict containing (variable: value) pairs that specify what is known about the system.

        Attributes modified:
            1. *factorlist* -- Modified to be one factor representing the probability distribution of the query variables given the evidence.

        The function returns *factorlist* after it has been modified as above.

        Usage example: this code would return the distribution over a queried node, given evidence::

            import json

            from libpgm.graphskeleton import GraphSkeleton
            from libpgm.nodedata import NodeData
            from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
            from libpgm.tablecpdfactorization import TableCPDFactorization

            # load nodedata and graphskeleton
            nd = NodeData()
            skel = GraphSkeleton()
            nd.load("../tests/unittestdict.txt")
            skel.load("../tests/unittestdict.txt")

            # topologically order graphskeleton
            skel.toporder()

            # load evidence and query
            evidence = dict(Letter='weak')
            query = dict(Grade='A')

            # load bayesian network
            bn = DiscreteBayesianNetwork(skel, nd)

            # load factorization
            fn = TableCPDFactorization(bn)

            # calculate probability distribution
            result = fn.condprobve(query, evidence)

            # output
            print json.dumps(result.vals, indent=2)
            print json.dumps(result.scope, indent=2)
            print json.dumps(result.card, indent=2)
            print json.dumps(result.stride, indent=2)

        '''
        assert (isinstance(query, dict) and isinstance(evidence, dict)), "First and second args must be dicts."

        # copy the vertex list so that removals do not mutate self.bn.V
        eliminate = list(self.bn.V)
        for key in query.keys():
            eliminate.remove(key)
        for key in evidence.keys():
            eliminate.remove(key)

        # modify factors to account for E = e
        for key in evidence.keys():
            for x in range(len(self.factorlist)):
                if (self.factorlist[x].scope.count(key) > 0):
                    self.factorlist[x].reducefactor(key, evidence[key])
            for x in reversed(range(len(self.factorlist))):
                if (self.factorlist[x].scope == []):
                    del(self.factorlist[x])

        # eliminate all other variables to produce the query distribution
        self.sumproductve(eliminate)

        # normalize the result
        summ = 0
        lngth = len(self.factorlist.vals)
        for x in range(lngth):
            summ += self.factorlist.vals[x]
        for x in range(lngth):
            self.factorlist.vals[x] /= summ

        # return the table
        return self.factorlist
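
    # Interpreting the return value (illustrative, with hypothetical outcome
    # names): if result.scope == ["Grade"] and
    # self.bn.Vdata["Grade"]["vals"] == ["A", "B", "C"], then result.vals[0]
    # is P(Grade = 'A' | evidence), result.vals[1] is
    # P(Grade = 'B' | evidence), and so on, because a factor over a single
    # variable has stride 1 for that variable.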
    def specificquery(self, query, evidence):
        '''
        Eliminate all variables except for the ones specified by *query*. Adjust all distributions to reflect *evidence*. Return the entry that matches the exact probability of a specific event, as specified by *query*.

        Arguments:
            1. *query* -- A dict containing (variable: [list of values]) pairs that specify the outcome or outcomes to calculate the probability of.
            2. *evidence* -- A dict containing (variable: value) pairs reflecting evidence that is known about the system.

        Attributes modified:
            1. *factorlist* -- Modified as in *condprobve*.

        The function then chooses the entries of *factorlist* that match the queried event or events. It then operates on them to return the probability that the event (or events) specified will occur, represented as a float between 0 and 1.

        Note that in this function, queries of the type P((x=A or x=B) and (y=C or y=D)) are permitted. They are executed by formatting the *query* dictionary like so::

            {
                "x": ["A", "B"],
                "y": ["C", "D"]
            }

        Usage example: this code would answer the specific query that vertex ``Grade`` gets outcome ``A`` given that ``Letter`` has outcome ``weak``, in :doc:`this Bayesian network <unittestdict>`::

            import json

            from libpgm.graphskeleton import GraphSkeleton
            from libpgm.nodedata import NodeData
            from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
            from libpgm.tablecpdfactorization import TableCPDFactorization

            # load nodedata and graphskeleton
            nd = NodeData()
            skel = GraphSkeleton()
            nd.load("../tests/unittestdict.txt")
            skel.load("../tests/unittestdict.txt")

            # topologically order graphskeleton
            skel.toporder()

            # load evidence and query; note that query values are lists
            evidence = dict(Letter='weak')
            query = dict(Grade=['A'])

            # load bayesian network
            bn = DiscreteBayesianNetwork(skel, nd)

            # load factorization
            fn = TableCPDFactorization(bn)

            # calculate the answer to the specific query
            result = fn.specificquery(query, evidence)

            # output
            print result

        '''
        assert (isinstance(query, dict) and isinstance(evidence, dict)), "First and second args must be dicts."

        self.condprobve(query, evidence)

        visited = dict()
        rindices = dict()
        findices = []

        # find the index of each possible value of each query variable, store in rindices
        for var in query.keys():
            rindices[var] = []
            visited[var] = False
            for poss in query[var]:
                rindices[var].append(self.bn.Vdata[var]["vals"].index(poss))

        # define function to iterate recursively through all combinations of query outcomes
        def findentry(var, index):
            visited[var] = True

            for x in range(len(rindices[var])):
                newindex = index + rindices[var][x] * self.factorlist.stride[var]
                if (visited.values().count(False) > 0):
                    # recurse on the next unvisited variable; keys() and
                    # values() of the same dict correspond index-by-index
                    i = visited.values().index(False)
                    nextvar = visited.keys()[i]
                    findentry(nextvar, newindex)
                else:
                    findices.append(newindex)

            visited[var] = False
            return

        # calculate all relevant entries
        findentry(query.keys()[0], 0)

        # sum the entries found
        fanswer = 0
        for findex in findices:
            fanswer += self.factorlist.vals[findex]

        # return the result
        return fanswer
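
    # A sketch of the index arithmetic above (hypothetical numbers): if
    # condprobve leaves factorlist with scope ["x", "y"], card [2, 2], and
    # stride {"x": 1, "y": 2}, the query {"x": ["A", "B"], "y": ["C"]} yields
    # rindices {"x": [0, 1], "y": [0]}, so findentry visits flat indices
    # 0*1 + 0*2 = 0 and 1*1 + 0*2 = 1, and the method returns
    # vals[0] + vals[1] = P(x=A, y=C) + P(x=B, y=C), given the evidence.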
    def gibbssample(self, evidence, n):
        '''
        Return a sequence of *n* samples using the Gibbs sampling method, given evidence specified by *evidence*. Gibbs sampling is a technique in which, for each sample, each variable in turn is erased and resampled conditioned on the current outcomes of its neighbors. This method starts by sampling from the 'prior distribution,' which is the distribution not conditioned on evidence, but the samples provably get closer and closer to the posterior distribution, which is the distribution conditioned on the evidence. It is thus a good way to deal with evidence when generating random samples.

        Arguments:
            1. *evidence* -- A dict containing (variable: value) pairs that specify what is known about the system.
            2. *n* -- The number of samples to return.

        Returns: A list of *n* random samples, each element of which is a dict containing (vertex: value) pairs.

        For more information, cf. Koller et al. Ch. 12.3.1.

        Usage example: this code would generate a sequence of 10 samples::

            import json

            from libpgm.graphskeleton import GraphSkeleton
            from libpgm.nodedata import NodeData
            from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
            from libpgm.tablecpdfactorization import TableCPDFactorization

            # load nodedata and graphskeleton
            nd = NodeData()
            skel = GraphSkeleton()
            nd.load("../tests/unittestdict.txt")
            skel.load("../tests/unittestdict.txt")

            # topologically order graphskeleton
            skel.toporder()

            # load evidence
            evidence = dict(Letter='weak')

            # load bayesian network
            bn = DiscreteBayesianNetwork(skel, nd)

            # load factorization
            fn = TableCPDFactorization(bn)

            # sample
            result = fn.gibbssample(evidence, 10)

            # output
            print json.dumps(result, indent=2)

        '''
        self.refresh()
        random.seed()

        # declare result array
        seq = []

        # create initial instantiation and clamp the evidence
        initial = self.bn.randomsample(1)
        for key in evidence.keys():
            initial[0][key] = evidence[key]
        seq.append(initial[0])

        # find the nodes that we are sampling (all non-evidence nodes)
        order = []
        for vertex in self.bn.V:
            if vertex not in evidence.keys():
                order.append(vertex)

        # reduce factorlist given E = e
        for key in evidence.keys():
            for x in range(len(self.factorlist)):
                if (self.factorlist[x].scope.count(key) > 0):
                    self.factorlist[x].reducefactor(key, evidence[key])
            for x in reversed(range(len(self.factorlist))):
                if (self.factorlist[x].scope == []):
                    del(self.factorlist[x])

        # define function to create the next instantiation
        def next(current):
            for node in order:

                # multiply together copies of all factors that mention the node
                relevantfactors = []
                for factor in self.factorlist:
                    if (factor.scope.count(node) > 0):
                        factorcopy = factor.copy()
                        relevantfactors.append(factorcopy)
                for j in range(1, len(relevantfactors)):
                    relevantfactors[0].multiplyfactor(relevantfactors[j])

                # reduce on the current values of the other nodes, leaving only *node*
                for othernode in order:
                    if (othernode != node and relevantfactors[0].scope.count(othernode) > 0):
                        relevantfactors[0].reducefactor(othernode, current[othernode])

                # renormalize
                summ = 0
                for val in relevantfactors[0].vals:
                    summ += val
                for x in range(len(relevantfactors[0].vals)):
                    relevantfactors[0].vals[x] /= summ

                # sample the node's value by inverting a uniform random number
                val = random.random()
                lboundary = 0
                uboundary = 0
                rindex = len(relevantfactors[0].vals) - 1   # fallback guards against floating-point round-off
                for x in range(len(relevantfactors[0].vals)):
                    uboundary += relevantfactors[0].vals[x]
                    if (lboundary <= val and val < uboundary):
                        rindex = x
                        break
                    else:
                        lboundary = uboundary

                # modify the result
                current[node] = self.bn.Vdata[node]["vals"][rindex]

            return current

        # run the next() function n-1 times, copying the previous sample each time
        for u in range(n - 1):
            lastsample = dict()
            for entry in seq[u]:
                lastsample[entry] = seq[u][entry]
            seq.append(next(lastsample))

        # return all samples
        return seq
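
# A minimal end-to-end sketch, assuming this file is run from inside the
# libpgm package directory and that ../tests/unittestdict.txt exists, as in
# the docstring examples above. A fresh TableCPDFactorization is built before
# each query because condprobve and specificquery consume factorlist.
if __name__ == "__main__":
    import json

    from graphskeleton import GraphSkeleton
    from nodedata import NodeData

    # load the node data and graph skeleton, then order the skeleton
    nd = NodeData()
    skel = GraphSkeleton()
    nd.load("../tests/unittestdict.txt")
    skel.load("../tests/unittestdict.txt")
    skel.toporder()

    bn = DiscreteBayesianNetwork(skel, nd)
    evidence = dict(Letter='weak')

    # full conditional distribution over the queried node
    fn = TableCPDFactorization(bn)
    result = fn.condprobve(dict(Grade='A'), evidence)
    print json.dumps(result.vals, indent=2)

    # a single probability entry; note the list-valued query format
    fn = TableCPDFactorization(bn)
    print fn.specificquery(dict(Grade=['A']), evidence)

    # ten Gibbs samples conditioned on the evidence
    fn = TableCPDFactorization(bn)
    print json.dumps(fn.gibbssample(evidence, 10), indent=2)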