Source code for libpgm.pgmlearner

# Copyright (c) 2012, CyberPoint International, LLC
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the CyberPoint International, LLC nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL CYBERPOINT INTERNATIONAL, LLC BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
This module provides tools to generate Bayesian networks that are "learned" from a data set. The learning process involves finding the Bayesian network that most accurately models data given as input -- in other words, finding the Bayesian network that makes the data set most likely. Bayesian network learning has two major parts: structure learning and parameter learning. Structure learning means finding the graph that most accurately depicts the dependencies detected in the data. Parameter learning means adjusting the parameters of the CPDs in a graph skeleton to most accurately model the data. This module has tools for both of these tasks; a short, runnable end-to-end sketch appears at the bottom of this file.

'''
import copy
import json
import itertools
try:
    import numpy as np
except ImportError:
    raise ImportError, "numpy is not installed on your system."

try: 
    from scipy.stats import chisquare
except ImportError:
    raise ImportError, "scipy is not installed on your system."


from graphskeleton import GraphSkeleton
from discretebayesiannetwork import DiscreteBayesianNetwork
from lgbayesiannetwork import LGBayesianNetwork
from sampleaggregator import SampleAggregator

class PGMLearner():
    '''
    This class is a machine with tools for learning Bayesian networks from data. It contains the *discrete_mle_estimateparams*, *lg_mle_estimateparams*, *discrete_constraint_estimatestruct*, *lg_constraint_estimatestruct*, *discrete_condind*, *discrete_estimatebn*, and *lg_estimatebn* methods.

    '''
    def discrete_mle_estimateparams(self, graphskeleton, data):
        '''
        Estimate parameters for a discrete Bayesian network with a structure given by *graphskeleton* in order to maximize the probability of data given by *data*. This function takes the following arguments:

            1. *graphskeleton* -- An instance of the :doc:`GraphSkeleton <graphskeleton>` class containing vertex and edge data.
            2. *data* -- A list of dicts containing samples from the network in {vertex: value} format. Example::

                    [
                        {
                            'Grade': 'B',
                            'SAT': 'lowscore',
                            ...
                        },
                        ...
                    ]

        This function normalizes the distribution of a node's outcomes for each combination of its parents' outcomes. In doing so it creates an estimated tabular conditional probability distribution for each node. It then instantiates a :doc:`DiscreteBayesianNetwork <discretebayesiannetwork>` instance based on the *graphskeleton*, and modifies that instance's *Vdata* attribute to reflect the estimated CPDs. It then returns the instance.

        The Vdata attribute instantiated is in the format seen in :doc:`unittestdict`, as described in :doc:`discretebayesiannetwork`.

        Usage example: this would learn parameters from a set of 200 discrete samples::

            import json

            from libpgm.nodedata import NodeData
            from libpgm.graphskeleton import GraphSkeleton
            from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
            from libpgm.pgmlearner import PGMLearner

            # generate some data to use
            nd = NodeData()
            nd.load("../tests/unittestdict.txt")    # an input file
            skel = GraphSkeleton()
            skel.load("../tests/unittestdict.txt")
            skel.toporder()
            bn = DiscreteBayesianNetwork(skel, nd)
            data = bn.randomsample(200)

            # instantiate my learner
            learner = PGMLearner()

            # estimate parameters from data and skeleton
            result = learner.discrete_mle_estimateparams(skel, data)

            # output
            print json.dumps(result.Vdata, indent=2)

        '''
        assert (isinstance(graphskeleton, GraphSkeleton)), "First arg must be a loaded GraphSkeleton class."
        assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Second arg must be a list of dicts."

        # instantiate Bayesian network, and add parent and children data
        bn = DiscreteBayesianNetwork()
        graphskeleton.toporder()
        bn.V = graphskeleton.V
        bn.E = graphskeleton.E
        bn.Vdata = dict()
        for vertex in bn.V:
            bn.Vdata[vertex] = dict()
            bn.Vdata[vertex]["children"] = graphskeleton.getchildren(vertex)
            bn.Vdata[vertex]["parents"] = graphskeleton.getparents(vertex)

            # make placeholders for vals, cprob, and numoutcomes
            bn.Vdata[vertex]["vals"] = []
            if (bn.Vdata[vertex]["parents"] == []):
                bn.Vdata[vertex]["cprob"] = []
            else:
                bn.Vdata[vertex]["cprob"] = dict()

            bn.Vdata[vertex]["numoutcomes"] = 0

        # determine which outcomes are possible for each node
        for sample in data:
            for vertex in bn.V:
                if (sample[vertex] not in bn.Vdata[vertex]["vals"]):
                    bn.Vdata[vertex]["vals"].append(sample[vertex])
                    bn.Vdata[vertex]["numoutcomes"] += 1

        # lay out probability tables, and put a [num, denom] entry in all spots:

        # define helper function to recursively set up cprob table
        def addlevel(vertex, _dict, key, depth, totaldepth):
            if depth == totaldepth:
                _dict[str(key)] = []
                for _ in range(bn.Vdata[vertex]["numoutcomes"]):
                    _dict[str(key)].append([0, 0])
                return
            else:
                for val in bn.Vdata[bn.Vdata[vertex]["parents"][depth]]["vals"]:
                    ckey = key[:]
                    ckey.append(str(val))
                    addlevel(vertex, _dict, ckey, depth+1, totaldepth)

        # put [0, 0] at each entry of cprob table
        for vertex in bn.V:
            if (bn.Vdata[vertex]["parents"]):
                root = bn.Vdata[vertex]["cprob"]
                numparents = len(bn.Vdata[vertex]["parents"])
                addlevel(vertex, root, [], 0, numparents)
            else:
                for _ in range(bn.Vdata[vertex]["numoutcomes"]):
                    bn.Vdata[vertex]["cprob"].append([0, 0])

        # fill out entries with samples:
        for sample in data:
            for vertex in bn.V:

                # compute index of result
                rindex = bn.Vdata[vertex]["vals"].index(sample[vertex])

                # go to correct place in Vdata
                if bn.Vdata[vertex]["parents"]:
                    pvals = [str(sample[t]) for t in bn.Vdata[vertex]["parents"]]
                    lev = bn.Vdata[vertex]["cprob"][str(pvals)]
                else:
                    lev = bn.Vdata[vertex]["cprob"]

                # increase all denominators for the current condition
                for entry in lev:
                    entry[1] += 1

                # increase numerator for current outcome
                lev[rindex][0] += 1

        # convert arrays to floats
        for vertex in bn.V:
            if not bn.Vdata[vertex]["parents"]:
                bn.Vdata[vertex]["cprob"] = [x[0]/float(x[1]) for x in bn.Vdata[vertex]["cprob"]]
            else:
                for key in bn.Vdata[vertex]["cprob"].keys():
                    try:
                        bn.Vdata[vertex]["cprob"][key] = [x[0]/float(x[1]) for x in bn.Vdata[vertex]["cprob"][key]]

                    # default to even distribution if no data points
                    except ZeroDivisionError:
                        bn.Vdata[vertex]["cprob"][key] = [1/float(bn.Vdata[vertex]["numoutcomes"]) for x in bn.Vdata[vertex]["cprob"][key]]

        # return cprob table with estimated probability distributions
        return bn

    def lg_mle_estimateparams(self, graphskeleton, data):
        '''
        Estimate parameters for a linear Gaussian Bayesian network with a structure given by *graphskeleton* in order to maximize the probability of data given by *data*. This function takes the following arguments:

            1. *graphskeleton* -- An instance of the :doc:`GraphSkeleton <graphskeleton>` class containing vertex and edge data.
            2. *data* -- A list of dicts containing samples from the network in {vertex: value} format. Example::

                    [
                        {
                            'Grade': 74.343,
                            'Intelligence': 29.545,
                            ...
                        },
                        ...
                    ]

        The algorithm used to calculate the linear Gaussian parameters is beyond the scope of this documentation -- for a full explanation, cf. Koller et al. 729. After the parameters are calculated, the program instantiates a :doc:`LGBayesianNetwork <lgbayesiannetwork>` instance based on the *graphskeleton*, and modifies that instance's *Vdata* attribute to reflect the estimated CPDs. It then returns the instance.

        The Vdata attribute instantiated is in the format seen in the input file example :doc:`unittestdict`, as described in :doc:`discretebayesiannetwork`.

        Usage example: this would learn parameters from a set of 200 linear Gaussian samples::

            import json

            from libpgm.nodedata import NodeData
            from libpgm.graphskeleton import GraphSkeleton
            from libpgm.lgbayesiannetwork import LGBayesianNetwork
            from libpgm.pgmlearner import PGMLearner

            # generate some data to use
            nd = NodeData()
            nd.load("../tests/unittestlgdict.txt")    # an input file
            skel = GraphSkeleton()
            skel.load("../tests/unittestdict.txt")
            skel.toporder()
            lgbn = LGBayesianNetwork(skel, nd)
            data = lgbn.randomsample(200)

            # instantiate my learner
            learner = PGMLearner()

            # estimate parameters
            result = learner.lg_mle_estimateparams(skel, data)

            # output
            print json.dumps(result.Vdata, indent=2)

        '''
        assert (isinstance(graphskeleton, GraphSkeleton)), "First arg must be a loaded GraphSkeleton class."
        assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Second arg must be a list of dicts."

        # instantiate Bayesian network, and add parent and children data
        bn = LGBayesianNetwork()
        graphskeleton.toporder()
        bn.V = graphskeleton.V
        bn.E = graphskeleton.E
        bn.Vdata = dict()
        for vertex in bn.V:
            bn.Vdata[vertex] = dict()
            bn.Vdata[vertex]["children"] = graphskeleton.getchildren(vertex)
            bn.Vdata[vertex]["parents"] = graphskeleton.getparents(vertex)

            # make placeholders for mean_base, mean_scal, and variance
            bn.Vdata[vertex]["mean_base"] = 0.0
            bn.Vdata[vertex]["mean_scal"] = []
            for parent in bn.Vdata[vertex]["parents"]:
                bn.Vdata[vertex]["mean_scal"].append(0.0)
            bn.Vdata[vertex]["variance"] = 0.0

        # make covariance table, array of E[X_i] for each vertex, and table
        # of E[X_i * X_j] for each combination of vertices
        cov = [[0 for _ in range(len(bn.V))] for __ in range(len(bn.V))]
        singletons = [0 for _ in range(len(bn.V))]
        numtrials = len(data)
        for sample in data:
            for x in range(len(bn.V)):
                singletons[x] += sample[bn.V[x]]
                for y in range(len(bn.V)):
                    cov[x][y] += sample[bn.V[x]] * sample[bn.V[y]]
        for x in range(len(bn.V)):
            singletons[x] /= float(numtrials)
            for y in range(len(bn.V)):
                cov[x][y] /= float(numtrials)

        # (save copy. this is the E[X_i * X_j] table)
        product_expectations = [[cov[x][y] for y in range(len(bn.V))] for x in range(len(bn.V))]

        for x in range(len(bn.V)):
            for y in range(len(bn.V)):
                cov[x][y] = cov[x][y] - (singletons[x] * singletons[y])

        # construct system of equations and solve (for each node)
        for x in range(len(bn.V)):

            # start with the E[X_i * X_j] table
            system = [[product_expectations[p][q] for q in range(len(bn.V))] for p in range(len(bn.V))]

            # step 0: remove all entries from all the tables except for node and its parents
            rowstokeep = [x]
            for z in range(len(bn.V)):
                if bn.V[z] in bn.Vdata[bn.V[x]]["parents"]:
                    rowstokeep.append(z)

            smalldim = len(rowstokeep)
            smallsystem = [[0 for _ in range(smalldim)] for __ in range(smalldim)]
            smallcov = [[0 for _ in range(smalldim)] for __ in range(smalldim)]
            smallsing = [0 for _ in range(smalldim)]
            for index in range(len(rowstokeep)):
                smallsing[index] = singletons[rowstokeep[index]]
                for index2 in range(len(rowstokeep)):
                    smallsystem[index][index2] = system[rowstokeep[index]][rowstokeep[index2]]
                    smallcov[index][index2] = cov[rowstokeep[index]][rowstokeep[index2]]

            # step 1: delete and copy row corresponding to node (using [row][column] notation)
            tmparray = [0 for _ in range(smalldim)]
            for y in range(smalldim):
                if (y > 0):
                    for j in range(smalldim):
                        smallsystem[y-1][j] = smallsystem[y][j]
                if (y == 0):
                    for j in range(smalldim):
                        tmparray[j] = smallsystem[y][j]

            # step 2: delete column, leaving system with all entries
            # corresponding to parents of node
            for y in range(smalldim):
                if (y > 0):
                    for j in range(smalldim):
                        smallsystem[j][y-1] = smallsystem[j][y]

            # step 3: take entry for node out of singleton array and store it
            bordarray = []
            for y in range(smalldim):
                if (y != 0):
                    bordarray.append(smallsing[y])
                else:
                    tmpentry = smallsing[y]

            # step 4: add border array on borders of system
            for y in range(len(bordarray)):
                smallsystem[smalldim - 1][y] = bordarray[y]
                smallsystem[y][smalldim - 1] = bordarray[y]
            smallsystem[smalldim - 1][smalldim - 1] = 1

            # step 5: construct equality vector (the 'b' of ax = b)
            evector = [0 for _ in range(smalldim)]
            for y in range(smalldim):
                if (y != smalldim - 1):
                    evector[y] = tmparray[y + 1]
                else:
                    evector[y] = tmpentry

            # use numpy to solve
            a = np.array(smallsystem)
            b = np.array(evector)
            solve = list(np.linalg.solve(a, b))

            # fill mean_base and mean_scal[] with this data
            bn.Vdata[bn.V[x]]["mean_base"] = solve[smalldim - 1]
            for i in range(smalldim - 1):
                bn.Vdata[bn.V[x]]["mean_scal"][i] = solve[i]

            # add variance
            variance = smallcov[0][0]
            for y in range(1, smalldim):
                for z in range(1, smalldim):
                    variance -= (bn.Vdata[bn.V[x]]["mean_scal"][y-1] * bn.Vdata[bn.V[x]]["mean_scal"][z-1] * smallcov[y][z])
            bn.Vdata[bn.V[x]]["variance"] = variance

        # that's all folks
        return bn

    def discrete_constraint_estimatestruct(self, data, pvalparam=0.05, indegree=1):
        '''
        Learn a Bayesian network structure from discrete data given by *data*, using constraint-based approaches. This function first calculates all the independencies and conditional independencies present between variables in the data. To calculate dependencies, it uses the *discrete_condind* method on each pair of variables, conditioned on other sets of variables of size *indegree* or smaller, to generate a chi-squared result and a p-value. If this p-value is less than *pvalparam*, the pair of variables are considered dependent conditioned on the variable set. Once all true dependencies -- pairs of variables that are dependent no matter what they are conditioned by -- are found, the algorithm uses these dependencies to construct a directed acyclic graph. It returns this DAG in the form of a :doc:`GraphSkeleton <graphskeleton>` class.

        Arguments:

            1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

                    [
                        {
                            'Grade': 'B',
                            'SAT': 'lowscore',
                            ...
                        },
                        ...
                    ]

            2. *pvalparam* -- (Optional, default is 0.05) The p-value below which to consider something significantly unlikely. A common number used is 0.05. This is passed to *discrete_condind* when it is called.
            3. *indegree* -- (Optional, default is 1) The upper bound on the size of a witness set (see Koller et al. 85). If this is larger than 1, a huge amount of samples in *data* are required to avoid a divide-by-zero error.

        Usage example: this would learn structure from a set of 8000 discrete samples::

            import json

            from libpgm.nodedata import NodeData
            from libpgm.graphskeleton import GraphSkeleton
            from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
            from libpgm.pgmlearner import PGMLearner

            # generate some data to use
            nd = NodeData()
            nd.load("../tests/unittestdict.txt")    # an input file
            skel = GraphSkeleton()
            skel.load("../tests/unittestdict.txt")
            skel.toporder()
            bn = DiscreteBayesianNetwork(skel, nd)
            data = bn.randomsample(8000)

            # instantiate my learner
            learner = PGMLearner()

            # estimate structure
            result = learner.discrete_constraint_estimatestruct(data)

            # output
            print json.dumps(result.E, indent=2)

        '''
        assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Arg must be a list of dicts."

        # instantiate array of variables and array of potential dependencies
        variables = data[0].keys()
        ovariables = variables[:]
        dependencies = []
        for x in variables:
            ovariables.remove(x)
            for y in ovariables:
                if (x != y):
                    dependencies.append([x, y])

        # define helper function to find subsets
        def subsets(array):
            result = []
            for i in range(indegree + 1):
                comb = itertools.combinations(array, i)
                for c in comb:
                    result.append(list(c))
            return result

        witnesses = []
        othervariables = variables[:]

        # for each pair of variables X, Y:
        for X in variables:
            othervariables.remove(X)
            for Y in othervariables:

                # consider all sets of witnesses that do not have X or Y in
                # them, and are less than or equal to the size specified by
                # the "indegree" argument
                for U in subsets(variables):
                    if (X not in U) and (Y not in U) and len(U) <= indegree:

                        # determine conditional independence
                        chi, pv, witness = self.discrete_condind(data, X, Y, U)
                        if pv > pvalparam:
                            msg = "***%s and %s are found independent (chi = %f, pv = %f) with witness %s***" % (X, Y, chi, pv, U)
                            try:
                                dependencies.remove([X, Y])
                                dependencies.remove([Y, X])
                            except:
                                pass
                            witnesses.append([X, Y, witness])
                            break

        # now that we have found our dependencies, run build PDAG (cf. Koller p. 89)
        # with the stored set of independencies:

        # assemble undirected graph skeleton
        pdag = GraphSkeleton()
        pdag.E = dependencies
        pdag.V = variables

        # adjust for immoralities (cf. Koller 86)
        dedges = [x[:] for x in pdag.E]
        for edge in dedges:
            edge.append('u')

        for edge1 in reversed(dedges):
            for edge2 in reversed(dedges):
                if (edge1 in dedges) and (edge2 in dedges):
                    if edge1[0] == edge2[1]:
                        if (([edge1[1], edge2[0], [edge1[0]]] not in witnesses) and ([edge2[0], edge1[1], [edge1[0]]] not in witnesses)):
                            dedges.append([edge1[1], edge1[0]])
                            dedges.append([edge2[0], edge2[1]])
                            dedges.remove(edge1)
                            dedges.remove(edge2)
                    elif edge1[1] == edge2[0]:
                        if (([edge1[0], edge2[1], [edge1[1]]] not in witnesses) and ([edge2[1], edge1[0], [edge1[1]]] not in witnesses)):
                            dedges.append([edge1[0], edge1[1]])
                            dedges.append([edge2[1], edge2[0]])
                            dedges.remove(edge1)
                            dedges.remove(edge2)
                    elif edge1[1] == edge2[1] and edge1[0] != edge2[0]:
                        if (([edge1[0], edge2[0], [edge1[1]]] not in witnesses) and ([edge2[0], edge1[0], [edge1[1]]] not in witnesses)):
                            dedges.append([edge1[0], edge1[1]])
                            dedges.append([edge2[0], edge2[1]])
                            dedges.remove(edge1)
                            dedges.remove(edge2)
                    elif edge1[0] == edge2[0] and edge1[1] != edge2[1]:
                        if (([edge1[1], edge2[1], [edge1[0]]] not in witnesses) and ([edge2[1], edge1[1], [edge1[0]]] not in witnesses)):
                            dedges.append([edge1[1], edge1[0]])
                            dedges.append([edge2[1], edge2[0]])
                            dedges.remove(edge1)
                            dedges.remove(edge2)

        # define helper method "exists_undirected_edge"
        def exists_undirected_edge(one_end, the_other_end):
            for edge in dedges:
                if len(edge) == 3:
                    if (edge[0] == one_end and edge[1] == the_other_end):
                        return True
                    elif (edge[1] == one_end and edge[0] == the_other_end):
                        return True
            return False

        # use right hand rules to improve graph until convergence (Koller 89)
        olddedges = []
        while (olddedges != dedges):
            olddedges = [x[:] for x in dedges]
            for edge1 in reversed(dedges):
                for edge2 in reversed(dedges):

                    # rule 1
                    inverted = False
                    check1, check2 = False, True
                    if (edge1[1] == edge2[0] and len(edge1) == 2 and len(edge2) == 3):
                        check1 = True
                    elif (edge1[1] == edge2[1] and len(edge1) == 2 and len(edge2) == 3):
                        check1 = True
                        inverted = True
                    for edge3 in dedges:
                        if edge3 != edge1 and ((edge3[0] == edge1[0] and edge3[1] == edge2[1]) or (edge3[1] == edge1[0] and edge3[0] == edge2[1])):
                            check2 = False
                    if check1 == True and check2 == True:
                        if inverted:
                            dedges.append([edge1[1], edge2[0]])
                        else:
                            dedges.append([edge1[1], edge2[1]])
                        dedges.remove(edge2)

                    # rule 2
                    check1, check2 = False, False
                    if (edge1[1] == edge2[0] and len(edge1) == 2 and len(edge2) == 2):
                        check1 = True
                    for edge3 in dedges:
                        if ((edge3[0] == edge1[0] and edge3[1] == edge2[1]) or (edge3[1] == edge1[0] and edge3[0] == edge2[1]) and len(edge3) == 3):
                            check2 = True
                    if check1 == True and check2 == True:
                        if edge3[0] == edge1[0]:
                            dedges.append([edge3[0], edge3[1]])
                        elif edge3[1] == edge1[0]:
                            dedges.append([edge3[1], edge3[0]])
                        dedges.remove(edge3)

                    # rule 3
                    check1, check2 = False, False
                    if len(edge1) == 2 and len(edge2) == 2:
                        if (edge1[1] == edge2[1] and edge1[0] != edge2[0]):
                            check1 = True
                    for v in variables:
                        if (exists_undirected_edge(v, edge1[0]) and exists_undirected_edge(v, edge1[1]) and exists_undirected_edge(v, edge2[0])):
                            check2 = True
                    if check1 == True and check2 == True:
                        dedges.append([v, edge1[1]])
                        for edge3 in dedges:
                            if (len(edge3) == 3 and ((edge3[0] == v and edge3[1] == edge1[1]) or (edge3[1] == v and edge3[0] == edge1[1]))):
                                dedges.remove(edge3)

        # return one possible graph skeleton from the pdag class found
        for x in range(len(dedges)):
            if len(dedges[x]) == 3:
                dedges[x] = dedges[x][:2]
        pdag.E = dedges
        pdag.toporder()
        return pdag

    def lg_constraint_estimatestruct(self, data, pvalparam=0.05, bins=10, indegree=1):
        '''
        Learn a Bayesian network structure from linear Gaussian data given by *data* using constraint-based approaches. This function works by discretizing the linear Gaussian data into *bins* number of bins, and running the *discrete_constraint_estimatestruct* method on that discrete data with *pvalparam* and *indegree* as arguments. It returns the :doc:`GraphSkeleton <graphskeleton>` instance returned by this function.

        Arguments:

            1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

                    [
                        {
                            'Grade': 78.3223,
                            'SAT': 56.33,
                            ...
                        },
                        ...
                    ]

            2. *pvalparam* -- (Optional, default is 0.05) The p-value below which to consider something significantly unlikely. A common number used is 0.05.
            3. *bins* -- (Optional, default is 10) The number of bins to discretize the data into. The method is to find the highest and lowest value, divide that interval uniformly into a certain number of bins, and place the data inside. This number must be chosen carefully in light of the number of trials. There must be at least 5 trials in every bin, with more if the indegree is increased.
            4. *indegree* -- (Optional, default is 1) The upper bound on the size of a witness set (see Koller et al. 85). If this is larger than 1, a huge amount of trials are required to avoid a divide-by-zero error.

        The number of bins and indegree must be chosen carefully based on the size and nature of the data set. Too many bins will lead to not enough data per bin, while too few bins will lead to dependencies not getting noticed.

        Usage example: this would learn structure from a set of 8000 linear Gaussian samples::

            import json

            from libpgm.nodedata import NodeData
            from libpgm.graphskeleton import GraphSkeleton
            from libpgm.lgbayesiannetwork import LGBayesianNetwork
            from libpgm.pgmlearner import PGMLearner

            # generate some data to use
            nd = NodeData()
            nd.load("../tests/unittestlgdict.txt")    # an input file
            skel = GraphSkeleton()
            skel.load("../tests/unittestdict.txt")
            skel.toporder()
            lgbn = LGBayesianNetwork(skel, nd)
            data = lgbn.randomsample(8000)

            # instantiate my learner
            learner = PGMLearner()

            # estimate structure
            result = learner.lg_constraint_estimatestruct(data)

            # output
            print json.dumps(result.E, indent=2)

        '''
        assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Arg must be a list of dicts."

        cdata = copy.deepcopy(data)

        # establish ranges
        ranges = dict()
        for variable in cdata[0].keys():
            ranges[variable] = [float("infinity"), float("infinity") * -1]
        for sample in cdata:
            for var in sample.keys():
                if sample[var] < ranges[var][0]:
                    ranges[var][0] = sample[var]
                if sample[var] > ranges[var][1]:
                    ranges[var][1] = sample[var]

        # discretize cdata set
        bincounts = dict()
        for key in cdata[0].keys():
            bincounts[key] = [0 for _ in range(bins)]
        for sample in cdata:
            for i in range(bins):
                for var in sample.keys():
                    if (sample[var] >= (ranges[var][0] + (ranges[var][1] - ranges[var][0]) * i / float(bins)) and (sample[var] <= (ranges[var][0] + (ranges[var][1] - ranges[var][0]) * (i + 1) / float(bins)))):
                        sample[var] = i
                        bincounts[var][i] += 1

        # run discrete_constraint_estimatestruct
        return self.discrete_constraint_estimatestruct(cdata, pvalparam, indegree)

    def discrete_condind(self, data, X, Y, U):
        '''
        Test how independent a variable *X* and a variable *Y* are in a discrete data set given by *data*, where the independence is conditioned on a set of variables given by *U*. This method works by assuming as a null hypothesis that the variables are conditionally independent on *U*, and thus that:

        .. math::

            P(X, Y, U) = P(U) \\cdot P(X|U) \\cdot P(Y|U)

        It tests the deviance of the data from this null hypothesis, returning the result of a chi-square test and a p-value.

        Arguments:

            1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

                    [
                        {
                            'Grade': 'B',
                            'SAT': 'lowscore',
                            ...
                        },
                        ...
                    ]

            2. *X* -- A variable whose dependence on Y we are testing given U.
            3. *Y* -- A variable whose dependence on X we are testing given U.
            4. *U* -- A list of variables that are given.

        Returns:

            1. *chi* -- The result of the chi-squared test on the data. This is a measure of the deviance of the actual distribution of X and Y given U from the expected distribution of X and Y given U. Since the null hypothesis is that X and Y are independent given U, the expected distribution is that :math:`P(X, Y, U) = P(U) P(X | U) P (Y | U)`.
            2. *pval* -- The p-value of the test, meaning the probability of attaining a chi-square result as extreme as or more extreme than the one found, assuming that the null hypothesis is true. (e.g., a p-value of .05 means that if X and Y were independent given U, the chance of getting a chi-squared result this high or higher are .05)
            3. *U* -- The 'witness' of X and Y's independence. This is the variable that, when it is known, leaves X and Y independent.

        For more information see Koller et al. 790.

        '''
        # find possible outcomes and store
        _outcomes = dict()
        for key in data[0].keys():
            _outcomes[key] = [data[0][key]]
        for sample in data:
            for key in _outcomes.keys():
                if _outcomes[key].count(sample[key]) == 0:
                    _outcomes[key].append(sample[key])

        # store number of outcomes for X, Y, and U
        Xnumoutcomes = len(_outcomes[X])
        Ynumoutcomes = len(_outcomes[Y])
        Unumoutcomes = []
        for val in U:
            Unumoutcomes.append(len(_outcomes[val]))

        # calculate P(U) -- the distribution of U
        PU = 1

        # define helper function to add a dimension to an array recursively
        def add_dimension_to_array(mdarray, size):
            if isinstance(mdarray, list):
                for h in range(len(mdarray)):
                    mdarray[h] = add_dimension_to_array(mdarray[h], size)
                return mdarray
            else:
                mdarray = [0 for _ in range(size)]
                return mdarray

        # make PU the right size
        for size in Unumoutcomes:
            PU = add_dimension_to_array(PU, size)

        # fill with data
        if (len(U) > 0):
            for sample in data:
                tmp = PU
                for x in range(len(U)-1):
                    Uindex = _outcomes[U[x]].index(sample[U[x]])
                    tmp = tmp[Uindex]
                lastindex = _outcomes[U[-1]].index(sample[U[-1]])
                tmp[lastindex] += 1

        # calculate P(X, U) -- the distribution of X and U
        PXandU = [0 for _ in range(Xnumoutcomes)]
        for size in Unumoutcomes:
            PXandU = add_dimension_to_array(PXandU, size)
        for sample in data:
            Xindex = _outcomes[X].index(sample[X])
            if len(U) > 0:
                tmp = PXandU[Xindex]
                for x in range(len(U)-1):
                    Uindex = _outcomes[U[x]].index(sample[U[x]])
                    tmp = tmp[Uindex]
                lastindex = _outcomes[U[-1]].index(sample[U[-1]])
                tmp[lastindex] += 1
            else:
                PXandU[Xindex] += 1

        # calculate P(Y, U) -- the distribution of Y and U
        PYandU = [0 for _ in range(Ynumoutcomes)]
        for size in Unumoutcomes:
            PYandU = add_dimension_to_array(PYandU, size)
        for sample in data:
            Yindex = _outcomes[Y].index(sample[Y])
            if len(U) > 0:
                tmp = PYandU[Yindex]
                for x in range(len(U)-1):
                    Uindex = _outcomes[U[x]].index(sample[U[x]])
                    tmp = tmp[Uindex]
                lastindex = _outcomes[U[-1]].index(sample[U[-1]])
                tmp[lastindex] += 1
            else:
                PYandU[Yindex] += 1

        # assemble P(U)P(X|U)P(Y|U) -- the expected distribution if X and Y are
        # independent given U.
        expected = [[0 for _ in range(Ynumoutcomes)] for __ in range(Xnumoutcomes)]

        # define helper function to multiply the entries of two matrices
        def multiply_entries(matrixa, matrixb):
            matrix1 = copy.deepcopy(matrixa)
            matrix2 = copy.deepcopy(matrixb)
            if isinstance(matrix1, list):
                for h in range(len(matrix1)):
                    matrix1[h] = multiply_entries(matrix1[h], matrix2[h])
                return matrix1
            else:
                return (matrix1 * matrix2)

        # define helper function to divide the entries of two matrices
        def divide_entries(matrixa, matrixb):
            matrix1 = copy.deepcopy(matrixa)
            matrix2 = copy.deepcopy(matrixb)
            if isinstance(matrix1, list):
                for h in range(len(matrix1)):
                    matrix1[h] = divide_entries(matrix1[h], matrix2[h])
                return matrix1
            else:
                return (matrix1 / float(matrix2))

        # combine known graphs to calculate P(U)P(X|U)P(Y|U)
        for x in range(Xnumoutcomes):
            for y in range(Ynumoutcomes):
                product = multiply_entries(PXandU[x], PYandU[y])
                final = divide_entries(product, PU)
                expected[x][y] = final

        # find P(XYU) -- the actual distribution of X, Y, and U -- in sample
        PXYU = [[0 for _ in range(Ynumoutcomes)] for __ in range(Xnumoutcomes)]
        for size in Unumoutcomes:
            PXYU = add_dimension_to_array(PXYU, size)
        for sample in data:
            Xindex = _outcomes[X].index(sample[X])
            Yindex = _outcomes[Y].index(sample[Y])
            if len(U) > 0:
                tmp = PXYU[Xindex][Yindex]
                for x in range(len(U)-1):
                    Uindex = _outcomes[U[x]].index(sample[U[x]])
                    tmp = tmp[Uindex]
                lastindex = _outcomes[U[-1]].index(sample[U[-1]])
                tmp[lastindex] += 1
            else:
                PXYU[Xindex][Yindex] += 1

        # use scipy's chisquare to determine the deviance of the evidence
        a = np.array(expected)
        a = a.flatten()
        b = np.array(PXYU)
        b = b.flatten()

        # delete entries with value 0 (they mess up the chisquare function)
        for i in reversed(range(b.size)):
            if (b[i] == 0):
                if i != 0:
                    a.itemset(i-1, a[i-1]+a[i])
                a = np.delete(a, i)
                b = np.delete(b, i)

        # run chi-squared
        chi, pv = chisquare(a, b)

        # return chi-squared result, p-value for that result, and witness
        return chi, pv, U

    def discrete_estimatebn(self, data, pvalparam=.05, indegree=1):
        '''
        Fully learn a Bayesian network from discrete data given by *data*. This function combines the *discrete_constraint_estimatestruct* method (where it passes in the *pvalparam* and *indegree* arguments) with the *discrete_mle_estimateparams* method. It returns a complete :doc:`DiscreteBayesianNetwork <discretebayesiannetwork>` class instance learned from the data.

        Arguments:

            1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

                    [
                        {
                            'Grade': 'B',
                            'SAT': 'lowscore',
                            ...
                        },
                        ...
                    ]

            2. *pvalparam* -- The p-value below which to consider something significantly unlikely. A common number used is 0.05.
            3. *indegree* -- The upper bound on the size of a witness set (see Koller et al. 85). If this is larger than 1, a huge amount of trials are required to avoid a divide-by-zero error.

        '''
        assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Arg must be a list of dicts."

        # learn graph skeleton
        skel = self.discrete_constraint_estimatestruct(data, pvalparam=pvalparam, indegree=indegree)

        # learn parameters
        bn = self.discrete_mle_estimateparams(skel, data)

        # return
        return bn

    def lg_estimatebn(self, data, pvalparam=.05, bins=10, indegree=1):
        '''
        Fully learn a Bayesian network from linear Gaussian data given by *data*. This function combines the *lg_constraint_estimatestruct* method (where it passes in the *pvalparam*, *bins*, and *indegree* arguments) with the *lg_mle_estimateparams* method. It returns a complete :doc:`LGBayesianNetwork <lgbayesiannetwork>` class instance learned from the data.

        Arguments:

            1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

                    [
                        {
                            'Grade': 75.23423,
                            'SAT': 873.42342,
                            ...
                        },
                        ...
                    ]

            2. *pvalparam* -- The p-value below which to consider something significantly unlikely. A common number used is 0.05.
            3. *bins* -- The number of bins to discretize the data into; this is passed to *lg_constraint_estimatestruct* and must be chosen carefully in light of the number of trials.
            4. *indegree* -- The upper bound on the size of a witness set (see Koller et al. 85). If this is larger than 1, a huge amount of trials are required to avoid a divide-by-zero error.

        Usage example: this would learn entire Bayesian networks from sets of 8000 data points::

            import json

            from libpgm.nodedata import NodeData
            from libpgm.graphskeleton import GraphSkeleton
            from libpgm.lgbayesiannetwork import LGBayesianNetwork
            from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
            from libpgm.pgmlearner import PGMLearner

            # LINEAR GAUSSIAN

            # generate some data to use
            nd = NodeData()
            nd.load("../tests/unittestlgdict.txt")    # an input file
            skel = GraphSkeleton()
            skel.load("../tests/unittestdict.txt")
            skel.toporder()
            lgbn = LGBayesianNetwork(skel, nd)
            data = lgbn.randomsample(8000)

            # instantiate my learner
            learner = PGMLearner()

            # learn bayesian network
            result = learner.lg_estimatebn(data)

            # output
            print json.dumps(result.E, indent=2)
            print json.dumps(result.Vdata, indent=2)

            # DISCRETE

            # generate some data to use
            nd = NodeData()
            nd.load("../tests/unittestdict.txt")    # an input file
            skel = GraphSkeleton()
            skel.load("../tests/unittestdict.txt")
            skel.toporder()
            bn = DiscreteBayesianNetwork(skel, nd)
            data = bn.randomsample(8000)

            # instantiate my learner
            learner = PGMLearner()

            # learn bayesian network
            result = learner.discrete_estimatebn(data)

            # output
            print json.dumps(result.E, indent=2)
            print json.dumps(result.Vdata, indent=2)

        '''
        assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Arg must be a list of dicts."

        # learn graph skeleton
        skel = self.lg_constraint_estimatestruct(data, pvalparam=pvalparam, bins=bins, indegree=indegree)

        # learn parameters
        bn = self.lg_mle_estimateparams(skel, data)

        # return
        return bn
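
# ----------------------------------------------------------------------------
# Minimal end-to-end usage sketch (illustrative only; not part of the library
# API). It mirrors the docstring examples above, and assumes the bundled
# sample file "../tests/unittestdict.txt" is reachable from the working
# directory and uses the vertex names ("Grade", "SAT", "Intelligence") shown
# in those examples.
# ----------------------------------------------------------------------------
if __name__ == "__main__":

    from nodedata import NodeData

    # generate synthetic discrete samples from the known test network
    nd = NodeData()
    nd.load("../tests/unittestdict.txt")
    skel = GraphSkeleton()
    skel.load("../tests/unittestdict.txt")
    skel.toporder()
    bn = DiscreteBayesianNetwork(skel, nd)
    data = bn.randomsample(8000)

    learner = PGMLearner()

    # test a single conditional independence directly
    chi, pv, witness = learner.discrete_condind(data, "Grade", "SAT", ["Intelligence"])
    print "chi = %f, pv = %f, witness = %s" % (chi, pv, witness)

    # learn structure and parameters in one call
    result = learner.discrete_estimatebn(data)
    print json.dumps(result.E, indent=2)
    print json.dumps(result.Vdata, indent=2)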