# Source code for libpgm.pgmlearner

# Copyright (c) 2012, CyberPoint International, LLC
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the CyberPoint International, LLC nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL CYBERPOINT INTERNATIONAL, LLC BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
This module provides tools to generate Bayesian networks that are "learned" from a data set. The learning process involves finding the Bayesian network that most accurately models data given as input -- in other words, finding the Bayesian network that makes the data set most likely. There are two major parts of Bayesian network learning: structure learning and parameter learning. Structure learning means finding the graph that most accurately depicts the dependencies detected in the data. Parameter learning means adjusting the parameters of the CPDs in a graph skeleton to most accurately model the data. This module has tools for both of these tasks.

'''
import copy
import json
import itertools

try:
    import numpy as np
except ImportError:
    # use the exception-call form so the guard itself is valid syntax on
    # both Python 2 and Python 3 (the old "raise E, msg" form is Py2-only)
    raise ImportError("numpy is not installed on your system.")

try:
    from scipy.stats import chisquare
except ImportError:
    raise ImportError("scipy is not installed on your system.")

from graphskeleton import GraphSkeleton
from discretebayesiannetwork import DiscreteBayesianNetwork
from lgbayesiannetwork import LGBayesianNetwork
from sampleaggregator import SampleAggregator

class PGMLearner():
    '''
    This class is a machine with tools for learning Bayesian networks from data. It contains the *discrete_mle_estimateparams*, *lg_mle_estimateparams*, *discrete_constraint_estimatestruct*, *lg_constraint_estimatestruct*, *discrete_condind*, *discrete_estimatebn*, and *lg_estimatebn* methods.

    '''
[docs]    def discrete_mle_estimateparams(self, graphskeleton, data):
'''
Estimate parameters for a discrete Bayesian network with a structure given by *graphskeleton* in order to maximize the probability of data given by *data*. This function takes the following arguments:

1. *graphskeleton* -- An instance of the :doc:GraphSkeleton <graphskeleton> class containing vertex and edge data.
2. *data* -- A list of dicts containing samples from the network in {vertex: value} format. Example::

[
{
'SAT': 'lowscore',
...
},
...
]

This function normalizes the distribution of a node's outcomes for each combination of its parents' outcomes. In doing so it creates an estimated tabular conditional probability distribution for each node. It then instantiates a :doc:DiscreteBayesianNetwork <discretebayesiannetwork> instance based on the *graphskeleton*, and modifies that instance's *Vdata* attribute to reflect the estimated CPDs. It then returns the instance.

The Vdata attribute instantiated is in the format seen in :doc:unittestdict, as described in :doc:discretebayesiannetwork.

Usage example: this would learn parameters from a set of 200 discrete samples::

import json

from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
from libpgm.pgmlearner import PGMLearner

# generate some data to use
nd = NodeData()
skel = GraphSkeleton()
skel.toporder()
bn = DiscreteBayesianNetwork(skel, nd)
data = bn.randomsample(200)

# instantiate my learner
learner = PGMLearner()

# estimate parameters from data and skeleton
result = learner.discrete_mle_estimateparams(skel, data)

# output
print json.dumps(result.Vdata, indent=2)

'''
assert (isinstance(graphskeleton, GraphSkeleton)), "First arg must be a loaded GraphSkeleton class."
assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Second arg must be a list of dicts."

# instantiate Bayesian network, and add parent and children data
bn = DiscreteBayesianNetwork()
graphskeleton.toporder()
bn.V = graphskeleton.V
bn.E = graphskeleton.E
bn.Vdata = dict()
for vertex in bn.V:
bn.Vdata[vertex] = dict()
bn.Vdata[vertex]["children"] = graphskeleton.getchildren(vertex)
bn.Vdata[vertex]["parents"] = graphskeleton.getparents(vertex)

# make placeholders for vals, cprob, and numoutcomes
bn.Vdata[vertex]["vals"] = []
if (bn.Vdata[vertex]["parents"] == []):
bn.Vdata[vertex]["cprob"] = []
else:
bn.Vdata[vertex]["cprob"] = dict()

bn.Vdata[vertex]["numoutcomes"] = 0

# determine which outcomes are possible for each node
for sample in data:
for vertex in bn.V:
if (sample[vertex] not in bn.Vdata[vertex]["vals"]):
bn.Vdata[vertex]["vals"].append(sample[vertex])
bn.Vdata[vertex]["numoutcomes"] += 1

# lay out probability tables, and put a [num, denom] entry in all spots:

# define helper function to recursively set up cprob table
def addlevel(vertex, _dict, key, depth, totaldepth):
if depth == totaldepth:
_dict[str(key)] = []
for _ in range(bn.Vdata[vertex]["numoutcomes"]):
_dict[str(key)].append([0, 0])
return
else:
for val in bn.Vdata[bn.Vdata[vertex]["parents"][depth]]["vals"]:
ckey = key[:]
ckey.append(str(val))

# put [0, 0] at each entry of cprob table
for vertex in bn.V:
if (bn.Vdata[vertex]["parents"]):
root = bn.Vdata[vertex]["cprob"]
numparents = len(bn.Vdata[vertex]["parents"])
else:
for _ in range(bn.Vdata[vertex]["numoutcomes"]):
bn.Vdata[vertex]["cprob"].append([0, 0])

# fill out entries with samples:
for sample in data:
for vertex in bn.V:

# compute index of result
rindex = bn.Vdata[vertex]["vals"].index(sample[vertex])

# go to correct place in Vdata
if bn.Vdata[vertex]["parents"]:
pvals = [str(sample[t]) for t in bn.Vdata[vertex]["parents"]]
lev = bn.Vdata[vertex]["cprob"][str(pvals)]
else:
lev = bn.Vdata[vertex]["cprob"]

# increase all denominators for the current condition
for entry in lev:
entry[1] += 1

# increase numerator for current outcome
lev[rindex][0] += 1

# convert arrays to floats
for vertex in bn.V:
if not bn.Vdata[vertex]["parents"]:
bn.Vdata[vertex]["cprob"] = [x[0]/float(x[1]) for x in bn.Vdata[vertex]["cprob"]]
else:
for key in bn.Vdata[vertex]["cprob"].keys():
try:
bn.Vdata[vertex]["cprob"][key] = [x[0]/float(x[1]) for x in bn.Vdata[vertex]["cprob"][key]]

# default to even distribution if no data points
except ZeroDivisionError:
bn.Vdata[vertex]["cprob"][key] = [1/float(bn.Vdata[vertex]["numoutcomes"]) for x in bn.Vdata[vertex]["cprob"][key]]

# return cprob table with estimated probability distributions
return bn

[docs]    def lg_mle_estimateparams(self, graphskeleton, data):
'''
Estimate parameters for a linear Gaussian Bayesian network with a structure given by *graphskeleton* in order to maximize the probability of data given by *data*. This function takes the following arguments:

1. *graphskeleton* -- An instance of the :doc:GraphSkeleton <graphskeleton> class containing vertex and edge data.
2. *data* -- A list of dicts containing samples from the network in {vertex: value} format. Example::

[
{
'Intelligence': 29.545,
...
},
...
]

The algorithm used to calculate the linear Gaussian parameters is beyond the scope of this documentation -- for a full explanation, cf. Koller et al. 729. After the parameters are calculated, the program instantiates a :doc:DiscreteBayesianNetwork <discretebayesiannetwork> instance based on the *graphskeleton*, and modifies that instance's *Vdata* attribute to reflect the estimated CPDs. It then returns the instance.

The Vdata attribute instantiated is in the format seen in the input file example :doc:unittestdict, as described in :doc:discretebayesiannetwork.

Usage example: this would learn parameters from a set of 200 linear Gaussian samples::

import json

from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.lgbayesiannetwork import LGBayesianNetwork
from libpgm.pgmlearner import PGMLearner

# generate some data to use
nd = NodeData()
skel = GraphSkeleton()
skel.toporder()
lgbn = LGBayesianNetwork(skel, nd)
data = lgbn.randomsample(200)

# instantiate my learner
learner = PGMLearner()

# estimate parameters
result = learner.lg_mle_estimateparams(skel, data)

# output
print json.dumps(result.Vdata, indent=2)

'''
assert (isinstance(graphskeleton, GraphSkeleton)), "First arg must be a loaded GraphSkeleton class."
assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Second arg must be a list of dicts."

# instantiate Bayesian network, and add parent and children data
bn = LGBayesianNetwork()
graphskeleton.toporder()
bn.V = graphskeleton.V
bn.E = graphskeleton.E
bn.Vdata = dict()
for vertex in bn.V:
bn.Vdata[vertex] = dict()
bn.Vdata[vertex]["children"] = graphskeleton.getchildren(vertex)
bn.Vdata[vertex]["parents"] = graphskeleton.getparents(vertex)

# make placeholders for mean_base, mean_scal, and variance
bn.Vdata[vertex]["mean_base"] = 0.0
bn.Vdata[vertex]["mean_scal"] = []
for parent in bn.Vdata[vertex]["parents"]:
bn.Vdata[vertex]["mean_scal"].append(0.0)
bn.Vdata[vertex]["variance"] = 0.0

# make covariance table, array of E[X_i] for each vertex, and table
# of E[X_i * X_j] for each combination of vertices
cov = [[0 for _ in range(len(bn.V))] for __ in range(len(bn.V))]
singletons = [0 for _ in range(len(bn.V))]
numtrials = len(data)
for sample in data:
for x in range(len(bn.V)):
singletons[x] += sample[bn.V[x]]
for y in range(len(bn.V)):
cov[x][y] += sample[bn.V[x]] * sample[bn.V[y]]

for x in range(len(bn.V)):
singletons[x] /= float(numtrials)
for y in range(len(bn.V)):
cov[x][y] /= float(numtrials)

# (save copy. this is the E[X_i * X_j] table)
product_expectations = [[cov[x][y] for y in range(len(bn.V))] for x in range(len(bn.V))]

for x in range(len(bn.V)):
for y in range(len(bn.V)):
cov[x][y] = cov[x][y] - (singletons[x] * singletons[y])

# construct system of equations and solve (for each node)
for x in range(len(bn.V)):

system = [[product_expectations[p][q] for q in range(len(bn.V))] for p in range(len(bn.V))]

# step 0: remove all entries from all the tables except for node and its parents
rowstokeep = [x]
for z in range(len(bn.V)):
if bn.V[z] in bn.Vdata[bn.V[x]]["parents"]:
rowstokeep.append(z)
smalldim = len(rowstokeep)
smallsystem = [[0 for _ in range(smalldim)] for __ in range(smalldim)]
smallcov = [[0 for _ in range(smalldim)] for __ in range(smalldim)]
smallsing = [0 for _ in range(smalldim)]
for index in range(len(rowstokeep)):
smallsing[index] = singletons[rowstokeep[index]]
for index2 in range(len(rowstokeep)):
smallsystem[index][index2] = system[rowstokeep[index]][rowstokeep[index2]]
smallcov[index][index2] = cov[rowstokeep[index]][rowstokeep[index2]]

# step 1: delete and copy row corresponding to node (using [row][column] notation)
tmparray = [0 for _ in range(smalldim)]
for y in range(smalldim):
if (y > 0):
for j in range(smalldim):
smallsystem[y-1][j] = smallsystem[y][j]
if (y == 0):
for j in range(smalldim):
tmparray[j] = smallsystem[y][j]

# step 2: delete column, leaving system with all entries
# corresponding to parents of node
for y in range(smalldim):
if (y > 0):
for j in range(smalldim):
smallsystem[j][y-1] = smallsystem[j][y]

# step 3: take entry for node out of singleton array and store it
bordarray = []
for y in range(smalldim):
if (y != 0):
bordarray.append(smallsing[y])
else:
tmpentry = smallsing[y]

# step 4: add border array on borders of system
for y in range(len(bordarray)):
smallsystem[smalldim - 1][y] = bordarray[y]
smallsystem[y][smalldim - 1] = bordarray[y]
smallsystem[smalldim - 1][smalldim - 1] = 1

# step 5: construct equality vector (the 'b' of ax = b)
evector = [0 for _ in range(smalldim)]
for y in range(smalldim):
if (y != smalldim - 1):
evector[y] = tmparray[y + 1]
else:
evector[y] = tmpentry

# use numpy to solve
a = np.array(smallsystem)
b = np.array(evector)
solve = list(np.linalg.solve(a, b))

# fill mean_base and mean_scal[] with this data
bn.Vdata[bn.V[x]]["mean_base"] = solve[smalldim - 1]
for i in range(smalldim - 1):
bn.Vdata[bn.V[x]]["mean_scal"][i] = solve[i]

variance = smallcov[0][0]
for y in range(1, smalldim):
for z in range(1, smalldim):
variance -= (bn.Vdata[bn.V[x]]["mean_scal"][y-1] * bn.Vdata[bn.V[x]]["mean_scal"][z-1] * smallcov[y][z])
bn.Vdata[bn.V[x]]["variance"] = variance

# that's all folks
return bn

[docs]    def discrete_constraint_estimatestruct(self, data, pvalparam=0.05, indegree=1):
'''
Learn a Bayesian network structure from discrete data given by *data*, using constraint-based approaches. This function first calculates all the independencies and conditional independencies present between variables in the data. To calculate dependencies, it uses the *discrete_condind* method on each pair of variables, conditioned on other sets of variables of size *indegree* or smaller, to generate a chi-squared result and a p-value. If this p-value is less than *pvalparam*, the pair of variables are considered dependent conditioned on the variable set. Once all true dependencies -- pairs of variables that are dependent no matter what they are conditioned by -- are found, the algorithm uses these dependencies to construct a directed acyclic graph. It returns this DAG in the form of a :doc:GraphSkeleton <graphskeleton> class.

Arguments:
1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

[
{
'SAT': 'lowscore',
...
},
...
]

2. *pvalparam* -- (Optional, default is 0.05) The p-value below which to consider something significantly unlikely. A common number used is 0.05. This is passed to *discrete_condind* when it is called.
3. *indegree* -- (Optional, default is 1) The upper bound on the size of a witness set (see Koller et al. 85). If this is larger than 1, a huge amount of samples in *data* are required to avoid a divide-by-zero error.

Usage example: this would learn structure from a set of 8000 discrete samples::

import json

from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
from libpgm.pgmlearner import PGMLearner

# generate some data to use
nd = NodeData()
skel = GraphSkeleton()
skel.toporder()
bn = DiscreteBayesianNetwork(skel, nd)
data = bn.randomsample(8000)

# instantiate my learner
learner = PGMLearner()

# estimate structure
result = learner.discrete_constraint_estimatestruct(data)

# output
print json.dumps(result.E, indent=2)

'''
assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Arg must be a list of dicts."

# instantiate array of variables and array of potential dependencies
variables = data[0].keys()
ovariables = variables[:]
dependencies = []
for x in variables:
ovariables.remove(x)
for y in ovariables:
if (x != y):
dependencies.append([x, y])

# define helper function to find subsets
def subsets(array):
result = []
for i in range(indegree + 1):
comb = itertools.combinations(array, i)
for c in comb:
result.append(list(c))
return result

witnesses = []
othervariables = variables[:]

# for each pair of variables X, Y:
for X in variables:
othervariables.remove(X)
for Y in othervariables:

# consider all sets of witnesses that do not have X or Y in
# them, and are less than or equal to the size specified by
# the "indegree" argument
for U in subsets(variables):
if (X not in U) and (Y not in U) and len(U) <= indegree:

# determine conditional independence
chi, pv, witness = self.discrete_condind(data, X, Y, U)
if pv > pvalparam:
msg = "***%s and %s are found independent (chi = %f, pv = %f) with witness %s***" % (X, Y, chi, pv, U)
try:
dependencies.remove([X, Y])
dependencies.remove([Y, X])
except:
pass
witnesses.append([X, Y, witness])
break

# now that we have found our dependencies, run build PDAG (cf. Koller p. 89)
# with the stored set of independencies:

# assemble undirected graph skeleton
pdag = GraphSkeleton()
pdag.E = dependencies
pdag.V = variables

# adjust for immoralities (cf. Koller 86)
dedges = [x[:] for x in pdag.E]
for edge in dedges:
edge.append('u')

for edge1 in reversed(dedges):
for edge2 in reversed(dedges):
if (edge1 in dedges) and (edge2 in dedges):
if edge1[0] == edge2[1]:
if (([edge1[1], edge2[0], [edge1[0]]] not in witnesses) and ([edge2[0], edge1[1], [edge1[0]]] not in witnesses)):
dedges.append([edge1[1], edge1[0]])
dedges.append([edge2[0], edge2[1]])
dedges.remove(edge1)
dedges.remove(edge2)
elif edge1[1] == edge2[0]:
if (([edge1[0], edge2[1], [edge1[1]]] not in witnesses) and ([edge2[1], edge1[0], [edge1[1]]] not in witnesses)):
dedges.append([edge1[0], edge1[1]])
dedges.append([edge2[1], edge2[0]])
dedges.remove(edge1)
dedges.remove(edge2)
elif edge1[1] == edge2[1] and edge1[0] != edge2[0]:
if (([edge1[0], edge2[0], [edge1[1]]] not in witnesses) and ([edge2[0], edge1[0], [edge1[1]]] not in witnesses)):
dedges.append([edge1[0], edge1[1]])
dedges.append([edge2[0], edge2[1]])
dedges.remove(edge1)
dedges.remove(edge2)
elif edge1[0] == edge2[0] and edge1[1] != edge2[1]:
if (([edge1[1], edge2[1], [edge1[0]]] not in witnesses) and ([edge2[1], edge1[1], [edge1[0]]] not in witnesses)):
dedges.append([edge1[1], edge1[0]])
dedges.append([edge2[1], edge2[0]])
dedges.remove(edge1)
dedges.remove(edge2)

# define helper method "exists_undirected_edge"
def exists_undirected_edge(one_end, the_other_end):
for edge in dedges:
if len(edge) == 3:
if (edge[0] == one_end and edge[1] == the_other_end):
return True
elif (edge[1] == one_end and edge[0] == the_other_end):
return True
return False

# use right hand rules to improve graph until convergence (Koller 89)
olddedges = []
while (olddedges != dedges):
olddedges = [x[:] for x in dedges]
for edge1 in reversed(dedges):
for edge2 in reversed(dedges):

# rule 1
inverted = False
check1, check2 = False, True
if (edge1[1] == edge2[0] and len(edge1) == 2 and len(edge2) == 3):
check1 = True
elif (edge1[1] == edge2[1] and len(edge1) == 2 and len(edge2) == 3):
check = True
inverted = True
for edge3 in dedges:
if edge3 != edge1 and ((edge3[0] == edge1[0] and edge3[1]
== edge2[1]) or (edge3[1] == edge1[0] and edge3[0]
== edge2[1])):
check2 = False
if check1 == True and check2 == True:
if inverted:
dedges.append([edge1[1], edge2[0]])
else:
dedges.append([edge1[1], edge2[1]])
dedges.remove(edge2)

# rule 2
check1, check2 = False, False
if (edge1[1] == edge2[0] and len(edge1) == 2 and len(edge2) == 2):
check1 = True
for edge3 in dedges:
if ((edge3[0] == edge1[0] and edge3[1]
== edge2[1]) or (edge3[1] == edge1[0] and edge3[0]
== edge2[1]) and len(edge3) == 3):
check2 = True
if check1 == True and check2 == True:
if edge3[0] == edge1[0]:
dedges.append([edge3[0], edge3[1]])
elif edge3[1] == edge1[0]:
dedges.append([edge3[1], edge3[0]])
dedges.remove(edge3)

# rule 3
check1, check2 = False, False
if len(edge1) == 2 and len(edge2) == 2:
if (edge1[1] == edge2[1] and edge1[0] != edge2[0]):
check1 = True
for v in variables:
if (exists_undirected_edge(v, edge1[0]) and
exists_undirected_edge(v, edge1[1]) and
exists_undirected_edge(v, edge2[0])):
check2 = True
if check1 == True and check2 == True:
dedges.append([v, edge1[1]])
for edge3 in dedges:
if (len(edge3) == 3 and ((edge3[0] == v and edge3[1]
== edge1[1]) or (edge3[1] == v and edge3[0] ==
edge1[1]))):
dedges.remove(edge3)

# return one possible graph skeleton from the pdag class found
for x in range(len(dedges)):
if len(dedges[x]) == 3:
dedges[x] = dedges[x][:2]

pdag.E = dedges
pdag.toporder()
return pdag

[docs]    def lg_constraint_estimatestruct(self, data, pvalparam=0.05, bins=10, indegree=1):
'''
Learn a Bayesian network structure from linear Gaussian data given by *data* using constraint-based approaches. This function works by discretizing the linear Gaussian data into *bins* number of bins, and running the *discrete_constraint_estimatestruct* method on that discrete data with *pvalparam* and *indegree* as arguments. It returns the :doc:GraphSkeleton <graphskeleton> instance returned by this function.

Arguments:
1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

[
{
'SAT': 56.33,
...
},
...
]

2. *pvalparam* -- (Optional, default is 0.05) The p-value below which to consider something significantly unlikely. A common number used is 0.05
3. *bins* -- (Optional, default is 10) The number of bins to discretize the data into. The method is to find the highest and lowest value, divide that interval uniformly into a certain number of bins, and place the data inside. This number must be chosen carefully in light of the number of trials. There must be at least 5 trials in every bin, with more if the indegree is increased.
4. *indegree* -- (Optional, default is 1) The upper bound on the size of a witness set (see Koller et al. 85). If this is larger than 1, a huge amount of trials are required to avoid a divide-by-zero error.

The number of bins and indegree must be chosen carefully based on the size and nature of the data set. Too many bins will lead to not enough data per bin, while too few bins will lead to dependencies not getting noticed.

Usage example: this would learn structure from a set of 8000 linear Gaussian samples::

import json

from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.lgbayesiannetwork import LGBayesianNetwork
from libpgm.pgmlearner import PGMLearner

# generate some data to use
nd = NodeData()
skel = GraphSkeleton()
skel.toporder()
lgbn = LGBayesianNetwork(skel, nd)
data = lgbn.randomsample(8000)

# instantiate my learner
learner = PGMLearner()

# estimate structure
result = learner.lg_constraint_estimatestruct(data)

# output
print json.dumps(result.E, indent=2)

'''
assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Arg must be a list of dicts."
cdata = copy.deepcopy(data)

# establish ranges
ranges = dict()
for variable in cdata[0].keys():
ranges[variable] = [float("infinity"), float("infinity") * -1]
for sample in cdata:
for var in sample.keys():
if sample[var] < ranges[var][0]:
ranges[var][0] = sample[var]
if sample[var] > ranges[var][1]:
ranges[var][1] = sample[var]

# discretize cdata set
bincounts = dict()
for key in cdata[0].keys():
bincounts[key] = [0 for _ in range(bins)]
for sample in cdata:
for i in range(bins):
for var in sample.keys():
if (sample[var] >= (ranges[var][0] + (ranges[var][1] - ranges[var][0]) * i / float(bins)) and (sample[var] <= (ranges[var][0] + (ranges[var][1] - ranges[var][0]) * (i + 1) / float(bins)))):
sample[var] = i
bincounts[var][i] += 1

# run discrete_constraint_estimatestruct
return self.discrete_constraint_estimatestruct(cdata, pvalparam, indegree)

[docs]    def discrete_condind(self, data, X, Y, U):
'''
Test how independent a variable *X* and a variable *Y* are in a discrete data set given by *data*, where the independence is conditioned on a set of variables given by *U*. This method works by assuming as a null hypothesis that the variables are conditionally independent on *U*, and thus that:

.. math::

P(X, Y, U) = P(U) \\cdot P(X|U) \\cdot P(Y|U)

It tests the deviance of the data from this null hypothesis, returning the result of a chi-square test and a p-value.

Arguments:
1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

[
{
'SAT': 'lowscore',
...
},
...
]
2. *X* -- A variable whose dependence on Y we are testing given U.
3. *Y* -- A variable whose dependence on X we are testing given U.
4. *U* -- A list of variables that are given.

Returns:
1. *chi* -- The result of the chi-squared test on the data. This is a
measure of the deviance of the actual distribution of X and
Y given U from the expected distribution of X and Y given U.
Since the null hypothesis is that X and Y are independent
given U, the expected distribution is that :math:P(X, Y, U) =
P(U) P(X | U) P (Y | U).
2. *pval* -- The p-value of the test, meaning the probability of
attaining a chi-square result as extreme as or more extreme
than the one found, assuming that the null hypothesis is
true. (e.g., a p-value of .05 means that if X and Y were
independent given U, the chance of getting a chi-squared
result this high or higher are .05)
3. *U* -- The 'witness' of X and Y's independence. This is the variable
that, when it is known, leaves X and Y independent.

'''
# find possible outcomes and store
_outcomes = dict()
for key in data[0].keys():
_outcomes[key] = [data[0][key]]
for sample in data:
for key in _outcomes.keys():
if _outcomes[key].count(sample[key]) == 0:
_outcomes[key].append(sample[key])

# store number of outcomes for X, Y, and U
Xnumoutcomes = len(_outcomes[X])
Ynumoutcomes = len(_outcomes[Y])
Unumoutcomes = []
for val in U:
Unumoutcomes.append(len(_outcomes[val]))

# calculate P(U) -- the distribution of U
PU = 1

# define helper function to add a dimension to an array recursively
if isinstance(mdarray, list):
for h in range(len(mdarray)):
return mdarray
else:
mdarray = [0 for _ in range(size)]
return mdarray

# make PU the right size
for size in Unumoutcomes:

# fill with data
if (len(U) > 0):
for sample in data:
tmp = PU
for x in range(len(U)-1):
Uindex = _outcomes[U[x]].index(sample[U[x]])
tmp = tmp[Uindex]
lastindex = _outcomes[U[-1]].index(sample[U[-1]])
tmp[lastindex] += 1

# calculate P(X, U) -- the distribution of X and U
PXandU = [0 for _ in range(Xnumoutcomes)]
for size in Unumoutcomes:

for sample in data:
Xindex = _outcomes[X].index(sample[X])
if len(U) > 0:
tmp = PXandU[Xindex]
for x in range(len(U)-1):
Uindex = _outcomes[U[x]].index(sample[U[x]])
tmp = tmp[Uindex]
lastindex = _outcomes[U[-1]].index(sample[U[-1]])
tmp[lastindex] += 1
else:
PXandU[Xindex] += 1

# calculate P(Y, U) -- the distribution of Y and U
PYandU = [0 for _ in range(Ynumoutcomes)]
for size in Unumoutcomes:
for sample in data:
Yindex = _outcomes[Y].index(sample[Y])
if len(U) > 0:
tmp = PYandU[Yindex]
for x in range(len(U)-1):
Uindex = _outcomes[U[x]].index(sample[U[x]])
tmp = tmp[Uindex]
lastindex = _outcomes[U[-1]].index(sample[U[-1]])
tmp[lastindex] += 1
else:
PYandU[Yindex] += 1

# assemble P(U)P(X|U)P(Y|U) -- the expected distribution if X and Y are
# independent given U.
expected = [[ 0 for _ in range(Ynumoutcomes)] for __ in range(Xnumoutcomes)]

# define helper function to multiply the entries of two matrices
def multiply_entries(matrixa, matrixb):
matrix1 = copy.deepcopy(matrixa)
matrix2 = copy.deepcopy(matrixb)
if isinstance(matrix1, list):
for h in range(len(matrix1)):
matrix1[h] = multiply_entries(matrix1[h], matrix2[h])
return matrix1
else:
return (matrix1 * matrix2)

# define helper function to divide the entries of two matrices
def divide_entries(matrixa, matrixb):
matrix1 = copy.deepcopy(matrixa)
matrix2 = copy.deepcopy(matrixb)
if isinstance(matrix1, list):
for h in range(len(matrix1)):
matrix1[h] = divide_entries(matrix1[h], matrix2[h])
return matrix1
else:
return (matrix1 / float(matrix2))

# combine known graphs to calculate P(U)P(X|U)P(Y|U)
for x in range(Xnumoutcomes):
for y in range(Ynumoutcomes):
product = multiply_entries(PXandU[x], PYandU[y])
final = divide_entries(product, PU)
expected[x][y] = final

# find P(XYU) -- the actual distribution of X, Y, and U -- in sample
PXYU = [[ 0 for _ in range(Ynumoutcomes)] for __ in range(Xnumoutcomes)]
for size in Unumoutcomes:

for sample in data:
Xindex = _outcomes[X].index(sample[X])
Yindex = _outcomes[Y].index(sample[Y])
if len(U) > 0:
tmp = PXYU[Xindex][Yindex]
for x in range(len(U)-1):
Uindex = _outcomes[U[x]].index(sample[U[x]])
tmp = tmp[Uindex]
lastindex = _outcomes[U[-1]].index(sample[U[-1]])
tmp[lastindex] += 1
else:
PXYU[Xindex][Yindex] += 1

# use scipy's chisquare to determine the deviance of the evidence
a = np.array(expected)
a = a.flatten()
b = np.array(PXYU)
b = b.flatten()

# delete entries with value 0 (they mess up the chisquare function)
for i in reversed(range(b.size)):
if (b[i] == 0):
if i != 0:
a.itemset(i-1, a[i-1]+a[i])
a = np.delete(a, i)
b = np.delete(b, i)

# run chi-squared
chi, pv = chisquare(a, b)

# return chi-squared result, p-value for that result, and witness
return chi, pv, U

[docs]    def discrete_estimatebn(self, data, pvalparam=.05, indegree=1):
'''
Fully learn a Bayesian network from discrete data given by *data*. This function combines the *discrete_constraint_estimatestruct* method (where it passes in the *pvalparam* and *indegree* arguments) with the *discrete_mle_estimateparams* method. It returns a complete :doc:DiscreteBayesianNetwork <discretebayesiannetwork> class instance learned from the data.

Arguments:
1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

[
{
'SAT': 'lowscore',
...
},
...
]
2. *pvalparam* -- The p-value below which to consider something significantly unlikely. A common number used is 0.05
3. *indegree* -- The upper bound on the size of a witness set (see Koller et al. 85). If this is larger than 1, a huge amount of trials are required to avoid a divide-by- zero error.

'''
assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Arg must be a list of dicts."

# learn graph skeleton
skel = self.discrete_constraint_estimatestruct(data, pvalparam=pvalparam, indegree=indegree)

# learn parameters
bn = self.discrete_mle_estimateparams(skel, data)

# return
return bn

[docs]    def lg_estimatebn(self, data, pvalparam=.05, bins=10, indegree=1):
'''
Fully learn a Bayesian network from linear Gaussian data given by *data*. This function combines the *lg_constraint_estimatestruct* method (where it passes in the *pvalparam*, *bins*, and *indegree* arguments) with the *lg_mle_estimateparams* method. It returns a complete :doc:LGBayesianNetwork <discretebayesiannetwork> class instance learned from the data.

Arguments:
1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

[
{
'SAT': 873.42342,
...
},
...
]
2. *pvalparam* -- The p-value below which to consider something significantly unlikely. A common number used is 0.05
3. *indegree* -- The upper bound on the size of a witness set (see Koller et al. 85). If this is larger than 1, a huge amount of trials are required to avoid a divide-by- zero error.

Usage example: this would learn entire Bayesian networks from sets of 8000 data points::

import json

from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.lgbayesiannetwork import LGBayesianNetwork
from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
from libpgm.pgmlearner import PGMLearner

# LINEAR GAUSSIAN

# generate some data to use
nd = NodeData()
skel = GraphSkeleton()
skel.toporder()
lgbn = LGBayesianNetwork(skel, nd)
data = lgbn.randomsample(8000)

# instantiate my learner
learner = PGMLearner()

# learn bayesian network
result = learner.lg_estimatebn(data)

# output
print json.dumps(result.E, indent=2)
print json.dumps(result.Vdata, indent=2)

# DISCRETE

# generate some data to use
nd = NodeData()
skel = GraphSkeleton()
skel.toporder()
bn = DiscreteBayesianNetwork(skel, nd)
data = bn.randomsample(8000)

# instantiate my learner
learner = PGMLearner()

# learn bayesian network
result = learner.discrete_estimatebn(data)

# output
print json.dumps(result.E, indent=2)
print json.dumps(result.Vdata, indent=2)

'''
assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Arg must be a list of dicts."

# learn graph skeleton
skel = self.lg_constraint_estimatestruct(data, pvalparam=pvalparam, bins=bins, indegree=indegree)

# learn parameters
bn = self.lg_mle_estimateparams(skel, data)

# return
return bn