#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
In-Database Naive Bayes modeling and prediction.
Copies the interface of sklearn.naive_bayes
"""
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from builtins import dict
from builtins import str
from future import standard_library
standard_library.install_aliases()
from lazy import lazy
import ibmdbpy
from ibmdbpy.exceptions import IdaNaiveBayesError
import six
[docs]class NaiveBayes(object):
"""
The Naive Bayes classification algorithm is a probabilistic classifier.
It is based on probability models that incorporate strong independence
assumptions. These independence assumptions often do not hold in
reality, which is why they are considered naive.
The NaiveBayes class provides an interface for using the NAIVEBAYES
and PREDICT_NAIVEBAYES IDAX methods of dashDB/DB2.
"""
def __init__(self, modelname=None, disc=None, bins=None):
    """
    Build an untrained NaiveBayes model object.

    Nothing is sent to the database here; the constructor only records
    the settings and initialises every fit/predict attribute to None.

    Parameters
    ----------
    modelname : str, optional
        Name of the Naive Bayes model to build. When omitted, a name is
        generated during fit. If a model with that name already exists
        in the database, it is dropped and replaced during fit.
    disc : str, optional
        Automatic discretization applied to all continuous attributes.
        Allowed values are ef, em, ew and ewn:

        * ef  : equal-frequency discretization (unsupervised, interval
          bounds chosen by the equal frequency criterion).
        * em  : minimal entropy discretization (interval bounds chosen
          by the minimal entropy criterion).
        * ew  : equal-width discretization (interval bounds chosen by
          the equal width criterion).
        * ewn : equal-width discretization with nice bucket limits.

        The Python default is None, in which case the value is passed
        on unchanged; the procedure is documented to use ew then.
    bins : int, optional
        Number of bins for numeric columns. The Python default is None;
        the procedure is documented to use 10 then.

    Examples
    --------
    >>> idadb = IdaDataBase("BLUDB-TEST")
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> bayes = NaiveBayes("NAIVEBAYES_TEST")
    >>> bayes.fit(idadf, column_id="ID", target="species")
    >>> bayes.predict(idadf, outtable="IRIS_PREDICTION", outtableProb="IRIS_PREDICTIONPROB")

    Notes
    -----
    The stored settings can be inspected with get_params and changed
    with set_params, but creating a new NaiveBayes object is preferred
    over modifying an existing one.
    """
    # Filled in by fit(): the training frame, its connection and id column.
    self._idadf = self._idadb = self._column_id = None
    # Filled in by fit(): arguments forwarded to the NAIVEBAYES procedure.
    self.target = self.incolumn = None
    self.coldeftype = self.coldefrole = self.colpropertiestable = None

    # Filled in by predict().
    self.outtable = self.outtableProb = self.mestimation = None

    # Settings supplied by the caller.
    self.modelname = modelname
    self.disc = disc
    self.bins = bins
@lazy
def labels_(self):
    """
    Return the labels of the classification if available.

    The value is obtained by calling predict on the IdaDataFrame and
    id column stored on the instance (both are set by fit). The
    ``lazy`` decorator presumably caches the result after the first
    access -- confirm against the ``lazy`` package documentation.

    Raises
    ------
    AttributeError
        If the prediction fails for any reason, for example because the
        model has not been fitted yet.
    """
    try:
        return self.predict(self._idadf, self._column_id)
    except Exception:
        # Only ordinary errors are translated; a bare ``except`` would
        # also convert KeyboardInterrupt/SystemExit into AttributeError.
        raise AttributeError(str(self.__class__) + " object has no attribute 'labels_'")
def get_params(self):
    """
    Return the parameters of the Naive Bayes model.

    Returns
    -------
    dict
        Maps each public parameter name (constructor settings plus the
        values recorded by fit and predict) to its current value.
    """
    # Only a snapshot of the attributes is built here; the stray debug
    # ``print(dir(self))`` was removed because it wrote to stdout on every
    # call, including the calls made by set_params.
    names = (
        'modelname', 'disc', 'bins',
        'target', 'incolumn', 'coldeftype', 'coldefrole', 'colpropertiestable',
        'outtable', 'outtableProb', 'mestimation',
    )
    return dict((name, getattr(self, name)) for name in names)
def set_params(self, **params):
    """
    Modify the parameters of the Naive Bayes model.

    Parameters
    ----------
    **params
        Parameter names and their new values. Only names returned by
        get_params are accepted.

    Returns
    -------
    self
        The model itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If any name is not a known parameter. In that case no
        attribute is modified.
    """
    if not params:
        # Nothing to do; skip the get_params call.
        return self

    valid_params = self.get_params()

    # Validate every key before touching the object so that a bad key
    # cannot leave the model partially updated.
    for key in params:
        if key not in valid_params:
            raise ValueError('Invalid parameter %s for estimator %s' %
                             (key, self.__class__.__name__))

    for key in params:
        setattr(self, key, params[key])
    return self
def fit(self, idadf, target, column_id="ID", incolumn=None, coldeftype=None,
        coldefrole=None, colpropertiestable=None, verbose=False):
    """
    Create a Naive Bayes model from an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame
        The IdaDataFrame to be used as input.
    target : str
        The column of the input table that represents the class.
    column_id : str, default: "ID"
        The column of the input table that identifies the transaction ID.
    incolumn : str, optional
        The columns of the input table that have specific properties,
        separated by a semi-colon (;). Each column is succeeded by one
        or more of the following properties:

        * By type nominal (':nom') or by type continuous (':cont').
          By default, numerical types are continuous, and all other
          types are nominal.
        * By role ':id', ':target', ':input', or ':ignore'.

        If this parameter is not specified, all columns of the input
        table have default properties.
    coldeftype : str, optional
        The default type of the input table columns. Allowed values are
        'nom' and 'cont'. If not specified, numeric columns are
        continuous and all other columns are nominal.
    coldefrole : str, optional
        The default role of the input table columns. Allowed values are
        'input' and 'ignore'. If not specified, all columns are input
        columns.
    colpropertiestable : str, optional
        The table where the properties of the input table columns are
        stored. If not specified, the column properties are detected
        automatically.
    verbose : bool, default: False
        Verbosity mode. When True, the model tables are printed once
        after training.

    Raises
    ------
    TypeError
        If idadf is not an IdaDataFrame.
    ValueError
        If target or column_id is not a column of idadf.
    """
    # Some basic checks
    if not isinstance(idadf, ibmdbpy.IdaDataFrame):
        raise TypeError("Argument should be an IdaDataFrame")
    if target not in idadf.columns:
        raise ValueError("Target is not a column in " + idadf.name)

    idadf._idadb._check_procedure("NAIVEBAYES", "Naive Bayes")

    # Check the ID
    if column_id not in idadf.columns:
        raise ValueError("No id columns is available in IdaDataFrame:" + column_id +
                         ". Either create a new ID column using add_column_id function" +
                         " or give the name of a column that can be used as ID")

    # Remember the training context; predict and labels_ rely on it.
    self._idadb = idadf._idadb
    self._idadf = idadf
    self._column_id = column_id
    self.target = target
    self.incolumn = incolumn
    self.coldeftype = coldeftype
    self.coldefrole = coldefrole
    self.colpropertiestable = colpropertiestable

    # Check or create a model name, drop it if it already exists.
    if self.modelname is None:
        self.modelname = idadf._idadb._get_valid_modelname('NAIVEBAYES_')
    else:
        self.modelname = ibmdbpy.utils.check_tablename(self.modelname)
        if idadf._idadb.exists_model(self.modelname):
            idadf._idadb.drop_model(self.modelname)

    # Create a temporary view
    # TODO: Why do we need actually to create a view ?
    idadf.internal_state._create_view()
    tmp_view_name = idadf.internal_state.current_state

    # The view name is passed to the procedure without its schema prefix.
    if "." in tmp_view_name:
        tmp_view_name = tmp_view_name.split('.')[-1]

    try:
        idadf._idadb._call_stored_procedure("IDAX.NAIVEBAYES ",
                                            model = self.modelname,
                                            intable = tmp_view_name,
                                            id = self._column_id,
                                            target = self.target,
                                            incolumn = self.incolumn,
                                            coldeftype = self.coldeftype,
                                            coldefrole = self.coldefrole,
                                            colPropertiesTable = self.colpropertiestable,
                                            disc = self.disc,
                                            bins = self.bins)
    finally:
        # Always remove the temporary view, even when the procedure fails;
        # any exception still propagates to the caller.
        idadf.internal_state._delete_view()
        idadf.commit()

    # This call already prints the model tables when verbose is True, so
    # no additional describe() call is made (it would query and print the
    # same tables a second time).
    self._retrieve_NaiveBayes_Model(self.modelname, verbose)
def predict(self, idadf, column_id=None, outtable=None, outtableProb=None,
            mestimation = False):
    """
    Use the Naive Bayes predict stored procedure to apply a Naive Bayes
    model to generate classification predictions for a data set.

    Parameters
    ----------
    idadf : IdaDataFrame
        IdaDataFrame to be used as input.
    column_id : str, optional
        The column of the input table that identifies a unique instance
        ID. By default, the id column that was given to fit.
    outtable : str, optional
        The name of the output table where the predictions are stored.
        If not specified, it is generated automatically. If it
        corresponds to an existing table in the database, that table
        is dropped and replaced.
    outtableProb : str, optional
        The output table where the probabilities for each of the
        classes are stored. If not specified, the table is not created.
        If it corresponds to an existing table in the database, that
        table is dropped and replaced.
    mestimation : flag, default: False
        A flag that indicates the use of m-estimation for
        probabilities. This kind of estimation might be slower than
        other ones, but it might produce better results for small or
        unbalanced data sets.

    Returns
    -------
    IdaDataFrame
        IdaDataFrame opened on the output table, containing the
        classification decision for each datapoint referenced by its ID.

    Raises
    ------
    TypeError
        If idadf is not an IdaDataFrame.
    IdaNaiveBayesError
        If the model has not been trained with fit.
    ValueError
        If the id column is not a column of idadf.
    """
    if not isinstance(idadf, ibmdbpy.IdaDataFrame):
        raise TypeError("Argument should be an IdaDataFrame")

    idadf._idadb._check_procedure("PREDICT_NAIVEBAYES", "Prediction for Naive Bayes")

    # The trained-state check must come before the id-column fallback:
    # on an untrained model self._column_id is None, and building the
    # ValueError message below would fail with a TypeError instead of
    # reporting the real problem.
    if self._idadb is None:
        raise IdaNaiveBayesError("The Naive Bayes model was not trained before.")

    # Check the ID
    if column_id is None:
        column_id = self._column_id
    if column_id not in idadf.columns:
        raise ValueError("No id columns is available in IdaDataFrame:" + column_id +
                         ". Either create a new ID column using add_column_id function" +
                         " or give the name of a column that can be used as ID")

    # Check or create an outtable name, drop it if it already exists.
    if outtable is None:
        outtable = idadf._idadb._get_valid_tablename('PREDICT_NAIVEBAYES_')
    else:
        outtable = ibmdbpy.utils.check_tablename(outtable)
        if idadf._idadb.exists_table(outtable):
            idadf._idadb.drop_table(outtable)

    if outtableProb is not None:
        outtableProb = ibmdbpy.utils.check_tablename(outtableProb)
        if idadf._idadb.exists_table(outtableProb):
            idadf._idadb.drop_table(outtableProb)

    self.outtable = outtable
    self.outtableProb = outtableProb
    self.mestimation = mestimation

    # Create a temporary view
    idadf.internal_state._create_view()
    tmp_view_name = idadf.internal_state.current_state

    # The view name is passed to the procedure without its schema prefix.
    if "." in tmp_view_name:
        tmp_view_name = tmp_view_name.split('.')[-1]

    try:
        idadf._idadb._call_stored_procedure("IDAX.PREDICT_NAIVEBAYES ",
                                            model = self.modelname,
                                            intable = tmp_view_name,
                                            id = column_id,
                                            outtable = self.outtable,
                                            outtableProb = self.outtableProb,
                                            mestimation = self.mestimation
                                            )
    finally:
        # Always remove the temporary view, even when the procedure fails;
        # any exception still propagates to the caller.
        idadf.internal_state._delete_view()
        idadf._idadb._autocommit()

    # Store the result on the instance so labels_ reflects the latest prediction.
    self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, self.outtable)
    return self.labels_
def fit_predict(self, idadf, column_id="ID", incolumn=None, coldeftype=None,
                coldefrole=None, colprepertiesTable=None, outtable=None,
                outtableProb=None, mestimation=False, verbose=False,
                target=None):
    """
    Convenience function for fitting the model and using it to make
    predictions about the same dataset. See the fit and predict
    documentation for an explanation of the shared parameters.

    Parameters
    ----------
    target : str, optional
        The column of the input table that represents the class. It is
        a trailing keyword so that the existing parameter order is
        unchanged. When omitted, the ``target`` attribute already
        stored on the model is used.
    colprepertiesTable : str, optional
        Forwarded to fit as ``colpropertiestable`` (the misspelt
        parameter name is kept for backward compatibility).

    Returns
    -------
    The value returned by predict.

    Raises
    ------
    ValueError
        If no target is given and none is stored on the model.
    """
    if target is None:
        target = self.target
    if target is None:
        raise ValueError("fit_predict requires a target column: pass target=... "
                         "or set the 'target' attribute of the model first")

    # Forward by keyword: fit takes ``target`` as its second positional
    # parameter, so positional forwarding would shift every argument.
    self.fit(idadf, target, column_id=column_id, incolumn=incolumn,
             coldeftype=coldeftype, coldefrole=coldefrole,
             colpropertiestable=colprepertiesTable, verbose=verbose)
    return self.predict(idadf, column_id=column_id, outtable=outtable,
                        outtableProb=outtableProb, mestimation=mestimation)
def describe(self):
    """
    Return a description of the Naive Bayes model.

    Returns
    -------
    dict or None
        For a model that has not been trained yet, the parameter
        dictionary produced by get_params. For a trained model, the
        model tables are printed and None is returned.
    """
    if self._idadb is None:
        # Call get_params; returning the bound method itself would hand
        # the caller a function object instead of the parameters.
        return self.get_params()
    self._retrieve_NaiveBayes_Model(self.modelname, verbose=True)
def _retrieve_NaiveBayes_Model(self, modelname, verbose = False):
    """
    Query the tables written by the Naive Bayes IDAX function and
    optionally print them. Two tables in the current schema are read:

        * <MODELNAME>_MODEL
        * <MODELNAME>_DISCRANGES

    Parameters
    ----------
    modelname : str
        The name of the model that is retrieved.
    verbose : bool, default: False
        Verbosity mode. The tables are printed only when this is
        exactly True.

    Raises
    ------
    IdaNaiveBayesError
        If the model has not been trained (no database connection is
        stored on the instance).

    Notes
    -----
    Needs better formatting instead of printing the tables.
    """
    modelname = ibmdbpy.utils.check_tablename(modelname)

    if self._idadb is None:
        raise IdaNaiveBayesError("The Naive Bayes model was not trained before.")

    def _fetch(table_tail, headers):
        # Read one model table and relabel its columns (upper-cased).
        frame = self._idadb.ida_query('SELECT * FROM "' +
                                      self._idadb.current_schema + '"."' + modelname + table_tail)
        frame.columns = headers
        frame.columns = [label.upper() for label in frame.columns]
        return frame

    main_table = _fetch('_MODEL"',
                        ['ATTRIBUTE', 'VAL', 'CLASS', 'CLASSVALCOUNT', 'ATTRCLASSCOUNT',
                         'CLASSCOUNT', 'TOTALCOUNT'])
    disc_table = _fetch('_DISCRANGES"', ['COLNAME', 'BREAK'])

    if verbose is True:
        for title, table in (("MODEL", main_table), ("DISCRANGES", disc_table)):
            print(title)
            print(table)
    return