#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
In-Database Association Rules Mining
"""
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from builtins import dict
from future import standard_library
standard_library.install_aliases()
import ibmdbpy
from ibmdbpy.exceptions import IdaAssociationRulesError
import six
#----------------------------------------------------------------------
# AssociationRules class
[docs]class AssociationRules(object):
"""
Association rules mining can be used to discover interesting and useful
relations between items in a large-scale transaction table. You can
identify strong rules between related items by using different measures of
relevance. Apriori or FP-Growth are well-known algorithms for association
rules mining. For analytic stored procedures, the PrefixSpan algorithm is
preferred due to its scalability.
The AssociationRules class provides an interface for using the
ASSOCRULES amd PREDICT_ASSOCRULES IDAX methods of dashDB/DB2.
"""
[docs] def __init__(self, modelname = None, minsupport = None, maxlen = 5, maxheadlen = 1, minconf = 0.5):
"""
Constructor for association rules model
Parameters
----------
modelname : str
The name of the Association Rules model that is built. If the
parameter corresponds to an existing model in the database, it
will be replaced during the fitting step.
minsupport : float or integer, optional
The minimum fraction (0.0 - 1.0) or the minimum number (above 1) of
transactions that must contain a pattern to be considered as frequent.
Default: system-determined
Range: >0.0 and <1.0 for a minimum fraction
>1 for a minimum number of transactions.
maxlen : int, optional, >=2, default: 5
The maximum length of a pattern or a rule, that is,
the maximum number of items per pattern or rule.
maxheadlen : int, optional, >= 1 and <maxlen, default: 1
The maximum length of a rule head, that is, the maximum number of
items that might belong to the item set on the right side of a rule.
Increasing this value might significantly increase the number of detected rules.
minconf : float, optional, >=0.0 and <= 1, default: 0.5
The minimum confidence that a rule must achieve to be kept in the model of the pattern.
Attributes
----------
TODO
Returns
-------
The AssociationRules object, ready to be used for fitting and prediction
Examples
--------
>>> idadb = IdaDataBase("BLUDB-TEST")
>>> idadf = IdaDataFrame(idadb, "GROCERIES")
>>> arules = AssociationRules("ASSOCRULES_TEST")
>>> arules.fit(idadf, transaction_id = "TID", item_id = "SID")
Notes
-----
Inner parameters of the model can be printed and modified by using
get_params and set_params. But we recommend creating a new
AssociationRules model instead of modifying it.
"""
# Get set at fit step
self._idadf = None
self._idadb = None
self._transaction_id = None
self._item_id = None
self.nametable = None
self.namecol = None
# Get set at predict step
self.outtable = None
self.type = None
self.limit = None
self.sort = None
self.modelname = modelname
self.minsupport = minsupport
self.maxlen = maxlen
self.maxheadlen = maxheadlen
self.minconf = minconf
[docs] def get_params(self):
"""
Return the parameters of the Association Rules model.
"""
params = dict()
params['modelname'] = self.modelname
params['minsupport'] = self.minsupport
params['maxlen'] = self.maxlen
params['maxheadlen'] = self.maxheadlen
params['minconf'] = self.minconf
params['nametable'] = self.nametable
params['namecol'] = self.namecol
params['outtable'] = self.outtable
params['type'] = self.type
params['limit'] = self.limit
params['sort'] = self.sort
return params
[docs] def set_params(self, **params):
"""
Modify the parameters of the Association Rules model.
"""
if not params:
# Simple optimisation to gain speed (inspect is slow)
return self
valid_params = self.get_params()
for key, value in six.iteritems(params):
if key not in valid_params:
raise ValueError('Invalid parameter %s for estimator %s' %
(key, self.__class__.__name__))
setattr(self, key, value)
return self
[docs] def fit(self, idadf, transaction_id, item_id, nametable=None, namecol=None, verbose=False):
"""
Create an Association Rules model from an IdaDataFrame.
Parameters
----------
idadf : IdaDataFrame
The IdaDataFrame to be used as input.
transaction_id : str
The column of the input table that identifies the transaction ID.
item_id : str
The column of the input table that identifies an item of the transaction.
nametable : str, optional
The table that contains a mapping of the items in the input table and their names.
The table must contain at least two columns, where
* The first column has the same name as the column that is
contained in the item parameter of the input table
* The second column has the same name as the name that is
defined in the namecol parameter
namecol : str, optional
The column that contains the item name that is defined in the
nametable parameter. You cannot specify this parameter if the
nametable parameter is not specified.
"""
if not type(idadf).__name__ == 'IdaDataFrame':
raise TypeError("Argument should be an IdaDataFrame")
if transaction_id not in idadf.columns:
raise ValueError("transaction_id is not a column in " + idadf.name)
if item_id is None:
raise ValueError("item_id cannot be None (should be a column in " + idadf.name + ")")
if item_id not in idadf.columns:
raise ValueError("item_id is not a column in " + idadf.name)
idadf._idadb._check_procedure("ASSOCRULES", "Association Rules")
# Check the ID
if transaction_id not in idadf.columns:
raise ValueError("Transaction id column"+ transaction_id +" is not available in IdaDataFrame:" )
self._idadb = idadf._idadb
self._idadf = idadf
self._transaction_id = transaction_id
self._item_id = item_id
self.nametable = nametable
self.namecol = namecol
# Check or create a model name
if self.modelname is None:
self.modelname = idadf._idadb._get_valid_modelname('ASSOCRULES_')
else:
self.modelname = ibmdbpy.utils.check_tablename(self.modelname)
if idadf._idadb.exists_model(self.modelname):
idadf._idadb.drop_model(self.modelname)
# Create a temporay view
idadf.internal_state._create_view()
tmp_view_name = idadf.internal_state.current_state
if "." in tmp_view_name:
tmp_view_name = tmp_view_name.split('.')[-1]
try:
idadf._idadb._call_stored_procedure("IDAX.ASSOCRULES ",
model = self.modelname,
intable = tmp_view_name,
tid = transaction_id,
item = item_id,
minsupport = self.minsupport,
maxlen = self.maxlen,
maxheadlen = self.maxheadlen,
minconf = self.minconf,
nametable = self.nametable,
namecol = self.namecol)
except:
raise
finally:
idadf.internal_state._delete_view()
idadf.commit()
self._retrieve_AssociationRules_Model(self.modelname, verbose)
return
[docs] def prune(self, itemsin = None, itemsout = None, minlen = 1, maxlen = None,
minsupport = 0, maxsupport = 1, minlift = None, maxlift = None,
minconf = None, maxconf = None, reset = False):
"""
Prune the rules and patterns of an association rules model. To remove
rules and pattern which you are not interested in, you can use filters
to exclude these rules and patterns. These rules and patterns are then
marked as not valid in the model and are no longer shown.
Parameters
----------
itemsin : str or list, optional
A list of item names that must be contained in the rules or
patterns to be kept. The items are separated by semicolons. At
least one of the listed items must be contained in a rule or
pattern to be kept.
For rules, the following conditions apply:
* To indicate that the item must be contained in the head of
then rule, the item names can be succeeded by :h or :head.
* To indicate that the item must be contained in the body of
the rule, the item names can be succeeded by :b or :body
If this parameter is not specified, no constraint is applied.
itemsout : str or list, optional
A list of item names that must not be contained in the rules or
patterns to be kept. The items are separated by semicolons.
If this parameter is not specified, no constraint is applied.
minlen : int, optional, >=1, default: 1
The minimum number of items that are to be kept in the rules or
patterns.
maxlen : int, optional, >=1, default: the longest pattern of the model
The maximum number of items that are to be kept in the rules or
patterns.
minsupport : float, optional, >=0.0 and <=maxsupport, default : 0
The minimum support for the rules or patterns that are to be kept.
maxsupport : float, optional, >=minsupport and <=1.0, default : 1
The maximum support for the rules or patterns that are to be kept.
minlift : float, optional, >=0.0 and <=maxlift, defaukt : 0
The minimum lift of the rules or patterns that are to be kept.
maxlift : float, optional, >=minlift, default: the maximum lift of the patterns of the model
The maximum lift of the rules or patterns that are to be kept.
minconf : float, optional, >=0.0 and <= maxconf, default : 0
The minimum confidence of the rules that are to be kept.
maxconf : float, optional, >=minconf and <= 1.0, default : 1
The maximum confidence of the rules that are to be kept.
reset : bool, optional, default: false
If you specify reset=true, all rules and patterns are first reset
to not pruned.
If you specify reset=true or reset=false, the rules and patterns
that are not to be kept are marked as pruned.
"""
self._idadf._idadb._check_procedure("PRUNE_ASSOCRULES", "Pruning for Association Rules")
if isinstance(itemsin, list):
itemsin = ";".join(itemsin)
try:
self._idadf._idadb._call_stored_procedure("IDAX.PRUNE_ASSOCRULES ",
model = self.modelname,
itemsin = itemsin,
itemsout = itemsout,
minlen = minlen,
maxlen = maxlen,
minsupport = minsupport,
maxsupport = maxsupport,
minlift = minlift,
maxlift = maxlift,
minconf = minconf,
maxconf = maxconf,
reset = reset)
except:
raise
else:
return
[docs] def predict(self, idadf, outtable=None, transaction_id=None, item_id=None,
type="rules", limit=1, sort=None):
"""
Apply the rules and patterns of an association rules model to other
transactions. You can apply all rules or only specific rules according
to specified criteria.
Parameters
----------
idadf : IdaDataFrame
IdaDataFrame to be used as input.
outtable : str, optional
The name of the output table in which the mapping between the input
sequences and the associated rules or patterns is written. If the
parameter corresponds to an existing table in the database, it is
replaced.
transaction_id : str, optional
The column of the input table that identifies the transaction ID.
By default, this is the same tid column that is specified in the
stored procedure to build the model.
item_id : str, optional
The column of the input table that identifies an item of the
transaction. By default, this is the same item column that is
specified in the stored procedure to build the model.
type : str, optional, default : "rules"
The type of information that is written in the output table. The
following values are possible: ‘rules’ and ‘patterns’.
limit : int, optional, >=1, default: 1
The maximum number of rules or patterns that is written in the
output table for each input sequence.
sort : str or list, optional
A list of keywords that indicates the order in which the rules or
patterns are written in the output table. The order of the list is
descending. The items are separated by semicolons. The following
values are possible: ‘support’, ‘confidence’, ‘lift’, and ‘length’.
The ‘confidence’ value can only be specified if the type parameter
is ‘rules’. If the type parameter is ‘rules’, the default is:
support;confidence;length. If the type parameter is ‘patterns’,
the default is: support;lift;length.
Notes
-----
When "type" is set to "rules", it looks like nothing is returned.
"""
if not isinstance(idadf, ibmdbpy.IdaDataFrame):
raise TypeError("Argument should be an IdaDataFrame")
if sort is not None:
sort = ';'.join(sort)
if transaction_id is None:
transaction_id = self.transaction_id
if item_id is None:
item_id = self.item_id
# Check the ID
if transaction_id not in idadf.columns:
raise ValueError("Transaction id column"+ transaction_id +" is not available in IdaDataFrame." )
if self._idadb is None:
raise IdaAssociationRulesError("No Association rules model was trained before.")
# The version where we don't replace the outtable if it exists but raise an exception
#if outtable is not None:
# if idadf._idadb.exists_table(outtable):
# raise ValueError("Table "+ outtable +" already exists.")
#else:
# outtable = idadf._idadb._get_valid_modelname('PREDICT_ASSOCRULES_')
if self.outtable is None:
self.outtable = idadf._idadb._get_valid_tablename('ASSOCRULES_')
else:
self.outtable = ibmdbpy.utils.check_tablename(self.outtable)
if idadf._idadb.exists_table(self.outtable):
idadf._idadb.drop_table(self.outtable)
self.outtable = outtable
self.type = type
self.limit = limit
self.sort = sort
# Create a temporay view
idadf.internal_state._create_view()
tmp_view_name = idadf.internal_state.current_state
if "." in tmp_view_name:
tmp_view_name = tmp_view_name.split('.')[-1]
try:
idadf._idadb._call_stored_procedure("IDAX.PREDICT_ASSOCRULES ",
model = self.modelname,
intable = tmp_view_name,
outtable = outtable,
tid = transaction_id,
item = item_id,
type = type,
limit = limit,
sort = sort
)
except:
raise
finally:
idadf.internal_state._delete_view()
idadf._cursor.commit()
self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, outtable)
return self.labels_
[docs] def fit_predict(self, idadf, transaction_id, item_id, nametable=None,
namecol=None, outtable=None, type="rules", limit=1,
sort=None, verbose=False):
"""
Convenience function for fitting the model and using it to make
predictions about the same dataset. See the fit and predict
documentation for an explanation about their attributes.
Notes
-----
If you use this function, you are not able to use the prune step
between the fit and the predict step. However, you can still prune
afterwards and reuse the predict function.
"""
#TODO: Is it relevant ? Result of predict on the same dataset looks empty
#for type "rules"
self.fit(idadf, transaction_id, item_id, nametable, namecol, verbose)
return self.predict(idadf, outtable, transaction_id, item_id, type, limit, sort)
[docs] def describe(self):
"""
Return a description of Association Rules Model.
"""
if self._idadb is None:
return self.get_params
else:
try:
# Not sure it is useful
res = self._idadb._call_stored_procedure("IDAX.PRINT_MODEL ", model = self.modelname)
self._retrieve_AssociationRules_Model(self.modelname, verbose=True)
except:
raise
else:
print(res)
return
[docs] def _retrieve_AssociationRules_Model(self, modelname, verbose = False):
"""
Retrieve information about the model to print the results. The
Association Rules IDAX function stores its result in 4 tables:
* <MODELNAME>_ASSOCPATTERNS
* <MODELNAME>_ASSOCPATTERNS_STATISTICS
* <MODELNAME>_ASSOCRULES
* <MODELNAME>_ITEMS
Parameters
----------
modelname : str
The name of the model that is retrieved.
verbose : bol, default: False
Verbosity mode.
Notes
-----
Needs better formatting instead of printing the tables
"""
modelname = ibmdbpy.utils.check_tablename(modelname)
if self._idadb is None:
raise IdaAssociationRulesError("No Association rules model was trained before.")
# Note: The name of the columns in hardcoded, this is done so as a
# workaround for some bug in a specific ODBC linux driver.
# In case the implementation of the IDA method changes, this may break
# But still would not be difficult to fix
assocpatterns = self._idadb.ida_query('SELECT * FROM "' +
self._idadb.current_schema + '"."' + modelname + '_ASSOCPATTERNS"')
assocpatterns.columns = ["ITEMSETID","ITEMID"]
assocpatterns.columns = [x.upper() for x in assocpatterns.columns]
assocpatterns_stats = self._idadb.ida_query('SELECT * FROM "' +
self._idadb.current_schema + '"."' + modelname + '_ASSOCPATTERNS_STATISTICS"')
assocpatterns_stats = ["ITEMSETID" , "LENGTH" , "COUNT" , "SUPPORT" , "LIFT" ,"PRUNED"]
assocpatterns_stats.columns = [x.upper() for x in assocpatterns_stats.columns]
assocrules = self._idadb.ida_query('SELECT * FROM "' +
self._idadb.current_schema + '"."' + modelname + '_ASSOCRULES"')
assocrules.columns = ["RULEID", "ITEMSETID", "BODYID", "HEADID", "CONFIDENCE", "PRUNED"]
assocrules.columns = [x.upper() for x in assocrules.columns]
items = self._idadb.ida_query('SELECT * FROM "' +
self._idadb.current_schema + '"."' + modelname + '_ITEMS"')
items.columns = ["ITEMID","ITEM","ITEMNAME","COUNT","SUPPORT"]
items.columns = [x.upper() for x in items.columns]
if verbose is True:
print("assocpatterns")
print(assocpatterns)
print("assocpatterns_stats")
print(assocpatterns_stats)
print("assocrules")
print(assocrules)
print("items")
print(items)
return