Source code for ibmdbpy.learn.association_rules

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

"""
In-Database Association Rules Mining
"""
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from builtins import dict
from future import standard_library
standard_library.install_aliases()

import ibmdbpy
from ibmdbpy.exceptions import IdaAssociationRulesError
import six

#----------------------------------------------------------------------
# AssociationRules class
[docs]class AssociationRules(object): """ Association rules mining can be used to discover interesting and useful relations between items in a large-scale transaction table. You can identify strong rules between related items by using different measures of relevance. Apriori or FP-Growth are well-known algorithms for association rules mining. For analytic stored procedures, the PrefixSpan algorithm is preferred due to its scalability. The AssociationRules class provides an interface for using the ASSOCRULES amd PREDICT_ASSOCRULES IDAX methods of dashDB/DB2. """
[docs] def __init__(self, modelname = None, minsupport = None, maxlen = 5, maxheadlen = 1, minconf = 0.5): """ Constructor for association rules model Parameters ---------- modelname : str The name of the Association Rules model that is built. If the parameter corresponds to an existing model in the database, it will be replaced during the fitting step. minsupport : float or integer, optional The minimum fraction (0.0 - 1.0) or the minimum number (above 1) of transactions that must contain a pattern to be considered as frequent. Default: system-determined Range: >0.0 and <1.0 for a minimum fraction >1 for a minimum number of transactions. maxlen : int, optional, >=2, default: 5 The maximum length of a pattern or a rule, that is, the maximum number of items per pattern or rule. maxheadlen : int, optional, >= 1 and <maxlen, default: 1 The maximum length of a rule head, that is, the maximum number of items that might belong to the item set on the right side of a rule. Increasing this value might significantly increase the number of detected rules. minconf : float, optional, >=0.0 and <= 1, default: 0.5 The minimum confidence that a rule must achieve to be kept in the model of the pattern. Attributes ---------- TODO Returns ------- The AssociationRules object, ready to be used for fitting and prediction Examples -------- >>> idadb = IdaDataBase("BLUDB-TEST") >>> idadf = IdaDataFrame(idadb, "GROCERIES") >>> arules = AssociationRules("ASSOCRULES_TEST") >>> arules.fit(idadf, transaction_id = "TID", item_id = "SID") Notes ----- Inner parameters of the model can be printed and modified by using get_params and set_params. But we recommend creating a new AssociationRules model instead of modifying it. """ # Get set at fit step self._idadf = None self._idadb = None self._transaction_id = None self._item_id = None self.nametable = None self.namecol = None # Get set at predict step self.outtable = None self.type = None self.limit = None self.sort = None self.modelname = modelname self.minsupport = minsupport self.maxlen = maxlen self.maxheadlen = maxheadlen self.minconf = minconf
[docs] def get_params(self): """ Return the parameters of the Association Rules model. """ params = dict() params['modelname'] = self.modelname params['minsupport'] = self.minsupport params['maxlen'] = self.maxlen params['maxheadlen'] = self.maxheadlen params['minconf'] = self.minconf params['nametable'] = self.nametable params['namecol'] = self.namecol params['outtable'] = self.outtable params['type'] = self.type params['limit'] = self.limit params['sort'] = self.sort return params
[docs] def set_params(self, **params): """ Modify the parameters of the Association Rules model. """ if not params: # Simple optimisation to gain speed (inspect is slow) return self valid_params = self.get_params() for key, value in six.iteritems(params): if key not in valid_params: raise ValueError('Invalid parameter %s for estimator %s' % (key, self.__class__.__name__)) setattr(self, key, value) return self
[docs] def fit(self, idadf, transaction_id, item_id, nametable=None, namecol=None, verbose=False): """ Create an Association Rules model from an IdaDataFrame. Parameters ---------- idadf : IdaDataFrame The IdaDataFrame to be used as input. transaction_id : str The column of the input table that identifies the transaction ID. item_id : str The column of the input table that identifies an item of the transaction. nametable : str, optional The table that contains a mapping of the items in the input table and their names. The table must contain at least two columns, where * The first column has the same name as the column that is contained in the item parameter of the input table * The second column has the same name as the name that is defined in the namecol parameter namecol : str, optional The column that contains the item name that is defined in the nametable parameter. You cannot specify this parameter if the nametable parameter is not specified. """ if not type(idadf).__name__ == 'IdaDataFrame': raise TypeError("Argument should be an IdaDataFrame") if transaction_id not in idadf.columns: raise ValueError("transaction_id is not a column in " + idadf.name) if item_id is None: raise ValueError("item_id cannot be None (should be a column in " + idadf.name + ")") if item_id not in idadf.columns: raise ValueError("item_id is not a column in " + idadf.name) idadf._idadb._check_procedure("ASSOCRULES", "Association Rules") # Check the ID if transaction_id not in idadf.columns: raise ValueError("Transaction id column"+ transaction_id +" is not available in IdaDataFrame:" ) self._idadb = idadf._idadb self._idadf = idadf self._transaction_id = transaction_id self._item_id = item_id self.nametable = nametable self.namecol = namecol # Check or create a model name if self.modelname is None: self.modelname = idadf._idadb._get_valid_modelname('ASSOCRULES_') else: self.modelname = ibmdbpy.utils.check_tablename(self.modelname) if idadf._idadb.exists_model(self.modelname): idadf._idadb.drop_model(self.modelname) # Create a temporay view idadf.internal_state._create_view() tmp_view_name = idadf.internal_state.current_state if "." in tmp_view_name: tmp_view_name = tmp_view_name.split('.')[-1] try: idadf._idadb._call_stored_procedure("IDAX.ASSOCRULES ", model = self.modelname, intable = tmp_view_name, tid = transaction_id, item = item_id, minsupport = self.minsupport, maxlen = self.maxlen, maxheadlen = self.maxheadlen, minconf = self.minconf, nametable = self.nametable, namecol = self.namecol) except: raise finally: idadf.internal_state._delete_view() idadf.commit() self._retrieve_AssociationRules_Model(self.modelname, verbose) return
[docs] def prune(self, itemsin = None, itemsout = None, minlen = 1, maxlen = None, minsupport = 0, maxsupport = 1, minlift = None, maxlift = None, minconf = None, maxconf = None, reset = False): """ Prune the rules and patterns of an association rules model. To remove rules and pattern which you are not interested in, you can use filters to exclude these rules and patterns. These rules and patterns are then marked as not valid in the model and are no longer shown. Parameters ---------- itemsin : str or list, optional A list of item names that must be contained in the rules or patterns to be kept. The items are separated by semicolons. At least one of the listed items must be contained in a rule or pattern to be kept. For rules, the following conditions apply: * To indicate that the item must be contained in the head of then rule, the item names can be succeeded by :h or :head. * To indicate that the item must be contained in the body of the rule, the item names can be succeeded by :b or :body If this parameter is not specified, no constraint is applied. itemsout : str or list, optional A list of item names that must not be contained in the rules or patterns to be kept. The items are separated by semicolons. If this parameter is not specified, no constraint is applied. minlen : int, optional, >=1, default: 1 The minimum number of items that are to be kept in the rules or patterns. maxlen : int, optional, >=1, default: the longest pattern of the model The maximum number of items that are to be kept in the rules or patterns. minsupport : float, optional, >=0.0 and <=maxsupport, default : 0 The minimum support for the rules or patterns that are to be kept. maxsupport : float, optional, >=minsupport and <=1.0, default : 1 The maximum support for the rules or patterns that are to be kept. minlift : float, optional, >=0.0 and <=maxlift, defaukt : 0 The minimum lift of the rules or patterns that are to be kept. maxlift : float, optional, >=minlift, default: the maximum lift of the patterns of the model The maximum lift of the rules or patterns that are to be kept. minconf : float, optional, >=0.0 and <= maxconf, default : 0 The minimum confidence of the rules that are to be kept. maxconf : float, optional, >=minconf and <= 1.0, default : 1 The maximum confidence of the rules that are to be kept. reset : bool, optional, default: false If you specify reset=true, all rules and patterns are first reset to not pruned. If you specify reset=true or reset=false, the rules and patterns that are not to be kept are marked as pruned. """ self._idadf._idadb._check_procedure("PRUNE_ASSOCRULES", "Pruning for Association Rules") if isinstance(itemsin, list): itemsin = ";".join(itemsin) try: self._idadf._idadb._call_stored_procedure("IDAX.PRUNE_ASSOCRULES ", model = self.modelname, itemsin = itemsin, itemsout = itemsout, minlen = minlen, maxlen = maxlen, minsupport = minsupport, maxsupport = maxsupport, minlift = minlift, maxlift = maxlift, minconf = minconf, maxconf = maxconf, reset = reset) except: raise else: return
[docs] def predict(self, idadf, outtable=None, transaction_id=None, item_id=None, type="rules", limit=1, sort=None): """ Apply the rules and patterns of an association rules model to other transactions. You can apply all rules or only specific rules according to specified criteria. Parameters ---------- idadf : IdaDataFrame IdaDataFrame to be used as input. outtable : str, optional The name of the output table in which the mapping between the input sequences and the associated rules or patterns is written. If the parameter corresponds to an existing table in the database, it is replaced. transaction_id : str, optional The column of the input table that identifies the transaction ID. By default, this is the same tid column that is specified in the stored procedure to build the model. item_id : str, optional The column of the input table that identifies an item of the transaction. By default, this is the same item column that is specified in the stored procedure to build the model. type : str, optional, default : "rules" The type of information that is written in the output table. The following values are possible: ‘rules’ and ‘patterns’. limit : int, optional, >=1, default: 1 The maximum number of rules or patterns that is written in the output table for each input sequence. sort : str or list, optional A list of keywords that indicates the order in which the rules or patterns are written in the output table. The order of the list is descending. The items are separated by semicolons. The following values are possible: ‘support’, ‘confidence’, ‘lift’, and ‘length’. The ‘confidence’ value can only be specified if the type parameter is ‘rules’. If the type parameter is ‘rules’, the default is: support;confidence;length. If the type parameter is ‘patterns’, the default is: support;lift;length. Notes ----- When "type" is set to "rules", it looks like nothing is returned. """ if not isinstance(idadf, ibmdbpy.IdaDataFrame): raise TypeError("Argument should be an IdaDataFrame") if sort is not None: sort = ';'.join(sort) if transaction_id is None: transaction_id = self.transaction_id if item_id is None: item_id = self.item_id # Check the ID if transaction_id not in idadf.columns: raise ValueError("Transaction id column"+ transaction_id +" is not available in IdaDataFrame." ) if self._idadb is None: raise IdaAssociationRulesError("No Association rules model was trained before.") # The version where we don't replace the outtable if it exists but raise an exception #if outtable is not None: # if idadf._idadb.exists_table(outtable): # raise ValueError("Table "+ outtable +" already exists.") #else: # outtable = idadf._idadb._get_valid_modelname('PREDICT_ASSOCRULES_') if self.outtable is None: self.outtable = idadf._idadb._get_valid_tablename('ASSOCRULES_') else: self.outtable = ibmdbpy.utils.check_tablename(self.outtable) if idadf._idadb.exists_table(self.outtable): idadf._idadb.drop_table(self.outtable) self.outtable = outtable self.type = type self.limit = limit self.sort = sort # Create a temporay view idadf.internal_state._create_view() tmp_view_name = idadf.internal_state.current_state if "." in tmp_view_name: tmp_view_name = tmp_view_name.split('.')[-1] try: idadf._idadb._call_stored_procedure("IDAX.PREDICT_ASSOCRULES ", model = self.modelname, intable = tmp_view_name, outtable = outtable, tid = transaction_id, item = item_id, type = type, limit = limit, sort = sort ) except: raise finally: idadf.internal_state._delete_view() idadf._cursor.commit() self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, outtable) return self.labels_
[docs] def fit_predict(self, idadf, transaction_id, item_id, nametable=None, namecol=None, outtable=None, type="rules", limit=1, sort=None, verbose=False): """ Convenience function for fitting the model and using it to make predictions about the same dataset. See the fit and predict documentation for an explanation about their attributes. Notes ----- If you use this function, you are not able to use the prune step between the fit and the predict step. However, you can still prune afterwards and reuse the predict function. """ #TODO: Is it relevant ? Result of predict on the same dataset looks empty #for type "rules" self.fit(idadf, transaction_id, item_id, nametable, namecol, verbose) return self.predict(idadf, outtable, transaction_id, item_id, type, limit, sort)
[docs] def describe(self): """ Return a description of Association Rules Model. """ if self._idadb is None: return self.get_params else: try: # Not sure it is useful res = self._idadb._call_stored_procedure("IDAX.PRINT_MODEL ", model = self.modelname) self._retrieve_AssociationRules_Model(self.modelname, verbose=True) except: raise else: print(res) return
[docs] def _retrieve_AssociationRules_Model(self, modelname, verbose = False): """ Retrieve information about the model to print the results. The Association Rules IDAX function stores its result in 4 tables: * <MODELNAME>_ASSOCPATTERNS * <MODELNAME>_ASSOCPATTERNS_STATISTICS * <MODELNAME>_ASSOCRULES * <MODELNAME>_ITEMS Parameters ---------- modelname : str The name of the model that is retrieved. verbose : bol, default: False Verbosity mode. Notes ----- Needs better formatting instead of printing the tables """ modelname = ibmdbpy.utils.check_tablename(modelname) if self._idadb is None: raise IdaAssociationRulesError("No Association rules model was trained before.") # Note: The name of the columns in hardcoded, this is done so as a # workaround for some bug in a specific ODBC linux driver. # In case the implementation of the IDA method changes, this may break # But still would not be difficult to fix assocpatterns = self._idadb.ida_query('SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_ASSOCPATTERNS"') assocpatterns.columns = ["ITEMSETID","ITEMID"] assocpatterns.columns = [x.upper() for x in assocpatterns.columns] assocpatterns_stats = self._idadb.ida_query('SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_ASSOCPATTERNS_STATISTICS"') assocpatterns_stats = ["ITEMSETID" , "LENGTH" , "COUNT" , "SUPPORT" , "LIFT" ,"PRUNED"] assocpatterns_stats.columns = [x.upper() for x in assocpatterns_stats.columns] assocrules = self._idadb.ida_query('SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_ASSOCRULES"') assocrules.columns = ["RULEID", "ITEMSETID", "BODYID", "HEADID", "CONFIDENCE", "PRUNED"] assocrules.columns = [x.upper() for x in assocrules.columns] items = self._idadb.ida_query('SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_ITEMS"') items.columns = ["ITEMID","ITEM","ITEMNAME","COUNT","SUPPORT"] items.columns = [x.upper() for x in items.columns] if verbose is True: print("assocpatterns") print(assocpatterns) print("assocpatterns_stats") print(assocpatterns_stats) print("assocrules") print(assocrules) print("items") print(items) return