# Source code for ibmdbpy.learn.kmeans

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

"""
In-Database K-means. Copies the interface of sklearn.cluster.KMeans
"""
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from builtins import int
from builtins import dict
from builtins import str
from future import standard_library
standard_library.install_aliases()

from lazy import lazy
import pandas as pd

import ibmdbpy
from ibmdbpy import IdaDataFrame
from ibmdbpy.exceptions import IdaKMeansError
import six

class KMeans(object):
    """
    The K-means algorithm is the most widely used clustering algorithm that 
    uses an explicit distance measure to partition the data set into clusters.

    The K-means algorithm represents each cluster by the vector of the mean 
    attribute values of all training instances - for numeric attributes - and 
    by the vector of modal (most frequent) values - for nominal attributes - 
    that are assigned to that cluster. This cluster representation is called 
    cluster center.


    The KMeans class provides an interface for using the KMEANS
    and PREDICT_KMEANS IDAX methods of dashDB/DB2.
    """
[docs]    def __init__(self, n_clusters=3, modelname = None, max_iter = 5, distance = "euclidean",
                 random_state = 12345, idbased = False, statistics = None):
        """ 
        Constructor for K-means clustering.

        Parameters
        ----------

        n_cluster : int, optional, default: 3
            The number of cluster centers.
            Range : > 2

        modelname : str, optional
            The name of the clustering model that is built. If it is not given, 
            it is generated automatically. If the parameter corresponds to an 
            existing model in the database, it is replaced during the fitting 
            step.


        max_iter : int, > 1 and <= 1000, default = 5
            The maximum number of iterations.

        distance : str, default: "euclidean"
             The distance function. The following values are allowed: “euclidean” and “norm_euclidean”.


        random_state : int, default: 12345
            The random seed of the generator.

        idbased : bool, optional, default: False
            Specifies that the random seed of the generator is based on the value of the ID column.

        statistics : str, optional
            Indicates the statistics that are collected.

            The following values are allowed: ‘none’, ‘columns’, ‘values:n’, and ‘all’:
                * If statistics='none' is specified, no statistics are collected.
                * If statistics='columns' is specified, statistics on the columns of the input table are collected, for example, mean values.
                * If statistics='values:n' is specified, and if n is a positive number, statistics on the columns and the column values are collected.
                    Up to <n> column value statistics are collected.
                        * If a nominal column contains more than <n> values, only the <n> most frequent column statistics are kept.
                        * If a numeric column contains more than <n> values, the values are discretized, and the statistics are collected on the discretized values.
                * statistics=all is identical to statistics=values:100.

        Attributes
        ----------
        centers
            TODO

        cluster_centers_
            TODO

        withinss
            TODO

        size_clusters
            TODO

        inertia_
            TODO


        Returns
        -------
            The KMeans object, ready to be used for fitting and prediction

        Examples
        --------
        >>> idadb = IdaDataBase("DASHDB")
        >>> idadf = IdaDataFrame(idadb, "IRIS", indexer = "ID")
        >>> kmeans = KMeans(3) # clustering with 3 clusters
        >>> kmeans.fit(idadf)
        >>> kmeans.predict(idadf)

        Notes
        -----
        Inner parameters of the model can be printed and modified by using 
        get_params and set_params. But we recommend creating a new KMeans model 
        instead of modifying it.

        """
        self.modelname = modelname
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.distance = distance
        self.random_state = random_state
        self.idbased = idbased
        self.statistics = statistics

        # Get set at fit step
        self._idadb = None
        self._idadf = None
        self._column_id = None
        self.incolumn = None
        self.coldeftype = None
        self.coldefrole = None
        self.colPropertiesTable = None

        # Get set at predict step
        self.outtable = None

    @lazy
    def labels_(self):
        """
        Return the corresponding labels for each ID.

        The value is obtained by calling predict on the IdaDataFrame and ID
        column that were remembered by the last call to fit.

        Raises
        ------
        AttributeError
            If the prediction cannot be made, for example because no model
            was fitted yet.
        """
        try:
            return self.predict(self._idadf, self._column_id)
        except Exception:
            # Only ordinary errors are translated; KeyboardInterrupt and
            # SystemExit must propagate unchanged.
            raise AttributeError(str(self.__class__) + " object has no attribute 'labels_'")
[docs]    def get_params(self):
        """
        Return the parameters of the K-means clustering.
        """
        params = dict()
        params['modelname'] = self.modelname
        params['n_clusters'] = self.n_clusters
        params['max_iter'] = self.max_iter
        params['distance'] = self.distance
        params['random_state'] = self.random_state
        params['idbased'] = self.idbased
        params['statistics'] = self.statistics

        params['incolumn'] = self.incolumn
        params['coldeftype'] = self.coldeftype
        params['coldefrole'] = self.coldefrole
        params['colPropertiesTable'] = self.colPropertiesTable

        params['outtable'] = self.outtable
        return params

[docs]    def set_params(self, **params):
        """
        Change the parameters of the K-means clustering.
        """
        if not params:
            # Simple optimisation to gain speed (inspect is slow)
            return self
        valid_params = self.get_params()
        for key, value in six.iteritems(params):
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s' %
                                     (key, self.__class__.__name__))
            setattr(self, key, value)
        return self

[docs]    def fit(self, idadf, column_id="ID", incolumn=None, coldeftype=None,
            coldefrole=None, colPropertiesTable=None, verbose=False):
        """
        Use the KMEANS stored procedure to build a K-means clustering model 
        that clusters the input data into k centers.


        Parameters
        ----------
        idadf : IdaDataFrame
            The name of the input IdaDataFrame.

        column_id : str, default: "ID"
            The column of the input IdaDataFrame that identifies a unique
            instance ID.

        incolumn : dict, optional
            The columns of the input table that have specific properties, which
            are separated by a semi-colon (;).
            Each column is succeeded by one or more of the following properties:
                * By type nominal (':nom') or by type continuous (':cont'). By default, numerical types are continuous, and all other types nominal.
                * By role ':id', ':target', ':input', or ':ignore'.

            If this parameter is not specified, all columns of the input table
            have default properties.

        coldeftype : dict, optional
            The default type of the input table columns. The following values 
            are allowed: ‘nom’ and ‘cont’. If the parameter is not specified, 
            numeric columns are continuous and all other columns are nominal.


        coldefrole : dict, optional
            The default role of the input table columns. The following values 
            are allowed: ‘input’ and ‘ignore’. If the parameter is not 
            specified, all columns are input columns.

        colPropertiesTable : idaDataFrame, optional
            The input IdaDataFrame where the properties of the columns of the 
            input IdaDataFrame (idadf) are stored. If this parameter is not 
            specified, the column properties of the input table column 
            properties are detected automatically.

        verbose : bool, default: False
            Verbosity mode.

        """
        if not type(idadf).__name__ == 'IdaDataFrame':
            raise TypeError("Argument should be an IdaDataFrame")

        idadf._idadb._check_procedure("KMEANS", "KMeans")

        # Check the ID
        if column_id not in idadf.columns:
            raise ValueError("No id columns is available in IdaDataFrame:" + column_id +
                             ". Either create a new ID column using add_column_id function" +
                             " or give the name of a column that can be used as ID")

        self._idadb = idadf._idadb
        self._idadf = idadf
        self._column_id = column_id

        # Check or create a model name
        if self.modelname is None:
            self.modelname = idadf._idadb._get_valid_modelname('KMEANS_')
        elif " " in self.modelname:
            raise ValueError("Space in model name is not allowed")
        else:
            if idadf._idadb.exists_model(self.modelname):
                idadf._idadb.drop_model(self.modelname)

        # Create a temporay view
        idadf.internal_state._create_view()
        tmp_view_name = idadf.internal_state.current_state # deprecated, hange to idadf.name
        
        if "." in tmp_view_name:
            tmp_view_name = tmp_view_name.split('.')[-1]

        try:
            # TODO: outtable is optional but this does not match with the doc
            # Defect to declare
            idadf._idadb._call_stored_procedure("IDAX.KMEANS ",
                                                model = self.modelname,
                                                intable = tmp_view_name,
                                                k = self.n_clusters,
                                                maxiter = self.max_iter,
                                                #outtable = self.outtable_fit,
                                                distance = self.distance,
                                                id = self._column_id,
                                                randseed = self.random_state,
                                                statistics = self.statistics,
                                                idbased = self.idbased,
                                                incolumn = self.incolumn,
                                                coldeftype = self.coldeftype,
                                                coldefrole = self.coldefrole,
                                                colPropertiesTable = self.colPropertiesTable)

        except:
            raise
        finally:
            idadf.internal_state._delete_view()
            idadf.commit()

        result = self._retrieve_KMeans_Model(self.modelname, verbose)
        self.centers = result['centers']
        self.cluster_centers_ = result['centers'].values
        self.withinss = result['withinss']
        self.size_clusters = [int(str(x)) for x in result['size']]
        self.inertia_ = sum(self.withinss)

        if verbose is True:
            self.describe()

        return

[docs]    def predict(self, idadf, column_id=None, outtable = None):
        """
        Apply the K-means clustering model to new data.

        Parameters
        ----------
        idadf : IdaDataFrame
            IdaDataFrame to be used as input.

        column_id : str
            The column of the input table that identifies a unique instance ID.
            Default: the same id column that is specified in the stored procedure to build the model.

        outtable : str
            The name of the output table where the assigned clusters are stored.
            If this parameter is not specified, it is generated automatically.
            If the parameter corresponds to an existing table in the database,
            it is replaced.

        Returns
        -------
        IdaDataFrame
            IdaDataFrame containing the closest cluster for each data point referenced by its ID.
        """
        if not type(idadf).__name__ == 'IdaDataFrame':
            raise TypeError("Argument should be an IdaDataFrame")

        # Check the ID
        if column_id is None:
            column_id = self._column_id
        if column_id not in idadf.columns:
            raise ValueError("No id columns is available in IdaDataFrame:" + column_id +
                             ". Either create a new ID column using add_column_id function" +
                             " or give the name of a column that can be used as ID")

        if self._idadb is None:
            raise IdaKMeansError("No KMeans model was trained before")


        if outtable is None:
            outtable = idadf._idadb._get_valid_modelname('PREDICT_KMEANS_')
        else:
            if self.outtable:
                outtable = self.outtable
            outtable = ibmdbpy.utils.check_tablename(outtable)
            if idadf._idadb.exists_table(outtable):
                idadf._idadb.drop_table(outtable)

        self.outtable = outtable
        # Create a temporay view
        idadf.internal_state._create_view()
        tmp_view_name = idadf.internal_state.current_state
        
        if "." in tmp_view_name:
            tmp_view_name = tmp_view_name.split('.')[-1]
            
        try:
            idadf._idadb._call_stored_procedure("IDAX.PREDICT_KMEANS ",
                                                 model = self.modelname,
                                                 intable = tmp_view_name,
                                                 id = column_id,
                                                 outtable = self.outtable
                                                 )
        except:
            raise
        finally:
            idadf.internal_state._delete_view()
            idadf._idadb.commit()

        self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, outtable, indexer=column_id)
        return self.labels_

[docs]    def fit_predict(self, idadf, column_id="ID", incolumn=None, coldeftype=None,
                    coldefrole=None, colPropertiesTable=None, outtable = None,
                    verbose=False):
        """
        Convenience function for fitting the model and using it to make 
        predictions on the same dataset. See the fit and predict documentation 
        for an explanation about their attributes.
        """
        self.fit(idadf, column_id, incolumn, coldeftype, coldefrole,
                 colPropertiesTable, verbose)
        return self.predict(idadf, column_id, outtable)

[docs]    def describe(self):
        """
        Return a description of the K-means clustering, if a prediction was 
        made. Otherwise,  this function returns the parameters of the model.
        """
        if self._idadb is None:
            return self.get_params
        else:
            print("KMeans clustering with " + str(self.n_clusters) +
            " clusters of sizes " + ', '.join([str(x) for x in self.size_clusters]))
            print()
            print("Cluster means: ")
            print(self.centers)
            print()
            print("Within cluster sum of squares by cluster:")
            print(self.withinss)
            try:
                self._idadb._call_stored_procedure("IDAX.PRINT_MODEL ", model = self.modelname)
            except:
                raise
            return

[docs]    def _retrieve_KMeans_Model(self, modelname, verbose = False):
        """
        Retrieve information about the model to print the results. The KMEANS 
        IDAX function stores its result in 4 tables:
            * <MODELNAME>_MODEL
            * <MODELNAME>_COLUMNS
            * <MODELNAME>_COLUMN_STATISTICS
            * <MODELNAME>_CLUSTERS

        Parameters
        ----------
        modelname : str
            The name of the model that is retrieved.

        verbose : bol, default: False
            Verbosity mode.
        """
        modelname = ibmdbpy.utils.check_tablename(modelname)

        if self._idadb is None:
            raise IdaKMeansError("No KMeans model was trained before")

        model_main = self._idadb.ida_query('SELECT * FROM "' +
        self._idadb.current_schema + '"."' + modelname + '_MODEL"')
        # Woraround for specific version of ODBC
        model_main.columns = ['MODELCLASS', 'COMPARISONTYPE', 'COMPARISONMEASURE', 'NUMCLUSTERS']
        model_main.columns = [x.upper() for x in model_main.columns]


        col_info = self._idadb.ida_query('SELECT * FROM "' +
        self._idadb.current_schema + '"."' + modelname + '_COLUMNS"',)
        col_info.columns = ['COLUMNNAME', 'DATATYPE', 'OPTYPE', 'USAGETYPE', 'COLUMNWEIGHT',
       'AUTOTRANSFORM', 'TRANSFORMEDCOLUMN', 'COMPAREFUNCTION', 'IMPORTANCE',
       'OUTLIERTREATMENT', 'LOWERLIMIT', 'UPPERLIMIT', 'CLOSURE',
       'STATISTICSTYPE']
        col_info.columns = [x.upper() for x in col_info.columns]

        col_stats = self._idadb.ida_query('SELECT * FROM "' +
        self._idadb.current_schema + '"."' + modelname + '_COLUMN_STATISTICS"')
        col_stats.columns = ['CLUSTERID', 'COLUMNNAME', 'CARDINALITY', 'MODE', 'MINIMUM', 'MAXIMUM',
       'MEAN', 'VARIANCE', 'VALIDFREQ', 'MISSINGFREQ', 'INVALIDFREQ',
       'IMPORTANCE']
        col_stats.columns = [x.upper() for x in col_stats.columns]

        km_out_stat = self._idadb.ida_query('SELECT * FROM "' +
        self._idadb.current_schema + '"."' + modelname + '_CLUSTERS"')
        km_out_stat.columns = ['CLUSTERID', 'NAME', 'DESCRIPTION', 'SIZE', 'RELSIZE', 'WITHINSS']
        km_out_stat.columns = [x.upper() for x in km_out_stat.columns]

        k = model_main.iloc[0][3]
        distance = model_main.iloc[0][2]
        cont_cols = col_info.loc[(col_info['USAGETYPE'] == 'active') & (col_info['OPTYPE'] == 'continuous'), ['COLUMNNAME']]
        cat_cols = col_info.loc[(col_info['USAGETYPE'] == 'active') & (col_info['OPTYPE'] == 'categorical'), ['COLUMNNAME']]

        columns = []
        for x in col_stats['COLUMNNAME'].values:
            if x not in columns:
                columns.append(x)

        clusters = km_out_stat['CLUSTERID'].values
        clusters.sort()

        cluster_centers = []
        for cluster in clusters:
            tmp = [cluster]
            for column in columns:
                if column in cont_cols.values:
                    tmp.append(col_stats.loc[(col_stats['CLUSTERID'] == cluster) & (col_stats['COLUMNNAME'] == column)]['MEAN'].values[0])
                elif column in cat_cols.values:
                    tmp.append(col_stats.loc[(col_stats['CLUSTERID'] == cluster) & (col_stats['COLUMNNAME'] == column)]['MODE'].values[0])
                else:
                    raise TypeError("Unexpected column category")
            cluster_centers.append(tmp)

        centers = pd.DataFrame([tuple(x) for x in cluster_centers])
        centers.columns = ['CLUSTERID'] + columns

        if verbose is True:
            print("MODEL")
            print(model_main)
            print("COLUMNS")
            print(col_info)
            print("COLUMNS_STATISTICS")
            print(col_stats)
            print("CLUSTERS")
            print(km_out_stat)

        result = dict()
        result['withinss'] = km_out_stat['WITHINSS'].values
        result['size'] = km_out_stat['SIZE'].values
        result['relsize'] = km_out_stat['RELSIZE'].values
        result['distance'] = distance

        result['k'] = k
        result['centers'] = centers

        return result