# Source code for ibmdbpy.learn.kmeans

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

"""
In-Database K-means. Copies the interface of sklearn.cluster.KMeans
"""
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from builtins import int
from builtins import dict
from builtins import str
from future import standard_library
standard_library.install_aliases()

from lazy import lazy
import pandas as pd

import ibmdbpy
from ibmdbpy import IdaDataFrame
from ibmdbpy.exceptions import IdaKMeansError
import six

class KMeans(object):
    """
    In-database K-means clustering.

    The K-means algorithm is the most widely used clustering algorithm that
    uses an explicit distance measure to partition the data set into
    clusters. The K-means algorithm represents each cluster by the vector of
    the mean attribute values of all training instances - for numeric
    attributes - and by the vector of modal (most frequent) values - for
    nominal attributes - that are assigned to that cluster. This cluster
    representation is called cluster center.

    The KMeans class provides an interface for using the KMEANS and
    PREDICT_KMEANS IDAX methods of dashDB/DB2.
    """

    def __init__(self, n_clusters=3, modelname=None, max_iter=5,
                 distance="euclidean", random_state=12345, idbased=False,
                 statistics=None):
        """
        Constructor for K-means clustering.

        Parameters
        ----------
        n_clusters : int, optional, default: 3
            The number of cluster centers. Range : > 2

        modelname : str, optional
            The name of the clustering model that is built. If it is not
            given, it is generated automatically. If the parameter
            corresponds to an existing model in the database, it is replaced
            during the fitting step.

        max_iter : int, > 1 and <= 1000, default = 5
            The maximum number of iterations.

        distance : str, default: "euclidean"
            The distance function. The following values are allowed:
            "euclidean" and "norm_euclidean".

        random_state : int, default: 12345
            The random seed of the generator.

        idbased : bool, optional, default: False
            Specifies that the random seed of the generator is based on the
            value of the ID column.

        statistics : str, optional
            Indicates the statistics that are collected. The following
            values are allowed: 'none', 'columns', 'values:n', and 'all':

                * If statistics='none' is specified, no statistics are
                  collected.
                * If statistics='columns' is specified, statistics on the
                  columns of the input table are collected, for example,
                  mean values.
                * If statistics='values:n' is specified, and if n is a
                  positive number, statistics on the columns and the column
                  values are collected. Up to <n> column value statistics
                  are collected.

                    * If a nominal column contains more than <n> values,
                      only the <n> most frequent column statistics are kept.
                    * If a numeric column contains more than <n> values, the
                      values are discretized, and the statistics are
                      collected on the discretized values.

                * statistics=all is identical to statistics=values:100.

        Attributes
        ----------
        centers : pandas.DataFrame
            Set by ``fit``. One row per cluster: the CLUSTERID followed by
            one value per model column (the MEAN statistic for continuous
            columns, the MODE statistic for categorical columns).
        cluster_centers_ : numpy.ndarray
            Set by ``fit``. The values of ``centers`` (CLUSTERID column
            included).
        withinss : numpy.ndarray
            Set by ``fit``. The WITHINSS value of each cluster.
        size_clusters : list of int
            Set by ``fit``. The SIZE value of each cluster.
        inertia_ : number
            Set by ``fit``. The sum of ``withinss``.
        labels_ : IdaDataFrame
            The result of ``predict``; computed lazily on the fitted
            IdaDataFrame on first access.

        Returns
        -------
        The KMeans object, ready to be used for fitting and prediction

        Examples
        --------
        >>> idadb = IdaDataBase("DASHDB")
        >>> idadf = IdaDataFrame(idadb, "IRIS", indexer = "ID")
        >>> kmeans = KMeans(3) # clustering with 3 clusters
        >>> kmeans.fit(idadf)
        >>> kmeans.predict(idadf)

        Notes
        -----
        Inner parameters of the model can be printed and modified by using
        get_params and set_params. But we recommend creating a new KMeans
        model instead of modifying it.
        """
        self.modelname = modelname
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.distance = distance
        self.random_state = random_state
        self.idbased = idbased
        self.statistics = statistics

        # Get set at fit step
        self._idadb = None
        self._idadf = None
        self._column_id = None
        self.incolumn = None
        self.coldeftype = None
        self.coldefrole = None
        self.colPropertiesTable = None

        # Get set at predict step
        self.outtable = None

    @lazy
    def labels_(self):
        """
        Return the corresponding labels for each ID.

        Raises
        ------
        AttributeError
            If the prediction on the fitted IdaDataFrame fails, for example
            because the model was never fitted.
        """
        try:
            return self.predict(self._idadf, self._column_id)
        except Exception:
            # Only ordinary errors are translated; KeyboardInterrupt and
            # SystemExit must not be disguised as a missing attribute.
            raise AttributeError(str(self.__class__) +
                                 " object has no attribute 'labels_'")

    def get_params(self):
        """
        Return the parameters of the K-means clustering.

        Returns
        -------
        dict
            Maps each parameter name to its current value.
        """
        names = ('modelname', 'n_clusters', 'max_iter', 'distance',
                 'random_state', 'idbased', 'statistics', 'incolumn',
                 'coldeftype', 'coldefrole', 'colPropertiesTable',
                 'outtable')
        params = dict()
        for name in names:
            params[name] = getattr(self, name)
        return params

    def set_params(self, **params):
        """
        Change the parameters of the K-means clustering.

        Parameters
        ----------
        **params
            Parameter names (as returned by ``get_params``) and their new
            values.

        Returns
        -------
        The KMeans object itself.

        Raises
        ------
        ValueError
            If a name is not a parameter of this estimator.
        """
        if not params:
            # Simple optimisation to gain speed (inspect is slow)
            return self
        valid_params = self.get_params()
        for key, value in six.iteritems(params):
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s'
                                 % (key, self.__class__.__name__))
            setattr(self, key, value)
        return self

    @staticmethod
    def _check_column_id(idadf, column_id):
        """Raise ValueError unless ``column_id`` is a column of ``idadf``."""
        if column_id not in idadf.columns:
            raise ValueError("No id columns is available in IdaDataFrame:" +
                             str(column_id) +
                             ". Either create a new ID column using add_column_id function" +
                             " or give the name of a column that can be used as ID")

    def fit(self, idadf, column_id="ID", incolumn=None, coldeftype=None,
            coldefrole=None, colPropertiesTable=None, verbose=False):
        """
        Use the KMEANS stored procedure to build a K-means clustering model
        that clusters the input data into k centers.

        Parameters
        ----------
        idadf : IdaDataFrame
            The name of the input IdaDataFrame.

        column_id : str, default: "ID"
            The column of the input IdaDataFrame that identifies a unique
            instance ID.

        incolumn : dict, optional
            The columns of the input table that have specific properties,
            which are separated by a semi-colon (;). Each column is succeeded
            by one or more of the following properties:

                * By type nominal (':nom') or by type continuous (':cont').
                  By default, numerical types are continuous, and all other
                  types nominal.
                * By role ':id', ':target', ':input', or ':ignore'.

            If this parameter is not specified, all columns of the input
            table have default properties.

        coldeftype : dict, optional
            The default type of the input table columns. The following
            values are allowed: 'nom' and 'cont'. If the parameter is not
            specified, numeric columns are continuous and all other columns
            are nominal.

        coldefrole : dict, optional
            The default role of the input table columns. The following
            values are allowed: 'input' and 'ignore'. If the parameter is not
            specified, all columns are input columns.

        colPropertiesTable : idaDataFrame, optional
            The input IdaDataFrame where the properties of the columns of the
            input IdaDataFrame (idadf) are stored. If this parameter is not
            specified, the column properties of the input table column
            properties are detected automatically.

        verbose : bool, default: False
            Verbosity mode.

        Raises
        ------
        TypeError
            If ``idadf`` is not an IdaDataFrame.
        ValueError
            If ``column_id`` is not a column of ``idadf`` or if the model
            name contains a space.
        """
        if not type(idadf).__name__ == 'IdaDataFrame':
            raise TypeError("Argument should be an IdaDataFrame")

        idadf._idadb._check_procedure("KMEANS", "KMeans")

        # Check the ID
        self._check_column_id(idadf, column_id)

        # Column properties given to fit() take effect; a value that was set
        # earlier through set_params is kept when the argument is omitted.
        if incolumn is not None:
            self.incolumn = incolumn
        if coldeftype is not None:
            self.coldeftype = coldeftype
        if coldefrole is not None:
            self.coldefrole = coldefrole
        if colPropertiesTable is not None:
            self.colPropertiesTable = colPropertiesTable

        # Check or create a model name
        if self.modelname is None:
            self.modelname = idadf._idadb._get_valid_modelname('KMEANS_')
        elif " " in self.modelname:
            raise ValueError("Space in model name is not allowed")
        elif idadf._idadb.exists_model(self.modelname):
            idadf._idadb.drop_model(self.modelname)

        # Create a temporary view
        idadf.internal_state._create_view()
        # Only the unqualified view name is passed on, any schema prefix is
        # dropped.
        tmp_view_name = idadf.internal_state.current_state.split('.')[-1]

        try:
            # TODO: outtable is optional but this does not match with the doc
            # Defect to declare
            idadf._idadb._call_stored_procedure("IDAX.KMEANS ",
                                                model=self.modelname,
                                                intable=tmp_view_name,
                                                k=self.n_clusters,
                                                maxiter=self.max_iter,
                                                distance=self.distance,
                                                id=column_id,
                                                randseed=self.random_state,
                                                statistics=self.statistics,
                                                idbased=self.idbased,
                                                incolumn=self.incolumn,
                                                coldeftype=self.coldeftype,
                                                coldefrole=self.coldefrole,
                                                colPropertiesTable=self.colPropertiesTable)
        finally:
            idadf.internal_state._delete_view()
            idadf.commit()

        # The object only counts as trained once the procedure succeeded.
        self._idadb = idadf._idadb
        self._idadf = idadf
        self._column_id = column_id
        # Labels cached for a previous model are stale after a refit.
        self.__dict__.pop('labels_', None)

        result = self._retrieve_KMeans_Model(self.modelname, verbose)
        self.centers = result['centers']
        self.cluster_centers_ = result['centers'].values
        self.withinss = result['withinss']
        self.size_clusters = [int(str(x)) for x in result['size']]
        self.inertia_ = sum(self.withinss)

        if verbose is True:
            self.describe()

        return

    def predict(self, idadf, column_id=None, outtable=None):
        """
        Apply the K-means clustering model to new data.

        Parameters
        ----------
        idadf : IdaDataFrame
            IdaDataFrame to be used as input.

        column_id : str
            The column of the input table that identifies a unique instance
            ID. Default: the same id column that is specified in the stored
            procedure to build the model.

        outtable : str
            The name of the output table where the assigned clusters are
            stored. If this parameter is not specified, it is generated
            automatically. If the parameter corresponds to an existing table
            in the database, it is replaced.

        Returns
        -------
        IdaDataFrame
            IdaDataFrame containing the closest cluster for each data point
            referenced by its ID.

        Raises
        ------
        TypeError
            If ``idadf`` is not an IdaDataFrame.
        IdaKMeansError
            If no model was fitted before.
        ValueError
            If the ID column is not a column of ``idadf``.
        """
        if not type(idadf).__name__ == 'IdaDataFrame':
            raise TypeError("Argument should be an IdaDataFrame")

        # Checked before the ID column: on an unfitted model the default ID
        # column is None and would produce a misleading error.
        if self._idadb is None:
            raise IdaKMeansError("No KMeans model was trained before")

        # Check the ID
        if column_id is None:
            column_id = self._column_id
        self._check_column_id(idadf, column_id)

        if outtable is None:
            outtable = idadf._idadb._get_valid_modelname('PREDICT_KMEANS_')
        else:
            # An explicitly requested table always wins over the table of a
            # previous prediction; an existing table is replaced.
            outtable = ibmdbpy.utils.check_tablename(outtable)
            if idadf._idadb.exists_table(outtable):
                idadf._idadb.drop_table(outtable)

        self.outtable = outtable

        # Create a temporary view
        idadf.internal_state._create_view()
        tmp_view_name = idadf.internal_state.current_state.split('.')[-1]

        try:
            idadf._idadb._call_stored_procedure("IDAX.PREDICT_KMEANS ",
                                                model=self.modelname,
                                                intable=tmp_view_name,
                                                id=column_id,
                                                outtable=self.outtable)
        finally:
            idadf.internal_state._delete_view()
            idadf._idadb.commit()

        self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, outtable,
                                            indexer=column_id)
        return self.labels_

    def fit_predict(self, idadf, column_id="ID", incolumn=None,
                    coldeftype=None, coldefrole=None,
                    colPropertiesTable=None, outtable=None, verbose=False):
        """
        Convenience function for fitting the model and using it to make
        predictions on the same dataset. See the fit and predict
        documentation for an explanation about their attributes.
        """
        self.fit(idadf, column_id, incolumn, coldeftype, coldefrole,
                 colPropertiesTable, verbose)
        return self.predict(idadf, column_id, outtable)

    def describe(self):
        """
        Print a description of the K-means clustering, if a model was
        fitted. Otherwise, this function returns the parameters of the
        model.

        Returns
        -------
        dict or None
            The parameters (see ``get_params``) when no model was fitted,
            otherwise None.
        """
        if self._idadb is None:
            return self.get_params()

        print("KMeans clustering with " + str(self.n_clusters) +
              " clusters of sizes " +
              ', '.join([str(x) for x in self.size_clusters]))
        print()
        print("Cluster means: ")
        print(self.centers)
        print()
        print("Within cluster sum of squares by cluster:")
        print(self.withinss)

        self._idadb._call_stored_procedure("IDAX.PRINT_MODEL ",
                                           model=self.modelname)
        return

    def _query_model_table(self, modelname, suffix, column_names):
        """Load the model table ``<modelname><suffix>`` of the current schema
        and label its columns with ``column_names``."""
        frame = self._idadb.ida_query('SELECT * FROM "' +
                                      self._idadb.current_schema + '"."' +
                                      modelname + suffix + '"')
        # Workaround for specific version of ODBC
        frame.columns = column_names
        return frame

    def _retrieve_KMeans_Model(self, modelname, verbose=False):
        """
        Retrieve information about the model to print the results. The
        KMEANS IDAX function stores its result in 4 tables:

            * <MODELNAME>_MODEL
            * <MODELNAME>_COLUMNS
            * <MODELNAME>_COLUMN_STATISTICS
            * <MODELNAME>_CLUSTERS

        Parameters
        ----------
        modelname : str
            The name of the model that is retrieved.

        verbose : bool, default: False
            Verbosity mode.

        Returns
        -------
        dict
            With the keys 'withinss', 'size', 'relsize' (per cluster, ordered
            by cluster ID), 'distance', 'k' and 'centers' (a pandas
            DataFrame with one row per cluster, ordered by cluster ID).

        Raises
        ------
        IdaKMeansError
            If no model was fitted before.
        TypeError
            If a column with statistics is neither an active continuous nor
            an active categorical column.
        """
        modelname = ibmdbpy.utils.check_tablename(modelname)

        if self._idadb is None:
            raise IdaKMeansError("No KMeans model was trained before")

        model_main = self._query_model_table(
            modelname, '_MODEL',
            ['MODELCLASS', 'COMPARISONTYPE', 'COMPARISONMEASURE',
             'NUMCLUSTERS'])
        col_info = self._query_model_table(
            modelname, '_COLUMNS',
            ['COLUMNNAME', 'DATATYPE', 'OPTYPE', 'USAGETYPE', 'COLUMNWEIGHT',
             'AUTOTRANSFORM', 'TRANSFORMEDCOLUMN', 'COMPAREFUNCTION',
             'IMPORTANCE', 'OUTLIERTREATMENT', 'LOWERLIMIT', 'UPPERLIMIT',
             'CLOSURE', 'STATISTICSTYPE'])
        col_stats = self._query_model_table(
            modelname, '_COLUMN_STATISTICS',
            ['CLUSTERID', 'COLUMNNAME', 'CARDINALITY', 'MODE', 'MINIMUM',
             'MAXIMUM', 'MEAN', 'VARIANCE', 'VALIDFREQ', 'MISSINGFREQ',
             'INVALIDFREQ', 'IMPORTANCE'])
        km_out_stat = self._query_model_table(
            modelname, '_CLUSTERS',
            ['CLUSTERID', 'NAME', 'DESCRIPTION', 'SIZE', 'RELSIZE',
             'WITHINSS'])

        # Order whole rows by cluster ID, so that sizes and within-cluster
        # sums of squares stay aligned with the rows of the centers table.
        km_out_stat = km_out_stat.sort_values('CLUSTERID').reset_index(drop=True)

        k = model_main.iloc[0, 3]
        distance = model_main.iloc[0, 2]

        active = col_info['USAGETYPE'] == 'active'
        cont_cols = set(col_info.loc[active & (col_info['OPTYPE'] == 'continuous'),
                                     'COLUMNNAME'])
        cat_cols = set(col_info.loc[active & (col_info['OPTYPE'] == 'categorical'),
                                    'COLUMNNAME'])

        # Column names in order of first appearance
        columns = list(col_stats['COLUMNNAME'].drop_duplicates())

        cluster_centers = []
        for cluster in km_out_stat['CLUSTERID']:
            cluster_stats = col_stats[col_stats['CLUSTERID'] == cluster]
            center = [cluster]
            for column in columns:
                # Continuous columns are represented by their mean,
                # categorical columns by their most frequent value.
                if column in cont_cols:
                    statistic = 'MEAN'
                elif column in cat_cols:
                    statistic = 'MODE'
                else:
                    raise TypeError("Unexpected column category")
                center.append(
                    cluster_stats.loc[cluster_stats['COLUMNNAME'] == column,
                                      statistic].values[0])
            cluster_centers.append(tuple(center))

        centers = pd.DataFrame(cluster_centers,
                               columns=['CLUSTERID'] + columns)

        if verbose is True:
            print("MODEL")
            print(model_main)
            print("COLUMNS")
            print(col_info)
            print("COLUMNS_STATISTICS")
            print(col_stats)
            print("CLUSTERS")
            print(km_out_stat)

        result = dict()
        result['withinss'] = km_out_stat['WITHINSS'].values
        result['size'] = km_out_stat['SIZE'].values
        result['relsize'] = km_out_stat['RELSIZE'].values
        result['distance'] = distance
        result['k'] = k
        result['centers'] = centers
        return result