Source code for ibmdbpy.feature_selection.correlation

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases()
from collections import OrderedDict

import ibmdbpy
from ibmdbpy.internals import idadf_state
from ibmdbpy.utils import timed, chunklist

import pandas as pd

from ibmdbpy.feature_selection.private import _check_input


@idadf_state
@timed
def pearson(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the pearson correlation coefficients between a set of features
    and a set of target in an IdaDataFrame. Provide more granularity than
    IdaDataFrame.corr

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns against to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target
        A bare scalar is returned when the result reduces to a single
        target and a single feature.

    Raises
    ------
    TypeError
        If one of the selected features or targets is not a numerical column.

    Notes
    -----
    Input columns as target and features should be numerical.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> pearson(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s"%feature)

    # Full square matrix requested: delegate to the built-in implementation.
    if target == features:
        return idadf.corr(features=features, ignore_indexer=ignore_indexer)

    # Validate the targets themselves (the previous code re-tested the last
    # feature here, so a non-numerical target was never detected).
    for t in target:
        if t not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s"%t)

    value_dict = OrderedDict()
    for t in target:
        value_dict[t] = OrderedDict()
        features_notarget = [x for x in features if x != t]

        # Short lists go out as a single query, longer ones are split.
        # NOTE(review): the threshold (64) and the chunk size (100) differ;
        # both values are kept unchanged - confirm which limit is intended.
        if len(features_notarget) < 64:
            chunks = [features_notarget]
        else:
            chunks = chunklist(features_notarget, 100)

        # Collect into a list so that rows coming back as tuples, lists or
        # driver row objects can all be appended.
        data = []
        for chunk in chunks:
            agg_list = ["CORRELATION(\"%s\",\"%s\")"%(x, t) for x in chunk]
            agg_string = ', '.join(agg_list)
            name = idadf.internal_state.current_state
            data.extend(idadf.ida_query("SELECT %s FROM %s"%(agg_string, name),
                                        first_row_only=True))

        for i, feature in enumerate(features_notarget):
            value_dict[t][feature] = data[i]

    ### Fill the matrix
    # Cells missing from value_dict are the target-with-itself pairs, hence 1.
    # NOTE(review): fillna also replaces any missing value returned by the
    # query itself - confirm this is acceptable.
    result = pd.DataFrame(value_dict).fillna(1)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            result = result[result.columns[0]].copy()
            result.sort_values(inplace=True, ascending=False)
    else:
        # Rows: targets that are also features first, then remaining features.
        order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    return result
@idadf_state
@timed
def spearman(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the spearman rho correlation coefficients between a set of
    features and a set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns against to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target

    Raises
    ------
    TypeError
        If one of the selected features is not a numerical column.

    Notes
    -----
    Input columns as target and features should be numerical.
    This function is a wrapper for pearson: every column is replaced by its
    SQL RANK() in a temporary view and pearson is applied to that view.
    The scalability of this approach is not very good. Should not be used on
    high dimensional data.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> spearman(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s"%feature)

    if ignore_indexer is True and idadf.indexer:
        # Build a filtered copy instead of calling list.remove(): remove()
        # raises ValueError when the indexer is not among the features and
        # mutates a list that may be shared with numerical_columns or the
        # caller.
        features = [x for x in features if x != idadf.indexer]

    numerical_targets = [x for x in target if x in numerical_columns]

    # Union of features and targets, de-duplicated while keeping a
    # deterministic order (a plain set() made the order arbitrary).
    numerical_features = list(OrderedDict.fromkeys(list(features) + numerical_targets))

    agg_list = ["CAST(RANK() OVER (ORDER BY \"%s\") AS INTEGER) AS \"%s\""%(x, x) for x in numerical_features]
    agg_string = ', '.join(agg_list)

    # NOTE(review): the view is built on idadf.name whereas pearson queries
    # idadf.internal_state.current_state - confirm whether the current state
    # should be used here as well.
    expression = "SELECT %s FROM %s"%(agg_string, idadf.name)

    viewname = idadf._idadb._create_view_from_expression(expression)

    try:
        idadf_rank = ibmdbpy.IdaDataFrame(idadf._idadb, viewname)
        return pearson(idadf_rank, target=target, features=numerical_features, ignore_indexer=ignore_indexer)
    finally:
        # Always drop the temporary view, whether pearson succeeded or raised.
        idadf._idadb.drop_view(viewname)