Source code for ibmdbpy.feature_selection.gini

# -*- coding: utf-8 -*-
"""
Created on Tue Dec  1 12:29:30 2015

@author: efouche
"""
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases()


from collections import OrderedDict

import pandas as pd
import numpy as np
import six

from ibmdbpy.internals import idadf_state
from ibmdbpy.utils import timed
from ibmdbpy.feature_selection.private import _check_input

@idadf_state
@timed
[docs]def gini_pairwise(idadf, target=None, features=None, ignore_indexer=True): """ Compute the conditional gini coefficients between a set of features and a set of target in an IdaDataFrame. Parameters ---------- idadf : IdaDataFrame target : str or list of str, optional A column or list of columns against to be used as target. Per default, consider all columns features : str or list of str, optional A column or list of columns to be used as features. Per default, consider all columns. ignore_indexer : bool, default: True Per default, ignore the column declared as indexer in idadf Returns ------- Pandas.DataFrame or Pandas.Series if only one target Notes ----- Input columns as target and features should be categorical, otherwise this measure does not make much sense. Examples -------- >>> idadf = IdaDataFrame(idadb, "IRIS") >>> gini_pairwise(idadf) """ # Check input target, features = _check_input(idadf, target, features, ignore_indexer) gini_dict = OrderedDict() length = len(idadf) for t in target: gini_dict[t] = OrderedDict() features_notarget = [x for x in features if (x != t)] for feature in features_notarget: if t not in gini_dict: gini_dict[t] = OrderedDict() query = ("SELECT SUM((POWER(c,2) - gini)/c)/%s FROM "+ "(SELECT SUM(POWER(count,2)) as gini, SUM(count) as c FROM "+ "(SELECT CAST(COUNT(*) AS FLOAT) AS count, \"%s\" FROM %s GROUP BY \"%s\",\"%s\") "+ "GROUP BY \"%s\")") query0 = query%(length, feature, idadf.name, t, feature, feature) gini_dict[t][feature] = idadf.ida_scalar_query(query0) result = pd.DataFrame(gini_dict).fillna(np.nan) if len(result.columns) > 1: order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns] result = result.reindex(order) result = result.dropna(axis=1, how="all") if len(result.columns) == 1: if len(result) == 1: result = result.iloc[0,0] else: result = result[result.columns[0]].copy() result.sort(ascending = True) else: result = result.fillna(0) return result
@idadf_state @timed
[docs]def gini(idadf, features=None, ignore_indexer=True): """ Compute the gini coefficients for a set of features in an IdaDataFrame. Parameters ---------- idadf : IdaDataFrame features : str or list of str, optional A column or list of columns to be used as features. Per default, consider all columns. ignore_indexer : bool, default: True Per default, ignore the column declared as indexer in idadf Returns ------- Pandas.Series Notes ----- Input column should be categorical, otherwise this measure does not make much sense. Examples -------- >>> idadf = IdaDataFrame(idadb, "IRIS") >>> gini(idadf) """ if features is None: features = list(idadf.columns) else: if isinstance(features, six.string_types): features = [features] if ignore_indexer is True: if idadf.indexer: if idadf.indexer in features: features.remove(idadf.indexer) value_dict = OrderedDict() length = len(idadf)**2 for feature in features: subquery = "SELECT COUNT(*) AS count FROM %s GROUP BY \"%s\""%(idadf.name, feature) query = "SELECT (%s - SUM(POWER(count,2)))/%s FROM (%s)"%(length, length, subquery) value_dict[feature] = idadf.ida_scalar_query(query) if len(features) > 1: result = pd.Series(value_dict) else: result = value_dict[feature] return result