Source code for ibmdbpy.feature_selection.discretize

# -*- coding: utf-8 -*-
"""
Created on Mon Nov 23 09:02:30 2015

@author: efouche
"""
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases()

from ibmdbpy.internals import idadf_state
import ibmdbpy
from ibmdbpy.utils import timed

import six

@timed
@idadf_state(force = True)
def discretize(idadf, columns = None, disc= "em", target = None, bins = None, outtable = None, clear_existing=False):
    """
    Discretize a set of numerical columns from an IdaDataFrame and return an
    IdaDataFrame open on the discretized version of the dataset.

    Parameters
    ----------
    idadf : IdaDataFrame
        The IdaDataFrame holding the data to discretize.
    columns : str or list of str, optional
        A column or list of columns to be discretized. If None, the columns
        returned by ``idadf._get_numerical_columns()`` are used, minus the
        target column when a target is given.
    disc : "ef", "em", "ew", "ewn" default: "em"
        Discretization method to be used

            - ef: Discretization bins of equal frequency
            - em: Discretization bins of minimal entropy
            - ew: Discretization bins of equal width
            - ewn: Discretization bins of equal width with human-friendly limits
    target : str
        Target column against which the discretization will be done.
        Relevant only for "em" discretization.
    bins : int, optional
        Number of bins. Not relevant for "em" discretization.
    outtable : str, optional
        The name of the output table where the discretized dataset is
        stored. If this parameter is not specified, a name is generated
        automatically.
    clear_existing : bool, default: False
        If set to True and ``outtable`` is given, an attempt is made to drop
        a table with the same name before the discretized dataset is
        written. A failure of that drop is ignored.

    Returns
    -------
    IdaDataFrame
        An IdaDataFrame opened on the table holding the discretized data.

    Raises
    ------
    TypeError, ValueError
        Propagated from the argument validation done by ``_check``.
    """
    if columns is None:
        columns = idadf._get_numerical_columns()
        if target is not None:
            # Compare each column name (not the whole list) with the target,
            # so that a numerical target is really left out of the columns
            # to discretize.
            columns = [x for x in columns if x != target]
    elif isinstance(columns, six.string_types):
        # A single column name was given: normalise it to a list.
        columns = [columns]

    stored_proc = _check(idadf, columns, disc, target, bins, outtable)

    # Intermediate table receiving the bin boundaries.
    bound_outtable = idadf._idadb._get_valid_tablename('DISC_BOUNDS_%s_'%idadf.tablename)

    intable = idadf.name # either the table or a view on the top
    incolumn = "\";\"".join(columns)

    # Calculate bounds
    idadf._idadb._call_stored_procedure("IDAX.%s"%stored_proc,
                                        outtable=bound_outtable,
                                        intable=intable,
                                        incolumn=incolumn,
                                        target=target,
                                        bins=bins)

    # Create discretized dataset
    if outtable is None:
        disc_outtable = idadf._idadb._get_valid_tablename('DISC_%s_'%idadf.tablename)
    else:
        if clear_existing is True:
            try:
                idadf._idadb.drop_table(outtable)
            except Exception:
                # Best effort only: the drop is allowed to fail (for
                # instance when there is nothing to drop). Unlike a bare
                # ``except``, KeyboardInterrupt/SystemExit still propagate.
                pass
        disc_outtable = outtable

    try:
        idadf._idadb._call_stored_procedure("IDAX.APPLY_DISC",
                                            outtable=disc_outtable,
                                            intable=intable,
                                            btable=bound_outtable,
                                            replace="T")
    finally:
        # The boundaries table is only an intermediate result; remove it
        # whether or not applying the discretization succeeded.
        idadf._idadb.drop_table(bound_outtable)

    return ibmdbpy.IdaDataFrame(idadf._idadb, disc_outtable)
def _check(idadf, columns, disc, target, bins, outtable):
    """
    Helper function to handle basic checks for
    ibmdbpy.feature_selection.discretize

    Validates the arguments and maps the discretization method to the name
    of the corresponding stored procedure.

    Parameters
    ----------
    idadf : IdaDataFrame
        Only its ``columns`` attribute is read here.
    columns : list of str or None
        Columns to be discretized. Column checks are skipped when None.
    disc : str
        Discretization method: "em", "ef", "ew" or "ewn".
    target : str or None
        Target column, mandatory for "em" and forbidden otherwise.
    bins : int or None
        Number of bins, forbidden for "em".
    outtable : str or None
        Output table name; when given it is validated with
        ``ibmdbpy.utils.check_tablename``.

    Returns
    -------
    str
        "EMDISC", "EFDISC", "EWDISC" or "EWDISC_NICE".

    Raises
    ------
    TypeError
        If ``bins`` is given but is not an integer.
    ValueError
        If the combination of arguments is inconsistent, a column is not
        part of ``idadf.columns`` or the method is unknown.
    """
    if outtable is not None:
        ibmdbpy.utils.check_tablename(outtable)

    if bins is not None:
        # bool is a subclass of int, so it has to be excluded explicitly.
        if isinstance(bins, bool) or not isinstance(bins, int):
            raise TypeError("bins argument is not of integer type")

    if columns is not None:
        if target is not None and target in columns:
            raise ValueError("Target in columns.")
        unknown = [column for column in columns if column not in idadf.columns]
        if unknown:
            raise ValueError("Undefined columns: %s"%", ".join(unknown))

    if disc == "em":
        if bins is not None:
            raise ValueError("Number of bins is automatically detected for Entropy Minimization discretization.")
        if target is None:
            raise ValueError("Need to define a target for Entropy Minimization discretization.")
        # "target in columns" has already been rejected above whenever
        # columns is given, so it is not tested a second time here.
        if target not in idadf.columns:
            raise ValueError("Undefined target column %s"%target)
        return "EMDISC"

    # Resolve the method first so that an unknown method is always reported
    # as such, whatever the other arguments are.
    if disc == "ef":
        stored_proc = "EFDISC"
    elif disc == "ew":
        stored_proc = "EWDISC"
    elif disc == "ewn":
        stored_proc = "EWDISC_NICE"
    else:
        raise ValueError("Unknown discretization method.")

    if target is not None:
        raise ValueError("Target attribute defined only for Entropy Minimization discretization.")

    return stored_proc