Source code for ibmdbpy.utils

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

""" Utility functions """

# Python 2 compatibility
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from builtins import range
from builtins import input
from future import standard_library
standard_library.install_aliases()

import sys
import os
import warnings
from time import time
from functools import wraps

import six
import pandas as pd
from copy import deepcopy

#-----------------------------------------------------------------------------
# Environment variable setter

def set_verbose(is_verbose):
    """
    Set the environment variable "VERBOSE" to 'True' or 'False'. If it is set
    to 'True', all SQL requests are printed in the console.
    """
    if is_verbose is True:
        os.environ['VERBOSE'] = 'True'
    elif is_verbose is False:
        os.environ['VERBOSE'] = 'False'
    else:
        raise ValueError("is_verbose should be a boolean")

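# Usage sketch (added for illustration, not part of the original module):
# toggling SQL echoing for an interactive session.
#
# >>> from ibmdbpy.utils import set_verbose
# >>> set_verbose(True)     # subsequent SQL requests are printed to the console
# >>> set_verbose(False)    # silence SQL output again
# >>> set_verbose("yes")    # raises ValueError: is_verbose should be a boolean
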
def set_autocommit(is_autocommit):
    """
    Set the environment variable "AUTOCOMMIT" to 'True' or 'False'. If it is
    set to 'True', all operations are committed automatically.
    """
    if is_autocommit is True:
        os.environ['AUTOCOMMIT'] = 'True'
    elif is_autocommit is False:
        os.environ['AUTOCOMMIT'] = 'False'
    else:
        raise ValueError("is_autocommit should be a boolean")

#-----------------------------------------------------------------------------
# Performance measurement wrapper

def timed(function):
    """
    Measure the elapsed time of custom functions of the package.
    Should be used as a decorator.
    """
    @wraps(function)
    def wrapper(*args, **kwds):
        """Calculate elapsed time in seconds"""
        start = time()
        result = function(*args, **kwds)
        elapsed = time() - start
        if os.environ['VERBOSE'] == 'True':
            print("Execution time: %s seconds." % elapsed)
        return result
    return wrapper

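# Usage sketch (illustrative): timing an arbitrary function with the decorator
# above. The function name "square" is a made-up example; the elapsed time is
# only printed while the VERBOSE environment variable is 'True'.
#
# >>> @timed
# ... def square(x):
# ...     return x * x
# >>> set_verbose(True)
# >>> square(12)            # prints "Execution time: ... seconds." and returns 144
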
def query_yes_no(question, default=None):
    """
    Ask a yes/no question via raw_input() and return its answer.

    Parameters
    ----------
    question : str
        Question to be asked to the user. Should be a yes/no question.
    default : "yes"/"no"
        The presumed answer if the user just hits <Enter>.

    Returns
    -------
    bool
    """
    valid = {"yes": True, "y": True, "ye": True,
             "no": False, "n": False}
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        choice = input().lower()  # in Python 2.7, it is raw_input
        if default is not None and choice == '':
            return valid[default]
        elif choice in valid:
            return valid[choice]
        else:
            sys.stdout.write("Please respond with 'yes' or 'no' "
                             "(or 'y' or 'n').\n")

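# Usage sketch (illustrative): asking for confirmation before a destructive
# operation. The question text is a made-up example.
#
# >>> if query_yes_no("Drop table MY_TABLE?", default="no"):
# ...     print("dropping")     # reached only if the user answered y/yes
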
def chunklist(l, n):
    """ Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i+n]

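# Usage sketch (illustrative): splitting a list of column names into chunks of
# three, e.g. to build SQL statements of bounded size.
#
# >>> list(chunklist(['A', 'B', 'C', 'D', 'E', 'F', 'G'], 3))
# [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
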
def silent(function):
    """
    Decorator that silences the function it decorates, i.e. SQL queries
    will not be printed.

    Notes
    -----
    * Legacy from Benchmark submodule
    """
    @wraps(function)
    def wrapper(self, *args, **kwds):
        original_value = os.environ['VERBOSE']
        set_verbose(False)
        result = function(self, *args, **kwds)
        os.environ['VERBOSE'] = original_value
        return result
    return wrapper

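# Usage sketch (illustrative): silencing SQL output for one method of a
# hypothetical benchmark class. Note that the wrapper passes "self", so the
# decorator is meant for bound methods.
#
# >>> class MyBenchmark(object):
# ...     @silent
# ...     def run(self):
# ...         return self.execute_many_queries()   # hypothetical helper, not echoed
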
def to_nK(dataframe, nKrow):
    """
    Create a version of the dataframe that has nKrow thousand rows.

    Parameters
    ----------
    dataframe : DataFrame
        DataFrame to use as a basis.
    nKrow : int
        Number of thousands of rows the returned dataframe should contain.

    Returns
    -------
    DataFrame
        A version of the input dataframe with nKrow thousand rows.

    Notes
    -----
    * If the dataset is smaller than nKrow thousand rows, it is padded
      randomly with some existing rows, otherwise we return a random sample.
    * Legacy from Benchmark submodule
    """
    if not isinstance(nKrow, int) or nKrow < 1:
        raise ValueError("nKrow should be an integer with minimum value 1")

    def some(df, nrow):
        """Return nrow random rows from the given dataframe"""
        from numpy import random
        return df.iloc[random.choice(range(len(df)), nrow)]

    if nKrow*1000 > dataframe.shape[0]:
        df = pd.concat([dataframe,
                        some(dataframe, nKrow*1000 - dataframe.shape[0])],
                       ignore_index=True)
    else:
        df = pd.concat([some(dataframe, nKrow*1000)], ignore_index=True)
    return df

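# Usage sketch (illustrative): scaling a small pandas DataFrame to an exact
# number of thousand rows for benchmarking. "df_small" is a made-up frame.
#
# >>> import pandas as pd
# >>> df_small = pd.DataFrame({'A': range(100), 'B': range(100)})
# >>> to_nK(df_small, 2).shape[0]    # padded with randomly chosen existing rows
# 2000
# >>> to_nK(df_small, 0)             # raises ValueError
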
def extend_dataset(df, n):
    """
    Extend a dataframe horizontally by duplicating its columns n times.

    Notes
    -----
    * Legacy from Benchmark submodule
    """
    df_2 = deepcopy(df)
    df_out = deepcopy(df)
    while n > 0:
        df_2.columns = ["%s_extended_%s" % (column, n) for column in df.columns]
        df_out = pd.concat([df_out, df_2], axis=1)
        n -= 1
    return df_out

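# Usage sketch (illustrative): widening a two-column frame by duplicating its
# columns twice, e.g. to benchmark behaviour on wide tables.
#
# >>> import pandas as pd
# >>> base = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
# >>> list(extend_dataset(base, 2).columns)
# ['A', 'B', 'A_extended_2', 'B_extended_2', 'A_extended_1', 'B_extended_1']
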
def check_tablename(tablename):
    """
    Check that a string is a valid table name and convert it to upper case.

    Parameters
    ----------
    tablename : str
        String to be checked.

    Returns
    -------
    str
        Checked and upper-cased table name.

    Notes
    -----
    Table names should consist of upper case characters or numbers that can
    be separated by underscore ("_") characters.
    """
    tablename = check_case(tablename)
    if not all([(char.isalnum() | (char == '_') | (char == '.')) for char in tablename]):
        raise ValueError("Table name is not valid, only alphanumeric characters and underscores are allowed.")
    if tablename.count(".") > 1:
        raise ValueError("Table name is not valid, only one '.' character is allowed.")
    return tablename

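# Usage sketch (illustrative): validating identifiers before building SQL.
#
# >>> check_tablename("my_schema.my_table")   # warns (not upper case), returns upper-cased name
# 'MY_SCHEMA.MY_TABLE'
# >>> check_tablename("BAD-NAME")             # raises ValueError (illegal character)
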
def check_viewname(viewname):
    """
    Convenience function. Just an alternative name to check_tablename for
    checking view names, which have the same prerequisites as table names.
    See the check_tablename documentation.
    """
    return check_tablename(viewname)

def check_case(name):
    """
    Check if the name given as a parameter is in upper case and convert it
    to upper case.
    """
    if name != name.upper():
        warnings.warn("Mixed case names are not supported in database object names.",
                      UserWarning)
    return name.upper()

def _convert_dtypes(idadf, data):
    """
    DEPRECATED - CURRENTLY NOT IN USE.
    Was used for formatting dataframe types.

    Convert datatypes in a dataframe to float or to int according to the
    corresponding type in the database. This function only works if the
    dataframe that is given as a parameter has the same columns as the
    current IdaDataFrame.
    """
    # Note: Here I made the choice to convert every numeric attribute to
    # float. The reason is that converting to int for an attribute that has
    # missing values leads to an error; this is a known pandas gotcha.
    # We could use statistics._get_number_of_nas to know if there are
    # missing values for each attribute, but I chose not to do it, to reduce
    # complexity and improve performance. However, it is not efficient in
    # terms of memory.
    original_dtype = idadf.dtypes
    int_attributes = ['SMALLINT', 'INTEGER', 'BIGINT', 'BOOLEAN']
    float_attributes = ['REAL', 'DOUBLE', 'FLOAT', 'DECIMAL', 'DECFLOAT',
                        'APPROXIMATE', 'NUMERIC']

    for index, dtype in enumerate(original_dtype.values):
        if dtype[0] in int_attributes + float_attributes:
            data[[data.columns[index]]] = data[[data.columns[index]]].astype(float)
        else:
            data[[data.columns[index]]] = data[[data.columns[index]]].astype(object)
    return data

def _reset_attributes(idaobject, attributes):
    """
    Delete an attribute or a list of attributes of an object,
    if the attribute exists.
    """
    if (not hasattr(attributes, "__iter__")) | isinstance(attributes, six.string_types):
        attributes = [attributes]
    for attribute in attributes:
        try:
            delattr(idaobject, attribute)
        except AttributeError:
            pass

def _check_input(idadf, target, features):
    """
    Check if the input is valid.

    Target should be a string that exists as a column in the IdaDataFrame,
    or None. Features should be a string or a list of columns in the
    IdaDataFrame, or None. If features is None, a list with all features
    different from target is returned.
    """
    if target is not None:
        if not isinstance(target, six.string_types):
            raise ValueError("target should be a string")
        if target not in idadf.columns:
            raise ValueError("Unknown column %s" % target)

    if features is not None:
        if isinstance(features, six.string_types):
            if features not in idadf.columns:
                raise ValueError("Unknown column %s" % features)
            features = [features]
        for x in features:
            if x not in idadf.columns:
                raise ValueError("Unknown column %s" % x)
    else:
        if target is not None:
            features = [x for x in idadf.columns if x != target]
        else:
            features = list(idadf.columns)

    return target, features