Source code for ibmdbpy.utils

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

""" Utility functions """

# Python 2 compatibility
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from builtins import range
from builtins import input
from future import standard_library
standard_library.install_aliases()

import sys
import os
import warnings
from time import time
from functools import wraps

import six
import pandas as pd
from copy import deepcopy

#-----------------------------------------------------------------------------
# Environment variable setter

def set_verbose(is_verbose):
    """
    Set the environment variable "VERBOSE" to 'True' or 'False'. If it is set
    to 'True', all SQL requests are printed in the console.
    """
    if is_verbose is True:
        os.environ['VERBOSE'] = 'True'
    elif is_verbose is False:
        os.environ['VERBOSE'] = 'False'
    else:
        raise ValueError("is_verbose should be a boolean")

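# Usage sketch (added for illustration, not part of the original module):
# toggling SQL echoing for an interactive session.
#
# >>> from ibmdbpy.utils import set_verbose
# >>> set_verbose(True)     # subsequent SQL requests are printed to the console
# >>> set_verbose(False)    # silence SQL output again
# >>> set_verbose("yes")    # raises ValueError: is_verbose should be a boolean
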
def set_autocommit(is_autocommit):
    """
    Set the environment variable "AUTOCOMMIT" to 'True' or 'False'. If it is
    set to 'True', all operations are committed automatically.
    """
    if is_autocommit is True:
        os.environ['AUTOCOMMIT'] = 'True'
    elif is_autocommit is False:
        os.environ['AUTOCOMMIT'] = 'False'
    else:
        raise ValueError("is_autocommit should be a boolean")

#-----------------------------------------------------------------------------
# Performance measurement wrapper

def timed(function):
    """
    Measure the elapsed time of custom functions of the package.
    Should be used as a decorator.
    """
    @wraps(function)
    def wrapper(*args, **kwds):
        """Calculate elapsed time in seconds"""
        start = time()
        result = function(*args, **kwds)
        elapsed = time() - start
        if os.environ['VERBOSE'] == 'True':
            print("Execution time: %s seconds." % elapsed)
        return result
    return wrapper

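# Usage sketch (illustrative): timing an arbitrary function with the decorator
# above. The function name "square" is a made-up example; the elapsed time is
# only printed while the VERBOSE environment variable is 'True'.
#
# >>> @timed
# ... def square(x):
# ...     return x * x
# >>> set_verbose(True)
# >>> square(12)            # prints "Execution time: ... seconds." and returns 144
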
def query_yes_no(question, default=None):
    """
    Ask a yes/no question via raw_input() and return its answer.

    Parameters
    ----------
    question : str
        Question to be asked to the user. Should be a yes/no question.
    default : "yes"/"no"
        The presumed answer if the user just hits <Enter>.

    Returns
    -------
    bool
    """
    valid = {"yes": True, "y": True, "ye": True,
             "no": False, "n": False}
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        choice = input().lower()  # in Python 2.7, it is raw_input
        if default is not None and choice == '':
            return valid[default]
        elif choice in valid:
            return valid[choice]
        else:
            sys.stdout.write("Please respond with 'yes' or 'no' "
                             "(or 'y' or 'n').\n")

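# Usage sketch (illustrative): asking for confirmation before a destructive
# operation. The question text is a made-up example.
#
# >>> if query_yes_no("Drop table MY_TABLE?", default="no"):
# ...     print("dropping")     # reached only if the user answered y/yes
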
def chunklist(l, n):
    """ Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i+n]

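# Usage sketch (illustrative): splitting a list of column names into chunks of
# three, e.g. to build SQL statements of bounded size.
#
# >>> list(chunklist(['A', 'B', 'C', 'D', 'E', 'F', 'G'], 3))
# [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
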
def silent(function):
    """
    Decorator that silences the function it decorates, i.e. SQL queries
    will not be printed.

    Notes
    -----
    * Legacy from Benchmark submodule
    """
    @wraps(function)
    def wrapper(self, *args, **kwds):
        original_value = os.environ['VERBOSE']
        set_verbose(False)
        result = function(self, *args, **kwds)
        os.environ['VERBOSE'] = original_value
        return result
    return wrapper

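# Usage sketch (illustrative): silencing SQL output for one method of a
# hypothetical benchmark class. Note that the wrapper passes "self", so the
# decorator is meant for bound methods.
#
# >>> class MyBenchmark(object):
# ...     @silent
# ...     def run(self):
# ...         return self.execute_many_queries()   # hypothetical helper, not echoed
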
def to_nK(dataframe, nKrow):
    """
    Create a version of the dataframe that has nKrow thousand rows.

    Parameters
    ----------
    dataframe : DataFrame
        DataFrame to use as a basis.
    nKrow : int
        Number of thousands of rows the returned dataframe should contain.

    Returns
    -------
    DataFrame
        A version of the input dataframe with nKrow thousand rows.

    Notes
    -----
    * If the dataset is smaller than nKrow thousand rows, it is padded
      randomly with some existing rows, otherwise we return a random sample.
    * Legacy from Benchmark submodule
    """
    if not isinstance(nKrow, int) or nKrow < 1:
        raise ValueError("nKrow should be an integer with minimum value 1")

    def some(df, nrow):
        """Return nrow random rows from the given dataframe"""
        from numpy import random
        return df.iloc[random.choice(range(len(df)), nrow)]

    if nKrow*1000 > dataframe.shape[0]:
        df = pd.concat([dataframe,
                        some(dataframe, nKrow*1000 - dataframe.shape[0])],
                       ignore_index=True)
    else:
        df = pd.concat([some(dataframe, nKrow*1000)], ignore_index=True)
    return df

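# Usage sketch (illustrative): scaling a small pandas DataFrame to an exact
# number of thousand rows for benchmarking. "df_small" is a made-up frame.
#
# >>> import pandas as pd
# >>> df_small = pd.DataFrame({'A': range(100), 'B': range(100)})
# >>> to_nK(df_small, 2).shape[0]    # padded with randomly chosen existing rows
# 2000
# >>> to_nK(df_small, 0)             # raises ValueError
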
def extend_dataset(df, n):
    """
    Extend a dataframe horizontally by duplicating its columns n times.

    Notes
    -----
    * Legacy from Benchmark submodule
    """
    df_2 = deepcopy(df)
    df_out = deepcopy(df)
    while n > 0:
        df_2.columns = ["%s_extended_%s" % (column, n) for column in df.columns]
        df_out = pd.concat([df_out, df_2], axis=1)
        n -= 1
    return df_out

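# Usage sketch (illustrative): widening a two-column frame by duplicating its
# columns twice, e.g. to benchmark behaviour on wide tables.
#
# >>> import pandas as pd
# >>> base = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
# >>> list(extend_dataset(base, 2).columns)
# ['A', 'B', 'A_extended_2', 'B_extended_2', 'A_extended_1', 'B_extended_1']
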
def check_tablename(tablename):
    """
    Check that a string is a valid table name and convert it to upper case.

    Parameters
    ----------
    tablename : str
        String to be checked.

    Returns
    -------
    str
        Checked and upper-cased table name.

    Notes
    -----
    Table names should consist of upper case characters or numbers that can
    be separated by underscore ("_") characters.
    """
    tablename = check_case(tablename)
    if not all([(char.isalnum() | (char == '_') | (char == '.')) for char in tablename]):
        raise ValueError("Table name is not valid, only alphanumeric characters and underscores are allowed.")
    if tablename.count(".") > 1:
        raise ValueError("Table name is not valid, only one '.' character is allowed.")
    return tablename

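# Usage sketch (illustrative): validating identifiers before building SQL.
#
# >>> check_tablename("my_schema.my_table")   # warns (not upper case), returns upper-cased name
# 'MY_SCHEMA.MY_TABLE'
# >>> check_tablename("BAD-NAME")             # raises ValueError (illegal character)
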
def check_viewname(viewname):
    """
    Convenience function. Just an alternative name to check_tablename for
    checking view names, which have the same prerequisites as table names.
    See the check_tablename documentation.
    """
    return check_tablename(viewname)

def check_case(name):
    """
    Check if the name given as a parameter is in upper case and convert it
    to upper case.
    """
    if name != name.upper():
        warnings.warn("Mixed case names are not supported in database object names.",
                      UserWarning)
    return name.upper()

def _convert_dtypes(idadf, data):
    """
    DEPRECATED - CURRENTLY NOT IN USE.
    Was used for formatting dataframe types.

    Convert datatypes in a dataframe to float or to int according to the
    corresponding type in the database. This function only works if the
    dataframe that is given as a parameter has the same columns as the
    current IdaDataFrame.
    """
    # Note: Here I made the choice to convert every numeric attribute to
    # float. The reason is that converting to int for an attribute that has
    # missing values leads to an error; this is a known pandas gotcha.
    # We could use statistics._get_number_of_nas to know if there are
    # missing values for each attribute, but I chose not to do it, to reduce
    # complexity and improve performance. However, it is not efficient in
    # terms of memory.
    original_dtype = idadf.dtypes
    int_attributes = ['SMALLINT', 'INTEGER', 'BIGINT', 'BOOLEAN']
    float_attributes = ['REAL', 'DOUBLE', 'FLOAT', 'DECIMAL', 'DECFLOAT',
                        'APPROXIMATE', 'NUMERIC']

    for index, dtype in enumerate(original_dtype.values):
        if dtype[0] in int_attributes + float_attributes:
            data[[data.columns[index]]] = data[[data.columns[index]]].astype(float)
        else:
            data[[data.columns[index]]] = data[[data.columns[index]]].astype(object)
    return data

def _reset_attributes(idaobject, attributes):
    """
    Delete an attribute or a list of attributes of an object,
    if the attribute exists.
    """
    if (not hasattr(attributes, "__iter__")) | isinstance(attributes, six.string_types):
        attributes = [attributes]
    for attribute in attributes:
        try:
            delattr(idaobject, attribute)
        except AttributeError:
            pass

def _check_input(idadf, target, features):
    """
    Check if the input is valid.

    Target should be a string that exists as a column in the IdaDataFrame,
    or None. Features should be a string or a list of columns in the
    IdaDataFrame, or None. If features is None, a list with all features
    different from target is returned.
    """
    if target is not None:
        if not isinstance(target, six.string_types):
            raise ValueError("target should be a string")
        if target not in idadf.columns:
            raise ValueError("Unknown column %s" % target)

    if features is not None:
        if isinstance(features, six.string_types):
            if features not in idadf.columns:
                raise ValueError("Unknown column %s" % features)
            features = [features]
        for x in features:
            if x not in idadf.columns:
                raise ValueError("Unknown column %s" % x)
    else:
        if target is not None:
            features = [x for x in idadf.columns if x != target]
        else:
            features = list(idadf.columns)

    return target, features