Source code for ibmdbpy.frame

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

"""
idaDataFrame
---------
An efficient 2D container that looks like a panda's DataFrame and behave the
same, the only difference is that it uses only a reference to a remote database
instead of having the data loaded into memory

Also similar to its R counterpart, data.frame, except providing automatic data
alignment and a host of useful data manipulation methods having to do with the
labeling information
"""

# Ensure Python 2 compatibility
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
from __future__ import absolute_import
from builtins import dict
from builtins import zip
from builtins import str
from builtins import int
from future import standard_library
standard_library.install_aliases()

import sys
import os
from copy import deepcopy
import warnings
from numbers import Number
from collections import OrderedDict

import numpy as np
import pandas as pd
from pandas.core.index import Index

from lazy import lazy
import six

import ibmdbpy
import ibmdbpy.statistics
import ibmdbpy.indexing
import ibmdbpy.aggregation
import ibmdbpy.filtering
import ibmdbpy.utils

from ibmdbpy.utils import timed, chunklist
from ibmdbpy.internals import InternalState
from ibmdbpy.exceptions import IdaDataFrameError
from ibmdbpy.internals import idadf_state

class IdaDataFrame(object):
    """
    An IdaDataFrame object is a reference to a table in a remote instance of
    dashDB/DB2. IDA stands for In-DataBase Analytics. IdaDataFrame copies the
    Pandas interface for DataFrame objects to ensure intuitive interaction for
    end-users.

    Examples
    --------
    >>> idadb = IdaDataBase('DASHDB') # See documentation for IdaDataBase
    >>> ida_iris = IdaDataFrame(idadb, 'IRIS')
    >>> ida_iris.cov()
                  sepal_length  sepal_width  petal_length  petal_width
    sepal_length      0.685694    -0.042434      1.274315     0.516271
    sepal_width      -0.042434     0.189979     -0.329656    -0.121639
    petal_length      1.274315    -0.329656      3.116278     1.295609
    petal_width       0.516271    -0.121639      1.295609     0.581006
    """
    # TODO: Check if everything is ok when selecting AND projecting with loc
    # TODO: BUG: Filtering after selection/projection
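    # Illustrative sketch (not part of the original source): typical creation
    # of an IdaDataFrame with an explicit indexer column, which makes the row
    # order of head()/tail() deterministic. The table name 'IRIS' and the
    # column 'ID' are assumed here for illustration only.
    #
    # >>> idadb = IdaDataBase('DASHDB')
    # >>> ida_iris = IdaDataFrame(idadb, 'IRIS', indexer='ID')
    # >>> ida_iris.head()   # rows now come back sorted by 'ID'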
    def __init__(self, idadb, tablename, indexer = None):
        """
        Constructor for IdaDataFrame objects.

        Parameters
        ----------
        idadb : IdaDataBase
            IdaDataBase instance which contains the connection to be used.
        tablename : str
            Name of the table to be opened in the database.
        indexer : str, optional
            Name of the column that should be used as an index. This is
            optional. However, if no indexer is given, the order of rows
            issued by the head and tail functions is not guaranteed. Also,
            several in-database machine learning algorithms need an indexer
            as a parameter to be executed.

        Attributes
        ----------
        _idadb : IdaDataBase
            IdaDataBase object parent of the current instance.
        tablename : str
            Name of the table self references.
        name : str
            Full name of the table self references, including schema.
        schema : str
            Name of the schema the table belongs to.
        indexer : str
            Name of the column used as an index. "None" if no indexer.
        loc : str
            Indexer that enables the selection and projection of IdaDataFrame
            instances. For more information, see the loc class documentation.
        internal_state : InternalState
            Object used to internally store the state of the IdaDataFrame.
            It also allows several non-destructive manipulation methods.
        type : str
            Type of the IdaDataFrame : "Table", "View", or "Unknown".
        dtypes : DataFrame
            Data type in the database for each column.
        index : pandas.core.index
            Index containing the row names in this table.
        columns : pandas.core.index
            Index containing the column names in this table.
        axes : list
            List containing the columns and index attributes.
        shape : Tuple
            Number of rows and number of columns.

        Notes
        -----
        The attributes "type", "dtypes", "index", "columns", "axes", and
        "shape" are evaluated in a lazy way to avoid an overhead when
        creating an IdaDataFrame. Sometimes the index may be too big to be
        downloaded.

        Examples
        --------
        >>> idadb = IdaDataBase('DASHDB')
        >>> ida_iris = IdaDataFrame(idadb, "IRIS")
        """
        # TODO: Implement equality comparison between two IdaDataFrames
        if not idadb.__class__.__name__ == "IdaDataBase":
            idadb_class = idadb.__class__.__name__
            raise TypeError("Argument 'idadb' is of type %s, expected : IdaDataBase"%idadb_class)
        tablename = ibmdbpy.utils.check_case(tablename)

        #idadb._reset_attributes("cache_show_tables")
        # TODO: Test what kind of error happens when a table that is in use
        # and cached is suddenly deleted
        if idadb.exists_table_or_view(tablename) is False:
            # Try again after refreshing the show table cache in the parent
            # IdaDataBase, because a table could have been created by other
            # means or by another user, and we have to make sure the lookup
            # is done on up-to-date information.
            idadb._reset_attributes("cache_show_tables")
            if idadb.exists_table_or_view(tablename) is False:
                raise NameError("Table %s does not exist in the database %s."
                                %(tablename, idadb.data_source_name))

        self._idadb = idadb
        self._indexer = indexer

        # Initialise indexer object
        self.loc = ibmdbpy.indexing.Loc(self)

        if "." in tablename:
            self.schema = tablename.split('.')[0]
            #self.name = tablename.split('.')[-1]
            self._name = tablename
            self.tablename = tablename.split('.')[-1]
        else:
            self.schema = idadb.current_schema
            self._name = idadb.current_schema + '.' + tablename
            self.tablename = tablename
        # self._name is the original name, this is a "final" variable

        # Push a reference to itself in its parent IdaDataBase
        self._idadb._idadfs.append(self)

        # TODO : self.size

        # A cache for the unique values of each column
        self._unique = dict()

###############################################################################
### Attributes & Metadata computation
###############################################################################
    @lazy
    def internal_state(self):
        """
        InternalState instances manage the state of an IdaDataFrame instance
        and allow several non-destructive data manipulation methods, such as
        the selection, projection, filtering, and aggregation of columns.
        """
        return InternalState(self)
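    # Background sketch (an assumption, not the original 'lazy' package code):
    # the @lazy decorator used in this class behaves roughly like a cached
    # property, i.e. the wrapped method runs once and its result is stored on
    # the instance, so internal_state, dtypes, shape, etc. are only computed
    # on first access. A minimal equivalent:
    #
    #   class lazy(object):
    #       def __init__(self, func):
    #           self.func = func
    #       def __get__(self, instance, owner):
    #           if instance is None:
    #               return self
    #           value = self.func(instance)                    # compute once
    #           instance.__dict__[self.func.__name__] = value  # cache on instance
    #           return value
    #
    # Deleting the cached attribute (as _reset_attributes does) forces a
    # recomputation on the next access.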
    @property
    @idadf_state
    def name(self):
        return self.internal_state.current_state

    @property
    def indexer(self):
        """
        The indexer attribute refers to the name of a column that should be
        used to index the table. This makes sense because dashDB is a
        column-based database, so row IDs do not make sense and are not
        deterministic. As a consequence, the only way to address a particular
        row is to refer to it by its index.

        If no indexer is provided, ibmdbpy still works, but a correct row
        order is not guaranteed as long as the dataset is not sorted. Also,
        note that the indexer column is not taken into account in data mining
        algorithms.
        """
        if hasattr(self, "_indexer"):
            return self._indexer
        else:
            return None

    @indexer.setter
    def indexer(self, value):
        """
        Basic checks for indexer :
            * The column exists in the table.
            * All values are unique.
        """
        if value is None:
            return
        if value not in self.columns:
            raise IdaDataFrameError("'%s' cannot be used as indexer "%value +
                                    "because this is not a column in '%s'"%self._name)
        del self.columns
        #count = self[value].count_distinct() ## TODO: TO FIX, should return directly just a number
        count = self.levels(value)
        if count < self.shape[0]:
            raise IdaDataFrameError("'%s' cannot be used as indexer "%value +
                                    "because it contains non-unique values.")
        self._indexer = value
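    # Illustrative sketch (not part of the original source): what the setter
    # above enforces. Column names are assumed for illustration only.
    #
    # >>> ida_iris.indexer = 'ID'        # accepted if 'ID' values are unique
    # >>> ida_iris.indexer = 'species'   # raises IdaDataFrameError:
    # ...                                # only 3 distinct values for 150 rows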
    @lazy
    def type(self):
        """
        Type of self: 'Table', 'View' or 'Unknown'.

        Returns
        -------
        str
            IdaDataFrame type.

        Examples
        --------
        >>> ida_iris.type
        'Table'
        """
        return self._get_type()
    @lazy
    @idadf_state(force = True)
    def dtypes(self):
        """
        Data type in the database for each column in self.

        Returns
        -------
        DataFrame
            In-database type for each column.

        Examples
        --------
        >>> ida_iris.dtypes
                     TYPENAME
        sepal_length   DOUBLE
        sepal_width    DOUBLE
        petal_length   DOUBLE
        petal_width    DOUBLE
        species       VARCHAR
        """
        return self._get_columns_dtypes()
    @lazy
    @idadf_state  # to deprecate
    def index(self):
        """
        Index containing the row names in self.

        Returns
        -------
        Index

        Examples
        --------
        >>> ida_iris.index
        Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
        ...
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149], dtype='int64', length=150)

        Notes
        -----
        Because indexes in a database can only be numeric, this attribute is
        of limited interest for an IdaDataFrame, but it can still be useful
        sometimes. The function can break if the table is too large; the
        user's approval is requested before downloading an index that has
        more than 10000 values.
        """
        return self._get_index()
    @lazy
    def columns(self):
        """
        Index containing the column names in self.

        Returns
        -------
        Index

        Examples
        --------
        >>> ida_iris.columns
        Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
               'species'], dtype='object')
        """
        if hasattr(self, "internal_state"):
            self.internal_state._create_view()
            cols = self._get_columns()
            self.internal_state._delete_view()
            return cols
        else:
            return self._get_columns()
    @lazy
    @idadf_state  # to deprecate (no index)
    def axes(self):
        """
        List containing the IdaDataFrame.columns and IdaDataFrame.index
        attributes.

        Returns
        -------
        list
            List containing two indexes (the index and columns attributes).

        Examples
        --------
        >>> ida_iris.axes
        [Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
        ...
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149], dtype='int64', length=150),
        Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
               'species'], dtype='object')]
        """
        return [self.index, self.columns]
    @lazy
    @idadf_state
    def shape(self):
        """
        Tuple containing the number of rows and the number of columns.

        Returns
        -------
        tuple

        Examples
        --------
        >>> ida_iris.shape
        (150, 5)
        """
        return self._get_shape()
    @property
    @idadf_state
    def empty(self):
        """
        Boolean that is True if the table is empty (no rows).

        Returns
        -------
        Boolean
        """
        return self.shape[0] == 0
    def __len__(self):
        """
        Number of records.

        Returns
        -------
        int

        Examples
        --------
        >>> len(idadf)
        150
        """
        return self.shape[0]
    def __iter__(self):
        """
        Iterate over columns.
        """
        return iter(self.columns)
    def __getitem__(self, item):
        """
        Enable label-based projection (selection of columns) in
        IdaDataFrames, slice-based selection of rows, and row filtering.
        The syntax is similar to Pandas.

        Examples
        --------
        >>> idadf['col1'] # return an IdaSeries
        >>> idadf[['col1']] # return an IdaDataFrame with one column
        >>> idadf[['col1', 'col2', 'col3']] # return an IdaDataFrame with 3 columns
        >>> idadf[0:9] # select the 10 first rows
        >>> idadf[idadf['col1'] == "test"] # select the rows for which attribute col1 is equal to "test"

        Notes
        -----
        The row order is not guaranteed if no indexer is given and the
        dataset is not sorted.
        """
        if isinstance(item, ibmdbpy.filtering.FilterQuery):
            newidadf = self._clone()
            newidadf.internal_state.update(item)
            newidadf._reset_attributes(["shape"])
            return newidadf
        else:
            if isinstance(item, slice):
                return self.loc[item]
            if not (isinstance(item, six.string_types)|isinstance(item, list)):
                raise KeyError(item)
            if isinstance(item, six.string_types):
                # Case when only one column was selected
                if item not in self.columns:
                    raise KeyError(item)
                newidaseries = self._clone_as_serie(item)

                # Form the new columndict
                for column in list(newidaseries.internal_state.columndict):
                    if column != item:
                        del newidaseries.internal_state.columndict[column]
                newColumndict = newidaseries.internal_state.columndict

                # Erase attributes
                newidaseries._reset_attributes(["columns", "shape", "dtypes"])
                # Set columns and columndict attributes
                newidaseries.internal_state.columns = ["\"%s\""%item]
                newidaseries.internal_state.columndict = newColumndict
                # Update, i.e. appends an entry to internal_state._cumulative
                newidaseries.internal_state.update()

                # Performance improvement
                # avoid, caused wrong dtypes for the result
                # newidaseries.dtypes = self.dtypes.loc[[item]]
                return newidaseries

            # Case of multiple columns
            not_a_column = [x for x in item if x not in self.columns]
            if not_a_column:
                raise KeyError("%s"%not_a_column)

            newidadf = self._clone()

            # Form the new columndict
            newColumndict = OrderedDict()
            for col in item:
                # Column name as key, its definition as value
                newColumndict[col] = self.internal_state.columndict[col]

            # Erase attributes
            newidadf._reset_attributes(["columns", "shape", "dtypes"])
            # Set columns and columndict attributes
            newidadf.internal_state.columns = ["\"%s\""%col for col in item]
            newidadf.internal_state.columndict = newColumndict
            # Update, i.e. appends an entry to internal_state._cumulative
            newidadf.internal_state.update()

            # Performance improvement
            # avoid, caused wrong dtypes for the result
            # newidadf.dtypes = self.dtypes.loc[item]
            return newidadf
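    # Illustrative sketch (not part of the original source): combining
    # filtering and projection. Column names are assumed from the IRIS
    # examples used throughout this module.
    #
    # >>> setosa = ida_iris[ida_iris['species'] == 'setosa']  # FilterQuery -> new IdaDataFrame
    # >>> petals = setosa[['petal_length', 'petal_width']]    # projection on the filtered frame
    # >>> petals.shape
    # (50, 2)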
    def __setitem__(self, key, item):
        """
        Enable the creation and aggregation of columns.

        Examples
        --------
        >>> idadf['new'] = idadf['sepal_length'] * idadf['sepal_width']
        # create a new column as the product of two existing columns
        >>> idadf['sepal_length'] = idadf['sepal_length'] / idadf['sepal_length'].mean()
        # modify an existing column
        """
        if not (isinstance(item, IdaDataFrame)):
            raise TypeError("Modifying columns is supported only using "+
                            "IdaDataFrames.")
        if isinstance(key, six.string_types):
            key = [key]
        if len(key) != len(item.columns):
            raise ValueError("Wrong number of items passed %s, placement implies %s"%(len(item.columns),len(key)))

        # Form the new columndict
        for newname, oldname in zip(key, item.columns):
            self.internal_state.columndict[newname] = item.internal_state.columndict[oldname]
        newColumndict = self.internal_state.columndict

        # Erase attributes
        self._reset_attributes(["columns", "shape", "dtypes"])
        # Set columns and columndict attributes
        self.internal_state.columndict = newColumndict
        self.internal_state.columns = ["\"%s\""%col for col in newColumndict.keys()]
        # Update, i.e. appends an entry to internal_state._cumulative
        self.internal_state.update()

        # Flush the "unique" cache
        for column in key:
            if column in self._unique:
                del self._unique[column]
[docs] def __delitem__(self, item): """ Enable non-destructive deletion of columns using a Pandas style syntax. This happens inplace, which means that the current IdaDataFrame is modified. Examples -------- >>> idadf = IdaDataFrame(idadb, "IRIS") >>> idadf.columns Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'], dtype='object') >>> del idadf['sepal_length'] >>> idadf.columns Index(['sepal_width', 'petal_length', 'petal_width', 'species'], dtype='object') """ if not (isinstance(item,six.string_types)): raise TypeError if item not in self.columns: raise KeyError(item) # Flush the "unique" cache if item in self._unique: del self._unique[item] self._idadb.delete_column(self, item, destructive = False) return
    def __enter__(self):
        """
        Allow the object to be used with a "with" statement.
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Allow the object to be used with a "with" statement.
        Make sure that all views related to the IdaDataFrame are deleted
        when the object goes out of scope.
        """
        while self.internal_state.viewstack:
            try :
                view = self.internal_state.viewstack.pop()
                # Just a last check to make sure not to drop user's db
                if view != self.tablename:
                    drop = "DROP VIEW \"%s\"" %view
                    self._prepare_and_execute(drop, autocommit = True)
            except:
                pass

    # We decided not to allow column access like idadf.columnname for now.
    # We could decide to allow it, but for this we may have to switch all
    # existing attributes to private ("_" as first character) so as to avoid
    # conflicts between attributes and columns, because for example
    # IdaDataFrame has an attribute "name" -> if a column is labelled "name"
    # this would make it unavailable. Pandas also manages this issue poorly,
    # for example with the DataFrame attribute "size".
    # May be implemented in the future.
    #def __getattr__(self, name):
    #    """
    #    After regular attribute access, try to look up the name.
    #    This allows simpler access to columns for interactive use.
    #    """
    #    # Note: obj.x will always call obj.__getattribute__('x') prior to
    #    # calling obj.__getattr__('x').
    #    if hasattr(self, name):
    #        return object.__getattribute__(self, name)
    #    else:
    #        if name not in self.__dict__["columns"]:
    #            return self[name]
    #        return object.__getattribute__(self, name)
    #        raise AttributeError("'%s' object has no attribute '%s'" %
    #                             (type(self).__name__, name))

    #def __setattr__(self, name, value):
    #    if name not in self.__dict__["columns"]:
    #        self.__dict__[name] = value
    #    else:
    #        raise ValueError("It is not allowed to set the value of a column in an IdaDataFrame.")
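    # Illustrative sketch (not part of the original source): using the
    # context manager protocol defined above so that temporary views are
    # cleaned up automatically.
    #
    # >>> with IdaDataFrame(idadb, 'IRIS') as ida:
    # ...     print(ida.head())
    # ... # on exit, any views stacked on internal_state are dropped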
[docs] def __lt__(self, value): """ ibmdbpy.filtering.FilterQuery object when comparing self using "<". """ return ibmdbpy.filtering.FilterQuery(self.columns, self._name, "lt", value)
[docs] def __le__(self, value): """ ibmdbpy.filtering.FilterQuery object when comparing self using "<=". """ return ibmdbpy.filtering.FilterQuery(self.columns, self._name, "le", value)
[docs] def __eq__(self, value): """ ibmdbpy.filtering.FilterQuery object when comparing self using "==". """ return ibmdbpy.filtering.FilterQuery(self.columns, self._name, "eq", value)
[docs] def __ne__(self, value): """ ibmdbpy.filtering.FilterQuery object when comparing self using "!=". """ return ibmdbpy.filtering.FilterQuery(self.columns, self._name, "ne", value)
[docs] def __ge__(self, value): """ ibmdbpy.filtering.FilterQuery object when comparing self using ">=". """ return ibmdbpy.filtering.FilterQuery(self.columns, self._name, "ge", value)
[docs] def __gt__(self, value): """ ibmdbpy.filtering.FilterQuery object when comparing self using ">". """ return ibmdbpy.filtering.FilterQuery(self.columns, self._name, "gt", value) ################ Arithmetic operations
[docs] def __add__(self, other): """ Perform an addition between self and another IdaDataFrame or number. Examples -------- >>> ida = idadf['sepal_length'] + 3 Notes ----- Arithmetic operations only make sense if self contains only numeric columns. """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(self, "add", other)
    def __radd__(self, other):
        """
        Enable the reflexivity of the addition operation.

        Examples
        --------
        >>> ida = idadf['sepal_length'] + 3
        >>> ida = 3 + idadf['sepal_length']
        """
        self._combine_check(other)
        return ibmdbpy.aggregation.aggregate_idadf(other, "add", self, swap = True)
[docs] def __div__(self, other): """ Perform a division between self and another IdaDataFrame or number. When __future__.division is not in effect. Examples -------- >>> ida = idadf['sepal_length'] / 3 Notes ----- Arithmetic operations only make sense if self contains only numeric columns. """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(self, "div", other)
[docs] def __rdiv__(self, other): """ Enable the reflexivity of the division operation. When __future__.division is not in effect. Examples -------- >>> ida = idadf['sepal_length'] / 3 >>> ida = 3 / idadf['sepal_length'] """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(other, "div", self, swap = True)
[docs] def __truediv__(self, other): """ Perform a division between self and another IdaDataFrame or number. When __future__.division is in effect. Examples -------- >>> ida = idadf['sepal_length'] / 3 Notes ----- Arithmetic operations only make sense if self contains only numeric columns. """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(self, "div", other)
[docs] def __rtruediv__(self, other): """ Enable the reflexivity of the division operation. When __future__.division is in effect. Examples -------- >>> ida = idadf['sepal_length'] / 3 >>> ida = 3 / idadf['sepal_length'] """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(other, "div", self, swap = True)
[docs] def __floordiv__(self,other): """ Perform an integer division between self and another IdaDataFrame or number. Examples -------- >>> ida = idadf['sepal_length'] // 3 Notes ----- Arithmetic operations only make sense if self contains only numeric columns. """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(self, "floordiv", other)
[docs] def __rfloordiv__(self, other): """ Enable the reflexivity of the integer division operation. Examples -------- >>> ida = idadf['sepal_length'] // 3 >>> ida = 3 // idadf['sepal_length'] """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(other, "floordiv", self, swap = True)
[docs] def __mod__(self,other): """ Perform a modulo operation between self and another IdaDataFrame or number. Examples -------- >>> ida = idadf['sepal_length'] % 3 Notes ----- Arithmetic operations make sense if self has only numeric columns. """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(self, "mod", other)
[docs] def __rmod__(self, other): """ Enable the reflexivity of the modulo operation. Examples -------- >>> ida = idadf['sepal_length'] % 3 >>> ida = 3 % idadf['sepal_length'] """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(other, "mod", self, swap = True)
[docs] def __mul__(self,other): """ Perform a multiplication between self and another IdaDataFrame or number. Examples -------- >>> ida = idadf['sepal_length'] * 3 Notes ----- Arithmetic operations only make sense if self contains only numeric columns. """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(self, "mul", other)
    def __rmul__(self, other):
        """
        Enable the reflexivity of the multiplication operation.

        Examples
        --------
        >>> ida = idadf['sepal_length'] * 3
        >>> ida = 3 * idadf['sepal_length']
        """
        self._combine_check(other)
        return ibmdbpy.aggregation.aggregate_idadf(other, "mul", self, swap = True)
    def __neg__(self):
        """
        Calculate the negative of all columns in self.

        Notes
        -----
        Arithmetic operations only make sense if self contains only numeric
        columns.
        """
        other = None
        return ibmdbpy.aggregation.aggregate_idadf(self, "neg", other)

    def __pos__(self):
        """
        Calculate the unary positive of self. No operation required.
        """
        return self
[docs] def __pow__(self,other): """ Perform a power operation between self and another IdaDataFrame or number. Examples -------- >>> ida = idadf['sepal_length'] ** 3 Notes ----- Arithmetic operations only make sense if self contains only numeric columns. """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(self, "pow", other)
[docs] def __rpow__(self, other): """ Enable the reflexivity of the power operation. Examples -------- >>> ida = idadf['sepal_length'] ** 3 >>> ida = 3 ** idadf['sepal_length'] """ self._combine_check(other) return ibmdbpy.aggregation.aggregate_idadf(other, "pow", self, swap = True)
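    # Illustrative sketch (not part of the original source): the arithmetic
    # operators above compose into a single SQL expression that is evaluated
    # in-database. Column names are assumed from the IRIS examples.
    #
    # >>> ratio = idadf['sepal_length'] / idadf['sepal_width']  # IdaSeries
    # >>> scaled = (idadf['petal_length'] - 1) ** 2             # nested expression
    # >>> scaled.mean()                                         # computed remotely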
    def __sub__(self, other):
        """
        Perform a subtraction between self and another IdaDataFrame or number.

        Examples
        --------
        >>> ida = idadf['sepal_length'] - 3

        Notes
        -----
        Arithmetic operations only make sense if self contains only numeric
        columns.
        """
        self._combine_check(other)
        return ibmdbpy.aggregation.aggregate_idadf(self, "sub", other)

    def __rsub__(self, other):
        """
        Enable the reflexivity of the subtraction operation.

        Examples
        --------
        >>> ida = idadf['sepal_length'] - 3
        >>> ida = 3 - idadf['sepal_length']
        """
        self._combine_check(other)
        return ibmdbpy.aggregation.aggregate_idadf(other, "sub", self, swap = True)

    #def __truediv__(self, value):
    #    pass

    #def __concat__(self, value):
    #    pass

    #def __contains__(self, value):
    #    pass

    # TODO: Do the inplace versions (with "i" in front)

###############################################################################
### Database Features
###############################################################################

    ##############################
    ## Delegation from IdaDataBase
    ##############################
[docs] def exists(self): """ Convenience function delegated from IdaDataBase. Check if the data still exists in the database. """ return self._idadb.exists_table_or_view(self._name)
[docs] def is_view(self): """ Convenience function delegated from IdaDataBase. Check if the IdaDataFrame corresponds to a view in the database. """ if self.type == 'View': return True else: return False
[docs] def is_table(self): """ Convenience function delegated from IdaDataBase. Check if the IdaDataFrame corresponds to a table in the database. """ if self.type == 'Table': return True else: return False
    def get_primary_key(self):
        # TODO: What happens if the primary key is composed of several columns?
        """
        Get the name of the primary key of self, if there is one. Otherwise,
        this function returns False.

        This function may be deprecated in future versions because it is not
        very useful.
        """
        name = self.internal_state.current_state
        pk = self.ida_query("SELECT NAME FROM SYSIBM.SYSCOLUMNS WHERE TBNAME = '" +
                            name + "' AND KEYSEQ > 0 ORDER BY KEYSEQ ASC",
                            first_row_only = True)
        if pk:
            return pk[0]
        else:
            return False

    # Should we maybe allow this only in IdaDataBase object ?
    #@timed
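    # Illustrative sketch (an assumption, not part of the original source):
    # the TODO above could be addressed by fetching every KEYSEQ entry
    # instead of only the first row. The helper name is hypothetical.
    #
    #   def get_primary_key_columns(self):
    #       """Hypothetical helper returning all primary key columns in order."""
    #       name = self.internal_state.current_state
    #       rows = self.ida_query("SELECT NAME FROM SYSIBM.SYSCOLUMNS "
    #                             "WHERE TBNAME = '" + name + "' "
    #                             "AND KEYSEQ > 0 ORDER BY KEYSEQ ASC")
    #       return list(rows[rows.columns[0]]) if rows.shape[0] else []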
[docs] def ida_query(self, query, silent = False, first_row_only = False, autocommit = False): """ Convenience function delegated from IdaDataBase. Prepare, execute and format the result of a query in a data frame or in a tuple. See the IdaDataBase.ida_query documentation. """ return self._idadb.ida_query(query, silent, first_row_only, autocommit)
[docs] def ida_scalar_query(self, query, silent = False, autocommit = False): """ Convenience function delegated from IdaDataBase. Prepare and execute a query and return only the first element as a string. See the IdaDataBase.ida_scalar_query documentation. """ return self._idadb.ida_scalar_query(query, silent, autocommit) ############################################################################### ### Data Exploration ###############################################################################
    def print(self):
        print(self.internal_state.get_state())

    @idadf_state
    def head(self, nrow=5, sort=True):
        """
        Return the first n rows of the instance; n is set to 5 by default.

        Parameters
        ----------
        nrow : int > 0
            Number of rows to be included in the result.
        sort : default is True
            If set to True and no indexer is set, the data will be sorted by
            the first numeric column or, if no numeric column is available,
            by the first column of the dataframe. If set to False and no
            indexer is set, the row order is not guaranteed and can vary
            with each execution. For big tables this option might save query
            processing time.

        Returns
        -------
        DataFrame or Series
            The index is the corresponding row numbers and the columns are
            all columns of self. If the IdaDataFrame has only one column, it
            returns a Series.

        Examples
        --------
        >>> ida_iris.head()
           sepal_length  sepal_width  petal_length  petal_width species
        0           5.1          3.5           1.4          0.2  setosa
        1           4.9          3.0           1.4          0.2  setosa
        2           4.7          3.2           1.3          0.2  setosa
        3           4.6          3.1           1.5          0.2  setosa
        4           5.0          3.6           1.4          0.2  setosa
        """
        if (nrow < 1) | (not isinstance(nrow, int)):
            raise ValueError("Parameter nrow should be an int greater than 0.")
        else:
            name = self.internal_state.current_state
            order = ''
            if not " ORDER BY " in self.internal_state.get_state():
                if (self.indexer is not None)&(self.indexer in self.columns):
                    order = " ORDER BY \"" + str(self.indexer) + "\" ASC"
                elif self.indexer is None:
                    if sort:
                        column = self.columns[0]
                        if self._get_numerical_columns():
                            column = self._get_numerical_columns()[0]
                        order = " ORDER BY \"" + column + "\" ASC"
                    else:
                        order = ''
            data = self.ida_query("SELECT * FROM %s%s FETCH FIRST %s ROWS ONLY"%(name, order, nrow))
            if data.shape[0] != 0:
                # otherwise column sort order is reverted
                if not 'SELECT ' in name:
                    columns = self.columns
                    data.columns = columns
            # data = ibmdbpy.utils._convert_dtypes(self, data)
            if isinstance(self, ibmdbpy.IdaSeries):
                data = pd.Series(data)
            return data
    # TODO : There is a warning in anaconda when there are missing values -> why ?
@idadf_state
    def tail(self, nrow=5, sort=True):
        """
        Return the last n rows of the instance; n is set to 5 by default.

        Parameters
        ----------
        nrow : int > 0
            The number of rows to be included in the result.
        sort : default is True
            If set to True and no indexer is set, the data will be sorted by
            the first numeric column or, if no numeric column is available,
            by the first column of the dataframe. If set to False and no
            indexer is set, the row order is not guaranteed and can vary
            with each execution. For big tables this option might save query
            processing time.

        Returns
        -------
        DataFrame
            The index is the corresponding row numbers and the columns are
            all columns of self.

        Examples
        --------
        >>> ida_iris.tail()
             sepal_length  sepal_width  petal_length  petal_width    species
        145           6.7          3.0           5.2          2.3  virginica
        146           6.3          2.5           5.0          1.9  virginica
        147           6.5          3.0           5.2          2.0  virginica
        148           6.2          3.4           5.4          2.3  virginica
        149           5.9          3.0           5.1          1.8  virginica
        """
        if (nrow < 1) | (not isinstance(nrow, int)):
            raise ValueError("Parameter nrow should be an int greater than 0.")
        else:
            column_string = '\"' + '\", \"'.join(self.columns) + '\"'
            name = self.internal_state.current_state
            if " ORDER BY " in self.internal_state.get_state():
                query = "SELECT * FROM %s FETCH FIRST %s ROWS ONLY"%(name, nrow)
                data = self.ida_query(query)
                data.columns = self.columns
                data.set_index(data[self.indexer], inplace=True)
            else:
                order = ''
                if self.indexer:
                    sortkey = str(self.indexer)
                    order = "ORDER BY \"" + sortkey + "\""
                else:
                    if sort:
                        sortkey = self.columns[0]
                        if self._get_numerical_columns():
                            sortkey = self._get_numerical_columns()[0]
                        order = "ORDER BY \"" + sortkey + "\""
                query = ("SELECT * FROM (SELECT * FROM (SELECT " + column_string +
                         ", ((ROW_NUMBER() OVER(" + order + "))-1) AS ROWNUMBER FROM " +
                         name + ") ORDER BY ROWNUMBER DESC FETCH FIRST " + str(nrow) +
                         " ROWS ONLY) ORDER BY ROWNUMBER ASC")
                data = self.ida_query(query)
                data.set_index(data.columns[-1], inplace=True)  # Set the index
                data.columns = self.columns
                del data.index.name
            # data = ibmdbpy.utils._convert_dtypes(self, data)
            if isinstance(self, ibmdbpy.IdaSeries):
                data = pd.Series(data[data.columns[0]])
            return data
@idadf_state
    def pivot_table(self, values=None, columns=None, max_entries=1000,
                    sort=None, factor_threshold=20, interactive=False,
                    aggfunc='count'):
        """
        Compute an aggregation function over all rows of each column that is
        specified as a value on the dataset. The result is grouped by the
        columns defined in "columns".

        Parameters
        ----------
        values : str or list of str, optional
            List of columns on which "aggfunc" is computed.
        columns : str or list of str, optional
            List of columns that is used as an index and by which the
            dataframe is grouped.
        max_entries : int, default: 1000
            The maximum number of cells to be part of the output.
        sort : str, optional
            Admissible values are: "alpha" and "factors".
                * If "alpha", the index of the output is sorted according to
                  the alphabetical order.
                * If "factors", the index of the output is sorted according
                  to an increasing number of distinct values.
            By default, the index is sorted in the same order that is
            specified in the "columns" argument.
        factor_threshold : int, default: 20
            Number of distinct values above which a categorical column should
            not be considered categorical anymore, and under which a
            numerical column should not be considered numerical anymore.
        interactive : bool
            If True, the user is asked if he wants to display the output,
            given its size.
        aggfunc : str
            Aggregation function to be computed on each column specified in
            the argument "values". Admissible values are: "count", "sum",
            "avg". This entry is not case-sensitive.

        Returns
        -------
        Pandas Series with Multi-index (columns)

        Examples
        --------
        >>> val = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        >>> ida_iris.pivot_table(values= val, aggfunc="avg")
                      species
        sepal_length  setosa        5.006
                      versicolor    5.936
                      virginica     6.588
        sepal_width   setosa        3.428
                      versicolor    2.770
                      virginica     2.974
        petal_length  setosa        1.462
                      versicolor    4.260
                      virginica     5.552
        petal_width   setosa        0.246
                      versicolor    1.326
                      virginica     2.026
        dtype: float64
        """
        # TODO : Support index
        from ibmdbpy.statistics import pivot_table
        return pivot_table(idadf=self, values=values, columns=columns,
                           max_entries=max_entries, sort=sort,
                           factor_threshold=factor_threshold,
                           interactive=interactive, aggfunc=aggfunc)
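    # Illustrative sketch (not part of the original source): grouping by a
    # categorical column explicitly; arguments follow the docstring above.
    #
    # >>> ida_iris.pivot_table(values=['sepal_length'],
    # ...                      columns=['species'], aggfunc='avg')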
    @idadf_state
    def groupby(self, by):
        # TODO: create an IdaDataFrame groupby
        raise NotImplementedError()

    @idadf_state
    def merge(self):
        raise NotImplementedError()

    @idadf_state
    def concat(self):
        raise NotImplementedError()

    @idadf_state
    def join(self):
        raise NotImplementedError()

    # TODO : implement NULL FIRST option
    @idadf_state
    def sort(self, columns=None, axis=0, ascending=True, inplace=False):
        """
        Sort the IdaDataFrame row wise or column wise.

        Parameters
        ----------
        columns : str or list of str
            Columns that should be used to sort the rows in the IdaDataFrame.
            If columns is set to None and axis to 0, then the IdaDataFrame
            columns are sorted in lexicographical order.
        axis : int (0/1)
            Axis that is sorted. 0 for sorting row wise, 1 for sorting
            column wise.
        ascending : bool, default: True
            Sorting order, True : ascending, False : descending.
        inplace : bool, default: False
            Whether the current object is modified or a modified copy is
            created. If False, the function creates a modified copy of the
            current dataframe. If True, the function modifies the current
            dataframe.

        Raises
        ------
        ValueError
            * When sorting by column (columns not None), the axis value must
              be 0 (rows).
            * A column does not belong to self.
            * The axis argument has a value other than 0 or 1.

        Notes
        -----
        If columns is set to None and axis to 0, this undoes all sorting of
        the IdaDataFrame and returns the original sorting in the dashDB/DB2
        database. No actual changes are made in dashDB/DB2; only the querying
        changes. Everything is registered in an InternalState object. Changes
        can be observed by using the head and tail functions.
        """
        if isinstance(columns, six.string_types):
            columns = [columns]

        # Sanity check
        if columns:
            if axis != 0:
                raise ValueError("When sorting by column, axis must be 0 (rows)")
            for column in columns:
                if column not in self.columns:
                    raise ValueError("Column "+column+" is not in "+self._name)
        if axis not in [0,1]:
            raise ValueError("Allowed values for axis is 0 (rows) or 1 (columns)")

        if inplace:
            idadf = self
        else:
            idadf = self._clone()

        if axis:
            # Sort the columns in ascending or descending lexicographic order
            idadf.internal_state.columndict = OrderedDict(
                sorted(idadf.internal_state.columndict.items(),
                       reverse = not ascending))
            idadf.internal_state.update()
        else:
            if columns:
                idadf.internal_state.set_order(columns, ascending)
                idadf.internal_state.update()
            else:
                if isinstance(idadf, ibmdbpy.IdaSeries):
                    idadf.internal_state.set_order([idadf.column], ascending)
                    idadf.internal_state.update()
                else:
                    idadf.internal_state.reset_order()

        if not inplace:
            return idadf
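    # Illustrative sketch (not part of the original source): sorting is
    # non-destructive and only changes the generated query.
    #
    # >>> by_petal = ida_iris.sort(['petal_length'], ascending=False)
    # >>> by_petal.head()        # longest petals first
    # >>> by_petal.sort()        # columns=None, axis=0 : undo the sorting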
    @idadf_state
    def levels(self, columns = None):
        # TODO: Test, doc, name?
        """
        Return the numbers of distinct values
        """
        if columns is not None:
            if isinstance(columns, six.string_types):
                columns = [columns]
            message = ''
            for column in columns:
                if column not in self.columns:
                    message += "Column %s does not belong to current IdaDataFrame. \n"%column
            if message:
                raise ValueError(message)
        else:
            columns = self.columns

        agglist = []
        for column in columns:
            agglist.append("COUNT(DISTINCT \"%s\")"%column)
        aggstr = ",".join(agglist)

        query = "SELECT %s FROM %s"%(aggstr, self.name)
        levels_tuple = self.ida_query(query, first_row_only = True)

        if len(levels_tuple) == 1:
            return levels_tuple[0]
        result = pd.Series(levels_tuple)
        result.index = columns
        return result

    #@timed
    @idadf_state
    def count_groupby(self, columns = None, count_only = False, having = None):
        """
        Count the occurrence of the values of a column or group of columns
        """
        # TODO: Document, test
        if columns is not None:
            if isinstance(columns, six.string_types):
                columns = [columns]
            message = ''
            for column in columns:
                if column not in self.columns:
                    message += "Column %s does not belong to current IdaDataFrame. \n"%column
            if message:
                raise ValueError(message)
        else:
            columns = list(self.columns)

        if having:
            if not isinstance(having, Number):
                raise TypeError("having argument should be a number")

        select_columnstr = "\"" + "\",\"".join(columns) + "\", COUNT(*)"
        if count_only:
            select_columnstr = "COUNT(*)"

        groupby_columnstr = "\"" + "\",\"".join(columns) + "\""
        if having:
            groupby_columnstr = groupby_columnstr + " HAVING count >= %s"%having

        data = self.ida_query("SELECT %s AS count FROM %s GROUP BY %s"%(select_columnstr,self.name,groupby_columnstr))
        data.columns = columns + ["count"]
        return data

    def mean_freq_of_instance(self, columns = None):
        """
        Return the average occurrence of the values of a column or group of
        columns
        """
        # TODO: Document, test
        if columns is not None:
            if isinstance(columns, six.string_types):
                columns = [columns]
            message = ''
            for column in columns:
                if column not in self.columns:
                    message += "Column %s does not belong to current IdaDataFrame. \n"%column
            if message:
                raise ValueError(message)
        else:
            columns = list(self.columns)

        select_columnstr = "COUNT(*)"
        groupby_columnstr = "\"" + "\",\"".join(columns) + "\""

        subquery = "SELECT %s AS count FROM %s GROUP BY %s"%(select_columnstr,self.name,groupby_columnstr)
        data = self.ida_scalar_query("SELECT AVG(DISTINCT count) FROM (%s)"%subquery)
        return int(data)

    # @timed
    @idadf_state
    def unique(self, column):
        """
        Return the unique values of a column
        """
        # TODO: Document, test
        if column in self._unique:
            return self._unique[column]
        #name = self.internal_state.current_state
        if not isinstance(column, six.string_types):
            raise TypeError("column argument not of string type")
        if column not in self.columns:
            # idadf.name somewhat false in case of modification
            raise ValueError("Undefined column \"%s\" in table %s"%(column, self._name))
        result = self.ida_query("SELECT DISTINCT \"%s\" FROM %s"%(column, self.name))
        self._unique[column] = result
        return result

    # TODO: to implement
    @timed
    @idadf_state
    def info(self, buf=None):
        """Some information about current IdaDataFrame.
        NOTIMPLEMENTED"""
        # There is a lot more
        from pandas.core.format import _put_lines
        if buf is None: # pragma: no cover
            buf = sys.stdout
        lines = []
        lines.append(str(type(self)))
        lines.append(self.index.summary())
        if len(self.columns) == 0:
            lines.append('Empty %s' % type(self).__name__)
        _put_lines(buf, lines)
        return

###############################################################################
### Descriptive statistics
###############################################################################

    # TODO: to implement for categorical attributes
    @timed
    @idadf_state
    def describe(self, percentiles=[0.25, 0.50, 0.75]):
        """
        A basic statistical summary about the current IdaDataFrame. If at
        least one numerical column exists, the summary includes:

            * The count of non-missing values for each numerical column.
            * The mean for each numerical column.
            * The standard deviation for each numerical column.
            * The minimum and maximum for each numerical column.
            * A list of percentiles set by the user (default : the quartiles).

        Parameters
        ----------
        idadf : IdaDataFrame
        percentiles : float or list of floats, default: [0.25, 0.50, 0.75]
            Percentiles to be computed on numerical columns. All values in
            percentiles must be > 0 and < 1.

        Returns
        -------
        summary: DataFrame, where
            * Index is the name of the computed values.
            * Columns are either numerical or categorical columns of self.
        """
        from ibmdbpy.statistics import describe
        return describe(idadf=self, percentiles=percentiles)
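    # Illustrative sketch (not part of the original source): requesting
    # deciles instead of the default quartiles.
    #
    # >>> ida_iris.describe(percentiles=[0.1, 0.5, 0.9])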
    @timed
    @idadf_state
[docs] def cov(self): """ Compute the covariance matrix, composed of covariance coefficients between all pairs of columns in self. Returns ------- covariance matrix: DataFrame The axes are the columns of self and the values are the covariance coefficients. """ from ibmdbpy.statistics import cov return cov(idadf=self)
    @timed
    @idadf_state
[docs] def corr(self, method="pearson", features=None, ignore_indexer=True): """ Compute the correlation matrix, composed of correlation coefficients between all pairs of columns in self. Parameters ---------- method : str, default: pearson Method to be used to compute the correlation. By default, compute the pearson correlation coefficient. The Spearman rank correlation is also available. Admissible values are: "pearson", "spearman". Returns ------- correlation matrix: DataFrame The axes are the columns of self and the values are the correlation coefficients. Notes ----- For the Spearman rank correlation, the ordinal rank of columns is computed. For performance reasons this is easier to compute than the fractional rank traditionally computed for the Spearman rank correlation method. This strategy has the property that the sum of the ranking numbers is the same as under ordinal ranking. We then apply the pearson correlation coefficient method to these ranks. """ from ibmdbpy.statistics import corr return corr(idadf=self, features=features, ignore_indexer=ignore_indexer) # TODO: to implement
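    # Illustrative sketch (not part of the original source): the Spearman
    # rank correlation described in the Notes of corr() above.
    #
    # >>> ida_iris.corr(method="spearman")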
    @timed
    @idadf_state
    def corrwith(self, other):
        """
        Compute the correlation matrix, composed of correlation coefficients
        between the columns of self and the columns of another IdaDataFrame.
        NOTIMPLEMENTED

        Parameters
        ----------
        other : DataFrame

        Returns
        -------
        correlation matrix: DataFrame
            The columns are the columns of self and the index the columns of
            other. The values are the correlation coefficients.
        """
        raise NotImplementedError("TODO")

    # TODO: to implement
    @timed
    @idadf_state
    def mode(self):
        """
        Compute the most common value for each non-numeric column of self.
        NOTIMPLEMENTED

        Returns
        -------
        mode: Series
        """
        raise NotImplementedError("TODO")

    @timed
    @idadf_state
    def quantile(self, q=0.5):
        """
        Compute row wise quantiles for each numeric column.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            0 <= q <= 1, the quantile(s) to compute.

        Returns
        -------
        quantiles: Series or DataFrame
            If q is an array, the function returns a DataFrame in which the
            index is q. The columns are the columns of self, and the values
            are the quantiles. If q is a float, a Series is returned where
            the index is the columns of self and the values are the
            quantiles.
        """
        from ibmdbpy.statistics import quantile
        return quantile(idadf=self, q=q)
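    # Illustrative sketch (not part of the original source): passing a list
    # of quantiles returns a DataFrame indexed by q, as documented above.
    #
    # >>> ida_iris.quantile([0.25, 0.5, 0.75])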
    @timed
    @idadf_state
    def mad(self):
        """
        Compute the mean absolute deviation for all numeric columns of self.

        Returns
        -------
        mad: Series
            The index consists of the columns of self and the values are the
            mean absolute deviation.
        """
        from ibmdbpy.statistics import mad
        return mad(idadf=self)
    @timed
    @idadf_state
    def min(self):
        """
        Compute the minimum value for all numeric columns of self.

        Returns
        -------
        min: Series
            The index consists of the columns of self and the values are the
            minimum.
        """
        from ibmdbpy.statistics import ida_min
        return ida_min(idadf=self)
    @timed
    @idadf_state
    def max(self):
        """
        Compute the maximum value for all numeric columns of self.

        Returns
        -------
        max: Series
            The index consists of the columns of self and the values are the
            maximum.
        """
        from ibmdbpy.statistics import ida_max
        return ida_max(idadf=self)
    @timed
    @idadf_state
    def count(self):
        """
        Compute the count of non-missing values for all columns of self.

        Returns
        -------
        count: Series
            The index consists of the columns of self and the values are the
            number of non-missing values.
        """
        from ibmdbpy.statistics import count
        return count(idadf=self)
    @timed
    @idadf_state
    def count_distinct(self):
        # deprecated, use levels instead
        """
        Compute the count of distinct values for all numeric columns of self.

        Returns
        -------
        distinct count: Series
            The index consists of the columns of self and the values are the
            number of distinct values.
        """
        from ibmdbpy.statistics import count_distinct
        return count_distinct(idadf=self)
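    # Illustrative sketch (not part of the original source): levels() is the
    # suggested replacement for the deprecated count_distinct().
    #
    # >>> ida_iris.levels('species')
    # 3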
    @timed
    @idadf_state
    def std(self):
        """
        Compute the standard deviation for all numeric columns of self.

        Returns
        -------
        std: Series
            The index consists of the columns of self and the values are the
            standard deviation.
        """
        from ibmdbpy.statistics import std
        return std(idadf=self)
    @timed
    @idadf_state
    def within_class_var(self, target, features = None, ignore_indexer=True):
        if features is None:
            numerical_columns = self._get_numerical_columns()
            features = [x for x in numerical_columns if x != target]
        else:
            if isinstance(features, six.string_types):
                features = [features]
        if ignore_indexer is True:
            if self.indexer:
                if self.indexer in features:
                    features.remove(self.indexer)

        result = pd.Series()
        #C = self.levels(target)
        N = len(self)

        if len(features) < 5:
            avglist = ["AVG(\"%s\") as \"average%s\""%(feature, index)
                       for index, feature in enumerate(features)]
            sumlist = ["SUM(CAST(POWER(\"%s\" - \"average%s\", 2) as DOUBLE))"%(feature, index)
                       for index, feature in enumerate(features)]
            avgstr = ", ".join(avglist)
            sumstr = ", ".join(sumlist)
            subquery = "(SELECT \"%s\", %s FROM %s GROUP BY \"%s\") AS B"%(target, avgstr, self.name, target)
            condition = "ON A.\"%s\" = B.\"%s\""%(target, target)
            groupby = "GROUP BY A.\"%s\""%target
            query = "SELECT %s FROM %s AS A INNER JOIN %s %s %s"%(sumstr, self.name, subquery, condition, groupby)
            classvar = self.ida_query(query)
            if len(features) == 1:
                classvar = pd.DataFrame(classvar)
            C = len(classvar)
            if N == C:
                N += 1
            classvar.columns = pd.Index(features)
            result = pd.Series()
            for attr in classvar:
                result[attr] = classvar[attr].sum()/(N - C)
        else:
            chunkgen = chunklist(features, 5)
            result = pd.Series()
            for chunk in chunkgen:
                avglist = ["AVG(\"%s\") as \"average%s\""%(feature, index)
                           for index, feature in enumerate(chunk)]
                sumlist = ["SUM(CAST(POWER(\"%s\" - \"average%s\", 2) as DOUBLE))"%(feature, index)
                           for index, feature in enumerate(chunk)]
                avgstr = ", ".join(avglist)
                sumstr = ", ".join(sumlist)
                subquery = "(SELECT \"%s\", %s FROM %s GROUP BY \"%s\") AS B"%(target, avgstr, self.name, target)
                condition = "ON A.\"%s\" = B.\"%s\""%(target, target)
                groupby = "GROUP BY A.\"%s\""%target
                query = "SELECT %s FROM %s AS A INNER JOIN %s %s %s"%(sumstr, self.name, subquery, condition, groupby)
                classvar = self.ida_query(query)
                if len(chunk) == 1:
                    classvar = pd.DataFrame(classvar)
                C = len(classvar)
                if N == C:
                    N += 1
                classvar.columns = pd.Index(chunk)
                for attr in classvar:
                    result[attr] = classvar[attr].sum()/(N - C)
        return result

    @timed
    @idadf_state
    def within_class_std(self, target, features = None, ignore_indexer= True):
        return np.sqrt(self.within_class_var(target, features, ignore_indexer))

    @timed
    @idadf_state
    def var(self):
        """
        Compute the variance for all numeric columns of self.

        Returns
        -------
        var: Series
            The index consists of the columns of self and the values are the
            variance.
        """
        from ibmdbpy.statistics import var
        return var(idadf=self)
    @timed
    @idadf_state
    def mean(self):
        """
        Compute the mean for each numeric column of self.

        Returns
        -------
        mean: Series
            The index consists of the columns of self and the values are the
            mean.
        """
        from ibmdbpy.statistics import mean
        return mean(idadf=self)
    @timed
    @idadf_state
    def mean_groupby(self, groupby, features = None):
        if features is None:
            features = [x for x in self.columns if x != groupby]
        else:
            if isinstance(features, six.string_types):
                features = [features]

        avglist = ["CAST(AVG(\"%s\") as FLOAT)"%feature for feature in features]
        avgstr = ", ".join(avglist)
        query = "SELECT \"%s\", %s FROM %s GROUP BY \"%s\""%(groupby, avgstr, self.name, groupby)
        result = self.ida_query(query)
        result = result.pivot_table(index = result.columns[0])
        result.columns = pd.Index(features)
        return result

    @timed
    @idadf_state
    def sum(self):
        """
        Compute the sum of values for all numeric columns of self.

        Returns
        -------
        sum: Series
            The index consists of the columns of self and the values are the
            sum.
        """
        # Behaves like having the option "numeric only" set to True
        # TODO: Implement the option
        from ibmdbpy.statistics import ida_sum
        return ida_sum(idadf=self)
    @timed
    @idadf_state
    def median(self):
        """
        Compute the median for all numeric columns of self.

        Returns
        -------
        median: Series
            The index consists of the columns of self and the values are the
            median.
        """
        # Behaves like having the option "numeric only" set to True
        # TODO: Implement the option
        from ibmdbpy.statistics import median
        return median(idadf=self)

    # Maybe not possible
    @idadf_state
    def rank(self):
        """Compute the rank over all entries for all columns of self."""
        raise NotImplementedError("TODO")

    # TODO : cumsum, cummean, cumcount, cummax, cumprod

###############################################################################
### Save current IdaDataFrame to dashDB as a table
###############################################################################

    # TODO: Should this function be in IdaDataBase ?
    # To my mind, it is more intuitive to let it here, but it is "destructive"
    @timed
    @idadf_state
[docs] def save_as(self, tablename, clear_existing = False): """ Save self as a table name in the remote database with the name tablename. This function might erase an existing table if tablename already exists and clear_existing is True. """ # TODO: to test ! if tablename == self.tablename: if clear_existing is False: raise ValueError("Cannot overwrite current IdaDataFrame if "+ " clear_existing option set to False.") message = "Table %s already exists."%(tablename) warnings.warn(message, UserWarning) question = "Are you sure that you want to overwrite %s"%(tablename) display_yes = ibmdbpy.utils.query_yes_no(question) if not display_yes: return tempname = self._idadb._get_valid_tablename() self._prepare_and_execute("CREATE TABLE %s LIKE %s"%(tempname, tablename)) self._prepare_and_execute("INSERT INTO %s (SELECT * FROM %s)"%(tempname, tablename)) try: self._idadb.drop_table(tablename) except: self._idadb.drop_view(tablename) newidadf = IdaDataFrame(self._idadb, tempname) self._idadb.rename(newidadf, tablename) self = newidadf if self._idadb.exists_table_or_view(tablename): if clear_existing: message = "Table %s already exists."%(tablename) warnings.warn(message, UserWarning) question = "Are you sure that you want to overwrite %s"%(tablename) display_yes = ibmdbpy.utils.query_yes_no(question) if not display_yes: return try: self._idadb.drop_table(tablename) except: self._idadb.drop_view(tablename) else: raise NameError(("%s already exists, choose a different name "+ "or use clear_existing option.")%tablename) name = self.internal_state.current_state self._prepare_and_execute("CREATE TABLE %s LIKE %s"%(tablename, name)) self._prepare_and_execute("INSERT INTO %s (SELECT * FROM %s)"%(tablename, name)) # Reset the cache self._idadb._reset_attributes("cache_show_tables") ############################################################################### ### Import as DataFrame ###############################################################################
    @timed
    @idadf_state
    def as_dataframe(self):
        """
        Download and return an in-memory representation of the dataset as a
        Pandas DataFrame.

        Returns
        -------
        DataFrame
            Columns and records are the same as in self.

        Examples
        --------
        >>> iris = ida_iris.as_dataframe()
        >>> iris.head()
           sepal_length  sepal_width  petal_length  petal_width species
        0           5.1          3.5           1.4          0.2  setosa
        1           4.9          3.0           1.4          0.2  setosa
        2           4.7          3.2           1.3          0.2  setosa
        3           4.6          3.1           1.5          0.2  setosa
        4           5.0          3.6           1.4          0.2  setosa
        """
        if os.environ['VERBOSE'] == 'True':
            # We use an empirical estimation.
            # Experimental results :
            #   1M rows, 12 columns    => 550 secs
            #   3.77M rows, 12 columns => 2256 secs
            raw_estimation = ((self.shape[0]*self.shape[1]) / 4000000) * 3
            if raw_estimation < 0.01:
                # There is a small offset, which is negligible for higher
                # measurements
                estimation = str(raw_estimation*60 + 0.717) + " seconds."
            elif raw_estimation > 60:
                estimation = str(raw_estimation/60) + " hours."
            else:
                estimation = "%s minutes."%raw_estimation
            print("Your IdaDataFrame has %s rows and %s columns."
                  % (self.shape[0], self.shape[1]))
            print("Estimated download time : ~ " + estimation)
            if raw_estimation > 30:
                message = ("Estimated download time is greater than" +
                           " %s minutes.")%raw_estimation
                warnings.warn(message, UserWarning)
                question = "Do you want to download the dataset?"
                display_yes = ibmdbpy.utils.query_yes_no(question)
                if not display_yes:
                    return

        data = self.ida_query(self.internal_state.get_state())
        data.columns = self.columns
        data.name = self.tablename
        # Handle datatypes
        # data = ibmdbpy.utils._convert_dtypes(self, data)
        return data

###############################################################################
### Connection Management
###############################################################################

    # Connection handling is a bit different: this is made to allow the user
    # to work on several IdaDataFrames and close them without closing the
    # connection. This is why the function "close" is overwritten and a
    # function "reopen" is provided.
    # Not sure commit and rollback should be here ?
[docs] def commit(self): """ Commit operations in the database. Notes ----- All changes that are made in the database after the last commit, including those in the child IdaDataFrames, are commited. If the environment variable ‘VERBOSE’ is set to True, the commit operations are notified in the console. """ self._idadb.commit()
[docs] def rollback(self): """ Rollback operations in the database. Notes ----- All changes that are made in the database after the last commit, including those in the child IdaDataFrames, are discarded. """ self._idadb.rollback() ############################################################################### ### Private functions ###############################################################################
    def _clone(self):
        """
        Clone the actual object.
        """
        newida = IdaDataFrame(self._idadb, self._name, self.indexer)
        newida.columns = self.columns
        newida.dtypes = self.dtypes  # avoid recomputing it
        # otherwise risk of infinite loop between
        # idadf.columns and internalstate.columndict

        # It is not possible to use deepcopy on an IdaDataFrame object,
        # because the reference to the parent IdaDataBase with the connection
        # object is not pickleable. As a consequence, we create a new
        # IdaDataFrame and copy all the relevant attributes.
        newida.internal_state.name = deepcopy(self.internal_state.name)
        newida.internal_state.ascending = deepcopy(self.internal_state.ascending)
        #newida.internal_state.views = deepcopy(self.internal_state.views)
        newida.internal_state._views = deepcopy(self.internal_state._views)
        newida.internal_state._cumulative = deepcopy(self.internal_state._cumulative)
        newida.internal_state.order = deepcopy(self.internal_state.order)
        newida.internal_state.columndict = deepcopy(self.internal_state.columndict)
        return newida
[docs] def _clone_as_serie(self, column): """ Clone the actual object as an IdaSeries and select one of its columns. """ newida = ibmdbpy.IdaSeries(idadb = self._idadb, tablename = self._name, indexer = self.indexer, column = column) newida.internal_state.name = deepcopy(self.internal_state.name) newida.internal_state.ascending = deepcopy(self.internal_state.ascending) #newida.internal_state.views = deepcopy(self.internal_state.views) newida.internal_state._views = deepcopy(self.internal_state._views) newida.internal_state._cumulative = deepcopy(self.internal_state._cumulative) newida.internal_state.order = deepcopy(self.internal_state.order) newida.internal_state.columndict = deepcopy(self.internal_state.columndict) return newida
    def _get_type(self):
        """
        Type of the IdaDataFrame : "Table", "View" or "Unknown".
        """
        if self._idadb.is_table(self._name):
            return "Table"
        elif self._idadb.is_view(self._name):
            return "View"
        else:
            return "Unknown"
[docs] def _get_columns(self): """ Index containing a list of the columns in self. """ tablename = self.internal_state.current_state if self._idadb._con_type == 'odbc': if '.' in tablename: tablename = tablename.split('.')[-1] columns = self._idadb._con.cursor().columns(table=tablename) columnlist = [column[3] for column in columns] return Index(columnlist) elif self._idadb._con_type == 'jdbc': cursor = self._idadb._con.cursor() cursor.execute("SELECT * FROM %s"%tablename) columnlist = [column[0] for column in cursor.description] return Index(columnlist)
    def _get_all_columns_in_table(self):
        """Get all columns that exist in the physical table."""
        return self._get_columns()

    # TODO: To deprecate
    def _get_index(self, force=False):
        """
        Index containing a list of the row names in self.
        """
        # Prevent user from loading an index that is too big
        if not force:
            threshold = 10000
            if self.shape[0] > threshold:
                print("WARNING : the index has %s elements." %self.shape[0])
                question = "Do you want to download it in memory ?"
                display_yes = ibmdbpy.utils.query_yes_no(question)
                if not display_yes:
                    return
        # (ROW_NUMBER() OVER())-1 is used because ROWID starts with 1 instead of 0
        df = self.ida_query("SELECT ((ROW_NUMBER() OVER())-1) AS ROWNUMBER FROM %s" %self._name)
        # Fix a bug in the jpype interface, where the elements of the series
        # actually are of type jpype._jclass.java.lang.Long
        if "jpype" in str(type(df[0])):
            return Index(map(lambda x: int(x.toString()), df))
        return Index(df)
[docs] def _get_shape(self): """ Tuple containing the number of rows and the number of columns in self. """ name = self.internal_state.current_state nrow = self.ida_scalar_query("SELECT CAST(COUNT(*) AS INTEGER) FROM %s"%name) ncol = len(self.columns) return (nrow, ncol)
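    # Illustrative usage: shape is evaluated lazily through a single
    # COUNT(*) query plus the cached column index, so no table data is
    # downloaded (the row count assumes the standard 150-row IRIS sample).
    #
    # >>> ida_iris.shape
    # (150, 5)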
[docs] def _get_columns_dtypes(self):
        """
        DataFrame containing the column names and database types in self.
        """
        name = self.internal_state.current_state
        # In case the name is composed as SCHEMA.TABLENAME, keep only the
        # TABLENAME part for this query.
        if '.' in name:
            name = name.split('.')[-1]
        if name.find("TEMP_VIEW_") == 0:
            # When the column names are retrieved from a temporary view that
            # was created with the definition of the current state of the
            # IdaDataFrame, the schema name cannot be assumed to be the same
            # as that of the IdaDataFrame. Note also that the name of the
            # temporary view is assumed to be random enough to avoid
            # collisions.
            data = self.ida_query(("SELECT COLNAME, TYPENAME FROM SYSCAT.COLUMNS "+
                                   "WHERE TABNAME=\'%s\' "+
                                   "ORDER BY COLNO")%(name))
        else:
            data = self.ida_query(("SELECT COLNAME, TYPENAME FROM SYSCAT.COLUMNS "+
                                   "WHERE TABNAME=\'%s\' AND TABSCHEMA=\'%s\' "+
                                   "ORDER BY COLNO")%(name, self.schema))
        # Workaround for some ODBC versions that do not return the entire
        # string of the column name in the cursor descriptor. The names are
        # hardcoded, so be careful when changing the query above.
        data.columns = ["COLNAME", "TYPENAME"]
        data.set_index(keys='COLNAME', inplace=True)
        del data.index.name
        return data
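    # Illustrative result for the IRIS sample table, consistent with the
    # catalog query above (the returned DataFrame is indexed by column name):
    #
    # >>> ida_iris.dtypes
    #              TYPENAME
    # sepal_length   DOUBLE
    # sepal_width    DOUBLE
    # petal_length   DOUBLE
    # petal_width    DOUBLE
    # species       VARCHAR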
[docs] def _reset_attributes(self, attributes):
        """
        Delete an attribute of self to force its refreshing at the next call.
        """
        if isinstance(attributes, six.string_types):
            attributes = [attributes]
        # Special case : resetting columns
        if "columns" in attributes:
            try:
                del self.internal_state.columndict
            except AttributeError:
                pass
        ibmdbpy.utils._reset_attributes(self, attributes)

###############################################################################
### DashDB/DB2 to pandas type mapping
###############################################################################
[docs] def _table_def(self, factor_threshold=None):
        """
        Classify columns in the idaDataFrame into 4 classes: CATEGORICAL,
        STRING, NUMERIC or NONE. Use the database data type and a
        user-threshold “factor_threshold”:

            * CATEGORICAL columns that have a number of distinct values
              that is greater than the factor_threshold should be
              considered a STRING.
            * NUMERIC columns that have a number of distinct values that
              is smaller than or equal to the factor_threshold should be
              considered CATEGORICAL.

        Returns
        -------
        DataFrame
            * Index is the columns of self.
            * Column "FACTORS" contains the number of distinct values.
            * Column "VALTYPE" contains the resulting class.

        Examples
        --------
        >>> ida_iris._table_def()
                     TYPENAME FACTORS      VALTYPE
        sepal_length   DOUBLE      35      NUMERIC
        sepal_width    DOUBLE      23      NUMERIC
        petal_length   DOUBLE      43      NUMERIC
        petal_width    DOUBLE      22      NUMERIC
        species       VARCHAR       3  CATEGORICAL
        """
        # We don't want to change the value of the attribute
        data = deepcopy(self.dtypes)

        def _valtype_from_dbtype(tup):
            """
            Decide if a column should be considered categorical or numerical.
            """
            categorical_attributes = ['VARCHAR', 'CHARACTER', 'VARGRAPHIC',
                                      'GRAPHIC', 'CLOB']
            numerical_attributes = ['SMALLINT', 'INTEGER', 'BIGINT', 'REAL',
                                    'DOUBLE', 'FLOAT', 'DECIMAL', 'NUMERIC']
            if tup[0] in categorical_attributes:
                if factor_threshold is None:
                    return "CATEGORICAL"
                elif tup[1] <= factor_threshold:
                    return "CATEGORICAL"
                else:
                    return "STRING"
            elif tup[0] in numerical_attributes:
                if factor_threshold is None:
                    return "NUMERIC"
                elif tup[1] > factor_threshold:
                    return "NUMERIC"
                else:
                    return "CATEGORICAL"
            else:
                return "NONE"

        data['FACTORS'] = ibmdbpy.statistics._count_level(self, data.index.values)
        data['VALTYPE'] = [_valtype_from_dbtype(x) for x in
                           data[['TYPENAME', 'FACTORS']].to_records(index=False)]
        return data
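    # Worked example of the threshold logic (derived from the FACTORS shown
    # in the docstring above): with factor_threshold=30, numeric columns
    # with at most 30 distinct values are reclassified as CATEGORICAL.
    #
    # >>> ida_iris._table_def(factor_threshold=30)
    #              TYPENAME FACTORS      VALTYPE
    # sepal_length   DOUBLE      35      NUMERIC
    # sepal_width    DOUBLE      23  CATEGORICAL
    # petal_length   DOUBLE      43      NUMERIC
    # petal_width    DOUBLE      22  CATEGORICAL
    # species       VARCHAR       3  CATEGORICAL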
[docs] def _get_numerical_columns(self):
        """
        Get the columns of self that are considered numerical.
        Their data type in the database determines whether these columns
        are numerical. The following data types are considered numerical:
        'SMALLINT', 'INTEGER', 'BIGINT', 'REAL', 'DOUBLE', 'FLOAT',
        'DECIMAL', 'NUMERIC'.

        Returns
        -------
        list
            List of numerical column names.

        Examples
        --------
        >>> ida_iris._get_numerical_columns()
        ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        """
        num = ['SMALLINT', 'INTEGER', 'BIGINT', 'REAL', 'DOUBLE',
               'FLOAT', 'DECIMAL', 'NUMERIC']
        return list(self.dtypes.loc[self.dtypes['TYPENAME'].isin(num)].index)
[docs] def _get_categorical_columns(self):
        """
        Get the columns of self that are considered categorical.
        Their data type in the database determines whether these columns
        are categorical. The following data types are considered
        categorical: "VARCHAR", "CHARACTER", "VARGRAPHIC", "GRAPHIC",
        "CLOB".

        Returns
        -------
        list
            List of categorical column names.

        Examples
        --------
        >>> ida_iris._get_categorical_columns()
        ['species']
        """
        cat = ['VARCHAR', 'CHARACTER', 'VARGRAPHIC', 'GRAPHIC', 'CLOB']
        return list(self.dtypes.loc[self.dtypes['TYPENAME'].isin(cat)].index)
[docs] def _prepare_and_execute(self, query, autocommit = True, silent = False):
        """
        Prepare and execute a query.

        Parameters
        ----------
        query : str
            Query to be executed.
        autocommit : bool
            If set to True, the changes made by the query are committed
            automatically after it is executed.
        silent : bool
            If set to True, the query is not echoed in verbose mode.
        """
        return self._idadb._prepare_and_execute(query, autocommit, silent)
[docs] def _autocommit(self):
        """
        Autocommit the connection.

        If the environment variable 'AUTOCOMMIT' is set to True, the
        function commits the changes.

        Notes
        -----
        If you commit, all changes that are made in the database after the
        last commit, including those in the child IdaDataFrames, are
        committed.

        If the environment variable 'VERBOSE' is set to True, the
        autocommit operations are reported in the console.
        """
        self._idadb._autocommit()
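    # Illustrative control of the behaviour described above; the exact
    # string values accepted for the environment variables are an
    # assumption made for the example:
    #
    # >>> import os
    # >>> os.environ['AUTOCOMMIT'] = 'True'    # commit after each operation
    # >>> os.environ['VERBOSE'] = 'True'       # report autocommits in the console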
[docs] def _combine_check(self, other):
        """
        Check whether self and other refer to the same table and whether
        all columns in self and other are numeric. This sanity check is
        used before performing aggregation operations between
        IdaDataFrame/IdaSeries objects.
        """
        def check_numeric_columns(idaobject):
            numeric_columns = idaobject._get_numerical_columns()
            not_valid = [column for column in idaobject.columns
                         if column not in numeric_columns]
            if not_valid:
                raise TypeError("Arithmetic operations are not defined for %s" % not_valid)

        if isinstance(other, (IdaDataFrame, ibmdbpy.IdaSeries)):
            if self._name != other._name:
                raise IdaDataFrameError("It is not possible to aggregate "+
                                        "columns using columns of a "+
                                        "different table.")
        elif other is not None and not isinstance(other, Number):
            raise TypeError("Aggregation only makes sense with numbers, "+
                            "or IdaDataFrames referring to the same table.")

        check_numeric_columns(self)
        if isinstance(other, (IdaDataFrame, ibmdbpy.IdaSeries)):
            check_numeric_columns(other)
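    # Illustrative failure modes of the check (ida_other and its column are
    # assumed examples): mixing columns of two different tables raises
    # IdaDataFrameError, and non-numeric columns raise TypeError.
    #
    # >>> ida_iris['sepal_length'] + ida_other['some_column']
    # IdaDataFrameError: It is not possible to aggregate columns using columns of a different table.
    # >>> ida_iris['species'] + 1
    # TypeError: Arithmetic operations are not defined for ['species']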