Source code for ibmdbpy.filtering

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

"""The module implement objects and functions that are used to filter IdaDataFrame objects"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from builtins import str
from future import standard_library
standard_library.install_aliases()

from copy import deepcopy
from numbers import Number
from ibmdbpy.exceptions import IdaDataBaseError
import six

[docs]class FilterQuery(object):
    """
    FilterQueries are used to represent the filtering of an IdaDataFrame
    in ibmdbpy. The use of comparison operators, such as <, <=, ==, >=, >
    on an IdaDataFrame instance produces a FilterQuery instance which acts as a
    container for the where clause of the corresponding SQL request.

    Filtering is possible using a Pandas-like syntax. Applying comparison
    operators to IdaDataFrames produces a FilterQuery instance which
    contains the string embedding the corresponding where clause in the 
    “wherestr” attribute.

    FilterQuery objects also contain a logic that allows them to be
    combined, thus allowing complex filtering.

      You can combine the following operators: |, &, ^ (OR, AND and XOR)

    Examples
    --------
    >>> idadf[['sepal_length', 'petal_width'] < 5
    >>> <ibmdbpy.filtering.FilterQuery at 0xa65ba90>
    >>> _.wherestr
    '("sepal_length" < 5 AND "petal_width" < 5)'

    >>> idadf[idadf[['sepal_length', 'petal_width']] < 5]
    <ibmdbpy.frame.IdaDataFrame at 0xa73a860>
    >>> _.head() # filtered IdaDataFrame
        sepal_length  sepal_width  petal_length  petal_width     species
    0           4.4          2.9           1.4          0.2      setosa
    1           4.7          3.2           1.6          0.2      setosa
    2           4.9          2.5           4.5          1.7   virginica
    3           4.9          2.4           3.3          1.0  versicolor
    4           4.6          3.2           1.4          0.2      setosa

    >>> idadf[(idadf['sepal_length'] < 5) & (idadf[petal_width'] > 1.5)]
    <ibmdbpy.frame.IdaDataFrame at 0xa74b9b0>
    >>> _.head()
       sepal_length  sepal_width  petal_length  petal_width    species
    0           4.9          2.5           4.5          1.7  virginica

    Notes
    -----
    It is not possible to filter an IdaDataFrame by using an IdaDataFrame that 
    is opened in a different data source in the database. This is due to the 
    fact that, using a Pandas-like syntax, “idadf[‘petal_width’] < 5” will 
    return a Boolean array that is used to subset the original DataFrame. This 
    is a fundamental restriction of ibmdbpy: we cannot afford to compute and 
    download such an array because we cannot assume that the result will fit 
    into user’s memory. Download time can also be a performance issue.

    """
    def __init__(self, columns, tablename, method, value):
        """
        Constructor for filterquery objects.

        Parameters
        ----------
        columns : str or list
            columns on which the filter operation should be applied
        tablename : str
            Name of the table to which columns belong
        method : str
            value representing the comparision operator.
            Admissible value:  ["lt","le","eq","ne","ge,"gt"]
        value: str or number
            Value to use to filter.

        Attributes
        ----------
        columns : str or list
            columns on which the filter operation should be applied
        tablename : str
            Name of the table to which columns belong
        method : str
            value representing the comparision operator.
            Admissible value:  ["lt","le","eq","ne","ge,"gt"]
        value: str or number
            Value to use to filter.
        wherestr : str
            SQL where clause to be used for filtering

        Raises
        ------
        IdaDataBaseError
            * The value for filtering is not a string or a number
            * The filtering method is not suppoted
        Notes
        -----
        This object is considered as private, and should be called internally
        by IdaDataFrame instances.
        """
        # Sanity checks
        if isinstance(columns, six.string_types):
            columns = [columns]
        if isinstance(value, six.string_types):
            value = "'" + value + "'"
        if not ((isinstance(value,six.string_types)|isinstance(value,Number))):
            raise IdaDataBaseError("Value for Filterquery is expected to be a string or a number. Type %s"%str(type(value)))
        dictmethod = {"lt": " < ", "le": " <= ",  "eq": " = ", "ne": " != ", "ge": " >= ", "gt": " > "}
        if method not in dictmethod.keys():
            raise IdaDataBaseError("The filtering method is not suppoted. Admissible values for method argument are %s"%list(dictmethod.keys()))

        self.columns = columns     # TO DEPRECATE
        self.tablename = tablename
        self.method = method       # TO DEPRECATE
        self.value = value         # TO DEPRECATE
        self.wherestr = ("(\"" + ("\"" + str(dictmethod[method]) + str(value) + " AND \"").join(columns) +
                        "\"" +  str(dictmethod[method]) + str(value) + ")")

    # TO DEPRECATE
    @property
    def query(self):
        """
        Return an SQL query like "SELECT * FROM %s WHERE <WHERESTR>", where
        <WHERESTR> is the value of the attribute "wherestr".
        """
        return "SELECT * FROM %s WHERE " + self.wherestr

    def __and__(self, other):
        """
        Combine two FilterQuery instances with the operator "&" (AND)
        For example : (idadf['sepal_length'] < 5) & (idadf[petal_width'] > 3)
        """
        self._combine_check(other)
        newquery = deepcopy(self)
        newquery.wherestr = "(%s AND %s)"%(self.wherestr,other.wherestr)
        return newquery

    def __or__(self, other):
        """
        Combine two FilterQuery instances with the operator "|" (OR)
        For example : (idadf['sepal_length'] < 5) | (idadf[petal_width'] > 3)
        """
        self._combine_check(other)
        newquery = deepcopy(self)
        newquery.wherestr =  "(%s OR %s)"%(self.wherestr,other.wherestr)
        return newquery

    def __xor__(self, other):
        """
        Combine two FilterQuery instances with the operator "^" (XOR)
        For example : (idadf['sepal_length'] < 5) ^ (idadf[petal_width'] > 3)
        """
        self._combine_check(other)
        newquery = deepcopy(self)
        newquery.wherestr = ("((NOT %s AND %s) OR (%s AND NOT %s))"%(self.wherestr,other.wherestr,self.wherestr,other.wherestr))
        return newquery

    def _combine_check(self, other):
        """
        Check if the name of two  FilterQuery is the same. Raise an
        IdaDataBaseError in case it is different.
        This function is used before performing logical operations between
        FilterQuery instances.

        Raises
        ------
        IdaDataBaseError
        """
        if self.tablename != other.tablename:
            raise IdaDataBaseError("Combining filtering criterions from "+
                                   "columns belongings to different tables "+
                                   "is not possible.")