# Source code for speedml.feature

"""
Speedml Feature component with methods that work on dataset features or the feature engineering workflow. Contact author https://twitter.com/manavsehgal. Code, docs and demos https://speedml.com.
"""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

import re

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from speedml.base import Base
from speedml.util import DataFrameImputer

[docs]class Feature(Base):
[docs]    def drop(self, features):
        """
        Drop one or more list of strings naming ``features`` from train and test datasets.
        """
        start = Base.train.shape[1]

        Base.train = Base.train.drop(features, axis=1)
        Base.test = Base.test.drop(features, axis=1)

        end = Base.train.shape[1]
        message = 'Dropped {} features with {} features available.'
        return message.format(start - end, end)

[docs]    def impute(self):
        """
        Replace empty values in the entire dataframe with median value for numerical features and most common values for text features.
        """
        start = Base.train.isnull().sum().sum()

        Base.test[Base.target] = -1
        combine = Base.train.append(Base.test)
        combine = DataFrameImputer().fit_transform(combine)
        Base.train = combine[0:Base.train.shape[0]]
        Base.test = combine[Base.train.shape[0]::]
        Base.test = Base.test.drop([Base.target], axis=1)

        end = Base.train.isnull().sum().sum()
        message = 'Imputed {} empty values to {}.'
        return message.format(start, end)

[docs]    def mapping(self, a, data):
        """
        Convert values for categorical feature ``a`` using ``data`` dictionary. Use when number of categories are limited otherwise use labels.
        """
        Base.train[a] = Base.train[a].apply(lambda x: data[x])
        Base.test[a] = Base.test[a].apply(lambda x: data[x])

[docs]    def fillna(self, a, new):
        """
        Fills empty or null values in ``a`` feature name with ``new`` string value.
        """
        start = Base.train[a].isnull().sum() + Base.test[a].isnull().sum()

        Base.train[a] = Base.train[a].fillna(new)
        Base.test[a] = Base.test[a].fillna(new)

        message = 'Filled {} null values across test and train datasets.'
        return message.format(start)

[docs]    def replace(self, a, match, new):
        """
        In feature ``a`` values ``match`` string or list of strings and replace with a ``new`` string.
        """
        if type(match) is str:
            # [TODO] What is the performance cost of message ops?
            start = Base.train[Base.train[a] == match][a].shape[0] + Base.test[Base.test[a] == match][a].shape[0]
            message = 'Replaced {} matching values across train and test datasets.'
            message = message.format(start)
        else:
            # [TODO] Can we possibly use pandas.isin to check counts?
            message = 'Replaced matching list of strings across train and test datasets.'

        Base.train[a] = Base.train[a].replace(match, new)
        Base.test[a] = Base.test[a].replace(match, new)

        return message

[docs]    def outliers(self, a, lower = None, upper = None):
        """
        Fix outliers for ``lower`` or ``upper`` or both percentile of values within ``a`` feature.
        """
        if upper:
            upper_value = np.percentile(Base.train[a].values, upper)
            change = Base.train.loc[Base.train[a] > upper_value, a].shape[0]
            Base.train.loc[Base.train[a] > upper_value, a] = upper_value
            message = 'Fixed {} or {:.2f}% upper outliers. '.format(change, change/Base.train.shape[0]*100)

        if lower:
            lower_value = np.percentile(Base.train[a].values, lower)
            change = Base.train.loc[Base.train[a] < lower_value, a].shape[0]
            Base.train.loc[Base.train[a] < lower_value, a] = lower_value
            message = message + 'Fixed {} or {:.2f}% lower outliers.'.format(change, change/Base.train.shape[0]*100)

        return message

    def _density_by_feature(self, a):
        """
        Add an ``a`` + '_density' feature to the train and test datasets holding, for each row, how often that row's ``a`` value occurs in the train dataset. Values with no count in the train dataset get the smallest count observed there.
        """
        counts = Base.train[a].value_counts()
        lookup = counts.to_dict()
        # Evaluate the fallback once instead of once per row inside the lambda.
        fallback = counts.min()

        for frame in (Base.train, Base.test):
            frame[a + '_density'] = frame[a].apply(lambda value: lookup.get(value, fallback))

[docs]    def density(self, a):
        """
        Create new feature named ``a`` feature name + suffix '_density', based on density or value_counts for each unique value in ``a`` feature specified as a string or multiple features as a list of strings.
        """
        if isinstance(a, str):
            self._density_by_feature(a)

        if isinstance(a, list):
            for feature in a:
                self._density_by_feature(feature)

[docs]    def add(self, a, num):
        """
        Update ``a`` numeric feature by adding ``num`` number to each values.
        """
        Base.train[a] = Base.train[a] + num
        Base.test[a] = Base.test[a] + num

[docs]    def sum(self, new, a, b):
        """
        Create ``new`` numeric feature by adding ``a`` + ``b`` feature values.
        """
        Base.train[new] = Base.train[a] + Base.train[b]
        Base.test[new] = Base.test[a] + Base.test[b]

[docs]    def diff(self, new, a, b):
        """
        Create ``new`` numeric feature by subtracting ``a`` - ``b`` feature values.
        """
        Base.train[new] = Base.train[a] - Base.train[b]
        Base.test[new] = Base.test[a] - Base.test[b]

[docs]    def product(self, new, a, b):
        """
        Create ``new`` numeric feature by multiplying ``a`` * ``b`` feature values.
        """
        Base.train[new] = Base.train[a] * Base.train[b]
        Base.test[new] = Base.test[a] * Base.test[b]

[docs]    def divide(self, new, a, b):
        """
        Create ``new`` numeric feature by dividing ``a`` / ``b`` feature values. Replace division-by-zero with zero values.
        """
        Base.train[new] = Base.train[a] / Base.train[b]
        Base.test[new] = Base.test[a] / Base.test[b]
        # Histograms require finite values
        Base.train[new] = Base.train[new].replace([np.inf, -np.inf], 0)
        Base.test[new] = Base.test[new].replace([np.inf, -np.inf], 0)

[docs]    def round(self, new, a, precision):
        """
        Create ``new`` numeric feature by rounding ``a`` feature value to ``precision`` decimal places.
        """
        Base.train[new] = round(Base.train[a], precision)
        Base.test[new] = round(Base.test[a], precision)

[docs]    def concat(self, new, a, sep, b):
        """
        Create ``new`` text feature by concatenating ``a`` and ``b`` text feature values, using ``sep`` separator.
        """
        Base.train[new] = Base.train[a].astype(str) + sep + Base.train[b].astype(str)
        Base.test[new] = Base.test[a].astype(str) + sep + Base.test[b].astype(str)

[docs]    def list_len(self, new, a):
        """
        Create ``new`` numeric feature based on length or item count from ``a`` feature containing list object as values.
        """
        Base.train[new] = Base.train[a].apply(len)
        Base.test[new] = Base.test[a].apply(len)

[docs]    def word_count(self, new, a):
        """
        Create ``new`` numeric feature based on length or word count from ``a`` feature containing free-form text.
        """
        Base.train[new] = Base.train[a].apply(lambda x: len(x.split(" ")))
        Base.test[new] = Base.test[a].apply(lambda x: len(x.split(" ")))

    def _regex_text(self, regex, text):
        regex_search = re.search(regex, text)
        # If the word exists, extract and return it.
        if regex_search:
            return regex_search.group(1)
        return ""

[docs]    def extract(self, a, regex, new=None):
        """
        Match ``regex`` regular expression with ``a`` text feature values to update ``a`` feature with matching text if ``new`` = None. Otherwise create ``new`` feature based on matching text.
        """
        Base.train[new if new else a] = Base.train[a].apply(lambda x: self._regex_text(regex=regex, text=x))
        Base.test[new if new else a] = Base.test[a].apply(lambda x: self._regex_text(regex=regex, text=x))

[docs]    def labels(self, features):
        """
        Generate numerical labels replacing text values from list of categorical ``features``.
        """
        Base.test[Base.target] = -1
        combine = Base.train.append(Base.test)

        le = LabelEncoder()
        for feature in features:
            combine[feature] = le.fit_transform(combine[feature])

        Base.train = combine[0:Base.train.shape[0]]
        Base.test = combine[Base.train.shape[0]::]
        Base.test = Base.test.drop([Base.target], axis=1)