"""
Speedml Feature component with methods that work on dataset features or the feature engineering workflow. Contact author https://twitter.com/manavsehgal. Code, docs and demos https://speedml.com.
"""
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
from speedml.base import Base
from speedml.util import DataFrameImputer
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
[docs]class Feature(Base):
[docs] def drop(self, features):
"""
Drop one or more list of strings naming ``features`` from train and test datasets.
"""
start = Base.train.shape[1]
Base.train = Base.train.drop(features, axis=1)
Base.test = Base.test.drop(features, axis=1)
end = Base.train.shape[1]
message = 'Dropped {} features with {} features available.'
return message.format(start - end, end)
[docs] def impute(self):
"""
Replace empty values in the entire dataframe with median value for numerical features and most common values for text features.
"""
start = Base.train.isnull().sum().sum()
Base.test[Base.target] = -1
combine = Base.train.append(Base.test)
combine = DataFrameImputer().fit_transform(combine)
Base.train = combine[0:Base.train.shape[0]]
Base.test = combine[Base.train.shape[0]::]
Base.test = Base.test.drop([Base.target], axis=1)
end = Base.train.isnull().sum().sum()
message = 'Imputed {} empty values to {}.'
return message.format(start, end)
[docs] def mapping(self, a, data):
"""
Convert values for categorical feature ``a`` using ``data`` dictionary. Use when number of categories are limited otherwise use labels.
"""
Base.train[a] = Base.train[a].apply(lambda x: data[x])
Base.test[a] = Base.test[a].apply(lambda x: data[x])
[docs] def fillna(self, a, new):
"""
Fills empty or null values in ``a`` feature name with ``new`` string value.
"""
start = Base.train[a].isnull().sum() + Base.test[a].isnull().sum()
Base.train[a] = Base.train[a].fillna(new)
Base.test[a] = Base.test[a].fillna(new)
message = 'Filled {} null values across test and train datasets.'
return message.format(start)
[docs] def replace(self, a, match, new):
"""
In feature ``a`` values ``match`` string or list of strings and replace with a ``new`` string.
"""
if type(match) is str:
# [TODO] What is the performance cost of message ops?
start = Base.train[Base.train[a] == match][a].shape[0] + Base.test[Base.test[a] == match][a].shape[0]
message = 'Replaced {} matching values across train and test datasets.'
message = message.format(start)
else:
# [TODO] Can we possibly use pandas.isin to check counts?
message = 'Replaced matching list of strings across train and test datasets.'
Base.train[a] = Base.train[a].replace(match, new)
Base.test[a] = Base.test[a].replace(match, new)
return message
[docs] def outliers(self, a, lower = None, upper = None):
"""
Fix outliers for ``lower`` or ``upper`` or both percentile of values within ``a`` feature.
"""
if upper:
upper_value = np.percentile(Base.train[a].values, upper)
change = Base.train.loc[Base.train[a] > upper_value, a].shape[0]
Base.train.loc[Base.train[a] > upper_value, a] = upper_value
message = 'Fixed {} or {:.2f}% upper outliers. '.format(change, change/Base.train.shape[0]*100)
if lower:
lower_value = np.percentile(Base.train[a].values, lower)
change = Base.train.loc[Base.train[a] < lower_value, a].shape[0]
Base.train.loc[Base.train[a] < lower_value, a] = lower_value
message = message + 'Fixed {} or {:.2f}% lower outliers.'.format(change, change/Base.train.shape[0]*100)
return message
def _density_by_feature(self, a):
vals = Base.train[a].value_counts()
dvals = vals.to_dict()
Base.train[a + '_density'] = Base.train[a].apply(lambda x: dvals.get(x, vals.min()))
Base.test[a + '_density'] = Base.test[a].apply(lambda x: dvals.get(x, vals.min()))
[docs] def density(self, a):
"""
Create new feature named ``a`` feature name + suffix '_density', based on density or value_counts for each unique value in ``a`` feature specified as a string or multiple features as a list of strings.
"""
if isinstance(a, str):
self._density_by_feature(a)
if isinstance(a, list):
for feature in a:
self._density_by_feature(feature)
[docs] def add(self, a, num):
"""
Update ``a`` numeric feature by adding ``num`` number to each values.
"""
Base.train[a] = Base.train[a] + num
Base.test[a] = Base.test[a] + num
[docs] def sum(self, new, a, b):
"""
Create ``new`` numeric feature by adding ``a`` + ``b`` feature values.
"""
Base.train[new] = Base.train[a] + Base.train[b]
Base.test[new] = Base.test[a] + Base.test[b]
[docs] def diff(self, new, a, b):
"""
Create ``new`` numeric feature by subtracting ``a`` - ``b`` feature values.
"""
Base.train[new] = Base.train[a] - Base.train[b]
Base.test[new] = Base.test[a] - Base.test[b]
[docs] def product(self, new, a, b):
"""
Create ``new`` numeric feature by multiplying ``a`` * ``b`` feature values.
"""
Base.train[new] = Base.train[a] * Base.train[b]
Base.test[new] = Base.test[a] * Base.test[b]
[docs] def divide(self, new, a, b):
"""
Create ``new`` numeric feature by dividing ``a`` / ``b`` feature values. Replace division-by-zero with zero values.
"""
Base.train[new] = Base.train[a] / Base.train[b]
Base.test[new] = Base.test[a] / Base.test[b]
# Histograms require finite values
Base.train[new] = Base.train[new].replace([np.inf, -np.inf], 0)
Base.test[new] = Base.test[new].replace([np.inf, -np.inf], 0)
[docs] def round(self, new, a, precision):
"""
Create ``new`` numeric feature by rounding ``a`` feature value to ``precision`` decimal places.
"""
Base.train[new] = round(Base.train[a], precision)
Base.test[new] = round(Base.test[a], precision)
[docs] def concat(self, new, a, sep, b):
"""
Create ``new`` text feature by concatenating ``a`` and ``b`` text feature values, using ``sep`` separator.
"""
Base.train[new] = Base.train[a].astype(str) + sep + Base.train[b].astype(str)
Base.test[new] = Base.test[a].astype(str) + sep + Base.test[b].astype(str)
[docs] def list_len(self, new, a):
"""
Create ``new`` numeric feature based on length or item count from ``a`` feature containing list object as values.
"""
Base.train[new] = Base.train[a].apply(len)
Base.test[new] = Base.test[a].apply(len)
[docs] def word_count(self, new, a):
"""
Create ``new`` numeric feature based on length or word count from ``a`` feature containing free-form text.
"""
Base.train[new] = Base.train[a].apply(lambda x: len(x.split(" ")))
Base.test[new] = Base.test[a].apply(lambda x: len(x.split(" ")))
def _regex_text(self, regex, text):
regex_search = re.search(regex, text)
# If the word exists, extract and return it.
if regex_search:
return regex_search.group(1)
return ""
[docs] def labels(self, features):
"""
Generate numerical labels replacing text values from list of categorical ``features``.
"""
Base.test[Base.target] = -1
combine = Base.train.append(Base.test)
le = LabelEncoder()
for feature in features:
combine[feature] = le.fit_transform(combine[feature])
Base.train = combine[0:Base.train.shape[0]]
Base.test = combine[Base.train.shape[0]::]
Base.test = Base.test.drop([Base.target], axis=1)