# Source code for speedml

"""
Speedml is a Python package to speed start machine learning projects. Contact author https://twitter.com/manavsehgal. Code, docs and demos https://speedml.com.
"""

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from builtins import *

from speedml.base import Base
from speedml.plot import Plot
from speedml.feature import Feature
from speedml.xgb import Xgb
from speedml.model import Model

import numpy as np
import pandas as pd
import os

from IPython.core.interactiveshell import InteractiveShell

# Release tag of this package. Reported in the 'Speedml Release' row of the
# DataFrame returned by Speedml.eda.
_RELEASE = 'v0.9.3'

class Speedml(Base):
    """
    Entry point that loads the train/test datasets into the shared ``Base`` class attributes and exposes the ``plot``, ``feature``, ``xgb`` and ``model`` components together with a few dataset-level helpers (``configure``, ``info``, ``eda``, ``shape``, ``save_results``, ``slug``).
    """

    def __init__(self, train, test, target, uid=None):
        """
        Open datasets ``train`` and ``test`` as CSV or JSON files and store in pandas DataFrames ``Base.train`` and ``Base.test``. Set ``Base.target`` and ``Base.uid`` values based on parameters. Initialize ``Plot``, ``Feature``, ``Xgb`` and ``Model`` components.

        The extension of ``train`` selects the reader used for both files. When the extension is unsupported, or either dataset is empty, an error is printed and the components are not initialized.
        """
        self._setup_environment()

        Base.target = target

        # TODO: Add more file formats supported by pandas.read_
        if train.endswith('.csv'):
            reader = pd.read_csv
        elif train.endswith('.json'):
            reader = pd.read_json
        else:
            # Bail out before touching Base.train/Base.test: they may be
            # unset, or still hold datasets from an earlier instance.
            print('ERROR: SpeedML can only process .csv and .json file extensions.')
            return

        Base.train = reader(train)
        Base.test = reader(test)

        if Base.train.empty or Base.test.empty:
            print('ERROR: SpeedML requires non-empty train and test datasets.')
            return

        if uid:
            # Keep the test identifiers for later use and remove the
            # identifier column from both datasets.
            Base.uid = Base.test.pop(uid)
            Base.train = Base.train.drop([uid], axis=1)

        self.plot = Plot()
        self.feature = Feature()
        self.xgb = Xgb()
        self.model = Model()

        # Convenience handles to the numpy and pandas modules.
        self.np = np
        self.pd = pd

    def configure(self, option=None, value=None):
        """
        Configure Speedml defaults with ``option`` configuration parameter, ``value`` setting. When method is called without parameters it simply returns the current config dictionary, otherwise returns the updated configuration.
        """
        # Compare against None so that falsy settings (0, False, '') can
        # still be stored.
        if option is not None and value is not None:
            Base._config[option] = value
        return Base._config

    def _setup_environment(self):
        """
        Reset ``Base._config`` to the default settings and, when running under IPython, enable multiple outputs per cell and inline matplotlib plots. Sets ``Base.notebook`` to reflect whether IPython was detected.
        """
        Base._config = {}
        # Used by data out path 'internally' within Speedml methods
        Base._config['outpath'] = 'output/'
        # Positive and negative skew within +- this value
        Base._config['outlier_threshold'] = 3
        # #Features/#Samples Train < this value
        Base._config['overfit_threshold'] = 0.01
        # Feature is high-cardinality if categories > this value
        Base._config['high_cardinality'] = 10
        # Unique (continuous) if Base._config['unique_ratio']% non-repeat values
        Base._config['unique_ratio'] = 80

        # Setup for Notebook environment. The name __IPYTHON__ only exists
        # when the code runs under IPython.
        try:
            __IPYTHON__
        except NameError:
            Base.notebook = False
        else:
            Base.notebook = True
            # Multiple outputs from single input cell
            InteractiveShell.ast_node_interactivity = "all"
            # Plots inline within Notebook output. run_line_magic replaces
            # the deprecated InteractiveShell.magic.
            ipython = get_ipython()
            ipython.run_line_magic('matplotlib', 'inline')

    def info(self):
        """
        Runs DataFrame.info() on both Train and Test datasets.
        """
        self.train.info()
        print('-'*40)
        self.test.info()

    @staticmethod
    def _group_by_cardinality(features):
        """
        Split ``features`` (column labels of ``Base.train``) by their number of distinct non-null values. Returns three lists ``(unique, high_cardinality, categorical)``: more distinct values than ``unique_ratio`` percent of the samples, more than ``high_cardinality``, and more than one. Features with at most one distinct value are in none of the lists.
        """
        unique_cutoff = Base._config['unique_ratio'] / 100 * Base.train.shape[0]
        cardinality_cutoff = Base._config['high_cardinality']

        unique, high_cardinality, categorical = [], [], []
        for feature in features:
            distinct = Base.train[feature].nunique()
            if distinct > unique_cutoff:
                unique.append(feature)
            elif distinct > cardinality_cutoff:
                high_cardinality.append(feature)
            elif distinct > 1:
                categorical.append(feature)
        return unique, high_cardinality, categorical

    def eda(self):
        """
        Performs speed exploratory data analysis (EDA) on the current state of datasets. Returns metrics and recommendations as a dataframe. Progressively hides metrics as they achieve workflow completion goals or meet the configured defaults and thresholds.
        """
        Base.data_n()

        eda_metrics = []

        eda_index = ['Speedml Release']
        eda_metrics.append([_RELEASE, 'Visit https://speedml.com for release notes.'])

        # Count nulls per dataset and add the totals. Adding the per-column
        # Series would yield NaN for columns present in only one dataset.
        nulls = int(Base.train.isnull().sum().sum()
                    + Base.test.isnull().sum().sum())
        if nulls:
            eda_index.append('Nulls')
            eda_metrics.append([nulls, 'Use feature.impute.'])

        skew = Base.train_n.skew()
        skew_upper = skew[skew > Base._config['outlier_threshold']]
        skew_lower = skew[skew < -Base._config['outlier_threshold']]
        if not skew_upper.empty:
            eda_index.append('Outliers Upper')
            eda_metrics.append(
                [skew_upper.axes[0].tolist(),
                 'Positive skew (> {}). Use feature.outliers(upper).'.format(
                     Base._config['outlier_threshold'])])
        if not skew_lower.empty:
            eda_index.append('Outliers Lower')
            eda_metrics.append(
                [skew_lower.axes[0].tolist(),
                 'Negative skew (< -{}). Use feature.outliers(lower).'.format(
                     Base._config['outlier_threshold'])])

        eda_index.append('Shape')
        samples, features = Base.train.shape
        # Ratio of features to samples; with no samples any feature count
        # exceeds the threshold.
        feature_by_sample = features / samples if samples else float('inf')
        if feature_by_sample > Base._config['overfit_threshold']:
            message = '#Features / #Samples > {}. Over-fitting.'.format(
                Base._config['overfit_threshold'])
        else:
            message = ''
        eda_metrics.append([self.shape(), message])

        if features:
            numerical_ratio = int(Base.train_n.shape[1] / features * 100)
        else:
            numerical_ratio = 100
        if numerical_ratio < 100:
            eda_index.append('Numerical Ratio')
            eda_metrics.append(['{}%'.format(numerical_ratio),
                                'Aim for 100% numerical.'])

        numerical_features = list(Base.train_n.columns)
        continuous, high_cardinality_num, categorical_num = (
            self._group_by_cardinality(numerical_features))

        if high_cardinality_num:
            eda_index.append('Numerical High-cardinality')
            eda_metrics.append([
                high_cardinality_num,
                '(>{}) categories. Use feature.density'.format(
                    Base._config['high_cardinality'])])

        if categorical_num:
            eda_index.append('Numerical Categorical')
            eda_metrics.append([
                categorical_num,
                ' Use plot.ordinal.'])

        if continuous:
            eda_index.append('Numerical Continuous')
            eda_metrics.append([
                continuous,
                '~{}% unique. Use plot.continuous.'.format(Base._config['unique_ratio'])])

        # Non-numerical features, kept in the column order of the dataset.
        numerical_lookup = set(numerical_features)
        text_features = [column for column in Base.train.columns
                         if column not in numerical_lookup]
        text, high_cardinality_text, categorical_text = (
            self._group_by_cardinality(text_features))

        if high_cardinality_text:
            eda_index.append('Text High-cardinality')
            eda_metrics.append([
                high_cardinality_text,
                '(>{}) categories. Use feature.labels.'.format(Base._config['high_cardinality'])])

        if categorical_text:
            eda_index.append('Text Categorical')
            eda_metrics.append([
                categorical_text,
                'Use feature.labels or feature.mapping.'])

        if text:
            eda_index.append('Text Unique')
            eda_metrics.append([
                text,
                '~{}% unique. Use feature.extract or feature.drop.'.format(Base._config['unique_ratio'])])

        # Recommendation for the target, chosen by the bucket it fell into.
        # The fallback covers a target that is missing from the train
        # dataset or has at most one distinct value.
        target_analysis = ['Unknown.',
                           'Target is not in train or has at most one distinct value.']
        recommendations = [
            (continuous, ['Model ready.', 'Use regression models.']),
            (high_cardinality_num, ['Pre-process.', 'Dimensionality reduction?']),
            (categorical_num, ['Model ready.', 'Use classification models.']),
            (text, ['ERROR.', 'Unique text cannot be a target variable.']),
            (high_cardinality_text, ['Pre-process.', 'Use feature.labels.']),
            (categorical_text, ['Pre-process.',
                                'Use feature.labels or feature.mapping.']),
        ]
        for bucket, analysis in recommendations:
            if Base.target in bucket:
                target_analysis = analysis
                break

        eda_index += ['Target Analysis ({})'.format(Base.target)]
        eda_metrics.append(target_analysis)

        eda_df = pd.DataFrame(eda_metrics,
                              index=eda_index,
                              columns=['Results', 'Observations'])

        return eda_df

    def shape(self):
        """
        Refresh the numerical datasets via ``Base.data_n()`` and return a string with the shape (samples, features) of the train and test datasets.
        """
        Base.data_n()
        message = 'train {} | test {}'
        return message.format(Base.train.shape, Base.test.shape)

    def save_results(self, columns, file_path):
        """
        Saves the ``columns`` dictionary input to a DataFrame as ``file_path`` CSV file.
        """
        submission = pd.DataFrame(columns)
        submission.to_csv(file_path,
                          index=False)
        return 'Results saved.'

    def slug(self):
        """
        Return a slug string built from ``xgb.error`` and ``model.xgb_accuracy`` (both multiplied by 100), ``xgb.sample_accuracy`` and ``xgb.feature_accuracy``, each formatted with two decimals.
        """
        performance_slug = 'e{:.2f}-m{:.2f}-s{:.2f}-f{:.2f}'.format(
            self.xgb.error * 100,
            self.model.xgb_accuracy * 100,
            self.xgb.sample_accuracy,
            self.xgb.feature_accuracy)
        return performance_slug