Source code for yellowbrick.regressor

# yellowbrick.regressor
# Visualizations related to evaluating Scikit-Learn regressor models
#
# Author:   Rebecca Bilbro <rbilbro@districtdatalabs.com>
# Author:   Benjamin Bengfort <bbengfort@districtdatalabs.com>
# Created:  Fri Jun 03 10:30:36 2016 -0700
#
# Copyright (C) 2016 District Data Labs
# For license information, see LICENSE.txt
#
# ID: regressor.py [4a59c49] benjamin@bengfort.com $

"""
Visualizations related to evaluating Scikit-Learn regressor models
"""

##########################################################################
## Imports
##########################################################################

import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.cross_validation import train_test_split

from .bestfit import draw_best_fit
from .style.palettes import LINE_COLOR
from .exceptions import YellowbrickTypeError
from .utils import get_model_name, isestimator, isregressor
from .base import Visualizer, ScoreVisualizer, MultiModelMixin


##########################################################################
## Regression Visualization Base Object
##########################################################################

class RegressionScoreVisualizer(ScoreVisualizer):
    """
    Base class for score visualizers that wrap a regression estimator.
    """

    def __init__(self, model, ax=None, **kwargs):
        """
        Verify that the model is a regressor, then initialize the parent.

        Parameters
        ----------

        :param model: the Scikit-Learn estimator
            Must satisfy ``isregressor``; otherwise a YellowbrickTypeError
            is raised before the parent initializer runs.

        :param ax: the axis to plot the figure on.

        :param kwargs: keyword arguments passed to the super class.
        """
        if not isregressor(model):
            message = (
                "This estimator is not a regressor; try a classifier or "
                "clustering score visualizer instead!"
            )
            raise YellowbrickTypeError(message)

        parent = super(RegressionScoreVisualizer, self)
        parent.__init__(model, ax=ax, **kwargs)


##########################################################################
## Prediction Error Plots
##########################################################################

class PredictionError(RegressionScoreVisualizer):
    """
    Scatter the actual targets from the dataset against the values
    predicted by the model, together with a dashed linear best fit line.
    """

    def __init__(self, model, ax=None, **kwargs):
        """
        Parameters
        ----------

        :param model: the Scikit-Learn estimator
            Should be an instance of a regressor, otherwise the parent
            initializer raises a YellowbrickTypeError.

        :param ax: the axis to plot the figure on.

        :param point_color: color of the error points
            Any matplotlib color. Defaults to None.

        :param line_color: color of the best fit line
            Any matplotlib color. Defaults to LINE_COLOR.

        :param kwargs: keyword arguments passed to the super class.
            The color options above are read from these keyword arguments
            after they have been forwarded to the super class.
        """
        super(PredictionError, self).__init__(model, ax=ax, **kwargs)

        # kwargs is not used again below, so a plain lookup is sufficient.
        point_color = kwargs.get('point_color', None)
        line_color = kwargs.get('line_color', LINE_COLOR)
        self.colors = {'point': point_color, 'line': line_color}

    def score(self, X, y=None, **kwargs):
        """
        Generate predictions for X with ``self.predict`` and draw them
        against the actual target values.

        Parameters
        ----------
        X : array-like
            X (also X_test) is the feature data to generate predictions for

        y : array-like
            y (also y_test) holds the actual target values to compare the
            predictions against

        kwargs : generic keyword arguments (not used by this method).

        Returns
        -------

        ax : the axis with the plotted figure

        """
        predicted = self.predict(X)
        axes = self.draw(y, predicted)
        return axes

    def draw(self, y, y_pred):
        """
        Scatter the predictions against the actual values, add the best
        fit line and pad both axis limits by one unit.

        Parameters
        ----------

        y : ndarray or Series of length n
            An array or series of target or class values

        y_pred : ndarray or Series of length n
            An array or series of predicted target values

        Returns
        -------

        ax : the axis with the plotted figure
        """
        # Fall back to the current matplotlib axes when none was supplied
        if self.ax is None:
            self.ax = plt.gca()
        axes = self.ax

        axes.scatter(y, y_pred, c=self.colors['point'])

        # TODO If score is happening inside a loop, draw would get called multiple times.
        # Ideally we'd want the best fit line to be drawn only once
        draw_best_fit(
            y, y_pred, axes, 'linear',
            ls='--', lw=2, c=self.colors['line']
        )

        # Measured values run along x, predicted values along y
        x_low, x_high = y.min() - 1, y.max() + 1
        axes.set_xlim(x_low, x_high)
        y_low, y_high = y_pred.min() - 1, y_pred.max() + 1
        axes.set_ylim(y_low, y_high)

        return axes

    def finalize(self, **kwargs):
        """
        Apply the title and axis labels for the prediction error plot.
        The user calls poof and poof calls finalize.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        """
        # Title includes the name of the wrapped model
        title = 'Prediction Error for {}'.format(self.name)
        self.set_title(title)

        # Label the axes: measured on x, predicted on y
        self.ax.set_xlabel('Measured')
        self.ax.set_ylabel('Predicted')


def prediction_error(model, X, y=None, ax=None, **kwargs):
    """Quick method:

    Plot the actual targets from the dataset against the
    predicted values generated by our model(s).

    This helper wraps the PredictionError ScoreVisualizer for one-off
    analysis: the data is split with ``train_test_split`` (20% held out),
    the visualizer is fit on the training portion and scored on the
    held-out portion.

    Parameters
    ----------
    model : the Scikit-Learn estimator (should be a regressor)

    X  : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features.

    y  : ndarray or Series of length n
        An array or series of target or class values.

    ax : matplotlib axes
        The axes to plot the figure on.

    kwargs : keyword arguments
        Passed both to the PredictionError constructor and to its fit.

    Returns
    -------
    ax : matplotlib axes
        Returns the axes that the prediction error plot was drawn on.
    """
    # Build the visualizer first so an invalid model fails before splitting
    viz = PredictionError(model, ax, **kwargs)

    # Hold out a fifth of the data for scoring
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.2)

    # Fit on the training portion, then score (which draws) on the rest
    viz.fit(X_fit, y_fit, **kwargs)
    viz.score(X_eval, y_eval)

    return viz.ax


##########################################################################
## Residuals Plots
##########################################################################

class ResidualsPlot(RegressionScoreVisualizer):
    """
    A residual plot shows the residuals on the vertical axis
    and the predicted values on the horizontal axis.

    If the points are randomly dispersed around the horizontal axis,
    a linear regression model is appropriate for the data;
    otherwise, a non-linear model is more appropriate.
    """
    def __init__(self, model, ax=None, **kwargs):
        """
        Parameters
        ----------

        :param model: the Scikit-Learn estimator
            Should be an instance of a regressor, otherwise the parent
            initializer raises a YellowbrickTypeError.

        :param ax: the axis to plot the figure on.

        :param train_color: color of the training data residuals
            Residuals for training data are plotted with this color but also
            given an opacity of 0.5 to ensure that the test data residuals
            are more visible. Default color is 'b' for training data.

        :param test_color: color of test data residuals
            Residuals for test data are plotted with this color. In order to
            create generalizable models, reserved test data residuals are of
            the most analytical interest, so these points are highlighted by
            having full opacity. Default color is 'g' for test data.

        :param line_color: color of the zero error line
            Any matplotlib color. Default is LINE_COLOR.

        :param kwargs: keyword arguments passed to the super class.
            The color options above are read from these keyword arguments
            after they have been forwarded to the super class.

        These parameters can be influenced later on in the visualization
        process, but can and should be set as early as possible.

        """

        super(ResidualsPlot, self).__init__(model, ax=ax, **kwargs)

        # TODO Is there a better way to differentiate between train and test points?
        # We'd like to color them differently in draw...
        # Can the user pass those in as keyword arguments?
        self.colors = {
            'train_point': kwargs.pop('train_color', 'b'),
            'test_point': kwargs.pop('test_color', 'g'),
            'line': kwargs.pop('line_color', LINE_COLOR),
        }

    def fit(self, X, y=None, **kwargs):
        """
        Fit the wrapped estimator and plot the training residuals.

        Parameters
        ----------

        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target values

        kwargs: keyword arguments passed to Scikit-Learn API.

        Returns
        -------

        self : the visualizer instance, so that calls can be chained
        """
        super(ResidualsPlot, self).fit(X, y, **kwargs)

        # Draw the training residuals right away; test residuals are added
        # by a later call to score.
        self.score(X, y, train=True)

        # Follow the Scikit-Learn convention of returning the estimator
        return self

    def score(self, X, y=None, train=False, **kwargs):
        """
        Generates predicted target values using the Scikit-Learn
        estimator and draws the residuals (predicted minus actual).

        Parameters
        ----------
        X : array-like
            X (also X_test) is the feature data to generate predictions for

        y : array-like
            y (also y_test) holds the actual target values that are
            subtracted from the predictions

        train : boolean
            If False, `score` assumes that the residual points being plotted
            are from the test data; if True, `score` assumes the residuals
            are the train data.

        kwargs : generic keyword arguments (not used by this method).

        Returns
        -------

        ax : the axis with the plotted figure

        """
        y_pred = self.predict(X)
        scores = y_pred - y

        # Return the axes as documented, matching PredictionError.score
        return self.draw(y_pred, scores, train=train)

    def draw(self, y_pred, residuals, train=False, **kwargs):
        """
        Scatter the residuals against the predicted values.

        Parameters
        ----------
        y_pred : ndarray or Series of length n
            An array or series of predicted target values

        residuals : ndarray or Series of length n
            An array or series of the difference between the predicted and the
            target values

        train : boolean
            If False, `draw` assumes that the residual points being plotted
            are from the test data; if True, `draw` assumes the residuals
            are the train data.

        kwargs : generic keyword arguments (not used by this method).

        Returns
        -------

        ax : the axis with the plotted figure

        """
        # Create the axis if it doesn't exist
        if self.ax is None:
            self.ax = plt.gca()

        # Training points are semi-transparent so test points stand out
        color = self.colors['train_point'] if train else self.colors['test_point']
        alpha = 0.5 if train else 1.0

        self.ax.scatter(y_pred, residuals, c=color, s=40, alpha=alpha)

        return self.ax

    def finalize(self, **kwargs):
        """
        Finalize executes any subclass-specific axes finalization steps.
        The user calls poof and poof calls finalize.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        """
        # Add the title to the plot
        self.set_title('Residuals for {} Model'.format(self.name))

        # Set the legend
        # Assumes that the first set of points are training data, and the next are test
        # Assumes that you want a box around legend
        # NOTE(review): the legend is built before the zero line is added;
        # presumably this keeps the labels attached to the two scatter
        # plots rather than the line -- confirm before reordering.
        self.ax.legend(['Training Data', 'Test Data'], loc='best', frameon=True)

        # Create a full line across the figure at zero error.
        self.ax.axhline(y=0, c=self.colors['line'])

        # Set the axes labels
        self.ax.set_ylabel('Residuals')
        self.ax.set_xlabel("Predicted Value")


def residuals_plot(model, X, y=None, ax=None, **kwargs):
    """Quick method:

    Plot the residuals on the vertical axis and the
    predicted values on the horizontal axis.

    This helper wraps the ResidualsPlot ScoreVisualizer for one-off
    analysis: the data is split with ``train_test_split`` (20% held out),
    the visualizer is fit on the training portion and scored on the
    held-out portion.

    Parameters
    ----------
    model : the Scikit-Learn estimator (should be a regressor)

    X  : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features.

    y  : ndarray or Series of length n
        An array or series of target or class values.

    ax : matplotlib axes
        The axes to plot the figure on.

    kwargs : keyword arguments
        Passed both to the ResidualsPlot constructor and to its fit.

    Returns
    -------
    ax : matplotlib axes
        Returns the axes that the residuals plot was drawn on.
    """
    # Build the visualizer first so an invalid model fails before splitting
    viz = ResidualsPlot(model, ax, **kwargs)

    # Hold out a fifth of the data for scoring
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.2)

    # fit draws the training residuals, score draws the test residuals
    viz.fit(X_fit, y_fit, **kwargs)
    viz.score(X_eval, y_eval)

    return viz.ax