Source code for yellowbrick.classifier

# yellowbrick.classifier
# Visualizations related to evaluating Scikit-Learn classification models
#
# Author:   Rebecca Bilbro <rbilbro@districtdatalabs.com>
# Author:   Benjamin Bengfort <bbengfort@districtdatalabs.com>
# Created:  Wed May 18 12:39:40 2016 -0400
#
# Copyright (C) 2016 District Data Labs
# For license information, see LICENSE.txt
#
# ID: classifier.py [5eee25b] benjamin@bengfort.com $

"""
Visualizations related to evaluating Scikit-Learn classification models
"""

##########################################################################
## Imports
##########################################################################

import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_fscore_support

from .exceptions import YellowbrickTypeError
from .utils import get_model_name, isestimator, isclassifier
from .base import Visualizer, ScoreVisualizer, MultiModelMixin
from .style.palettes import color_sequence, color_palette, LINE_COLOR


##########################################################################
## Classification Visualization Base Object
##########################################################################

class ClassificationScoreVisualizer(ScoreVisualizer):
    """
    Base class for score visualizers that wrap a classification estimator.
    """

    def __init__(self, model, ax=None, **kwargs):
        """
        Verify that the model is a classifier, then initialize the
        score visualizer.

        Parameters
        ----------
        model : the Scikit-Learn estimator
            Must pass the ``isclassifier`` check.

        ax : the axis to plot the figure on.

        kwargs : keyword arguments passed to the super class.

        Raises
        ------
        YellowbrickTypeError
            If ``isclassifier(model)`` is false.
        """
        if isclassifier(model):
            # Only a classifier is handed on to the ScoreVisualizer base.
            super(ClassificationScoreVisualizer, self).__init__(
                model, ax=ax, **kwargs
            )
        else:
            message = "This estimator is not a classifier; try a regression or clustering score visualizer instead!"
            raise YellowbrickTypeError(message)
##########################################################################
## Classification Report
##########################################################################
class ClassificationReport(ClassificationScoreVisualizer):
    """
    Classification report that shows the precision, recall, and F1 scores
    for the model. Integrates numerical scores as well as a color-coded
    heatmap.
    """

    # Keys of ``self.scores``, in the order the heatmap columns are drawn.
    _MEASURES = ('precision', 'recall', 'f1')

    # Tick labels shown for the heatmap columns (same order as _MEASURES).
    _MEASURE_LABELS = ['precision', 'recall', 'f1-score']

    def __init__(self, model, ax=None, classes=None, **kwargs):
        """
        Wrap a classifier to generate a classification report.

        Parameters
        ----------
        model : the Scikit-Learn estimator
            Should be an instance of a classifier, else the __init__ of
            the parent class raises an error.

        ax : the axis to plot the figure on.

        classes : a list of class names for the legend
            If classes is None the classes are taken from the
            estimator's ``classes_`` attribute when fit is called.

        cmap : optional string or matplotlib cmap used for the heatmap
            Read from kwargs; defaults to the sequential 'YlOrRd' map.

        kwargs : keyword arguments passed to the super class.

        These parameters can be influenced later on in the visualization
        process, but can and should be set as early as possible.
        """
        super(ClassificationReport, self).__init__(model, ax=ax, **kwargs)

        ## hoisted to ScoreVisualizer base class
        self.estimator = model
        self.name = get_model_name(self.estimator)

        # NOTE: 'cmap' is popped only after the full kwargs have already
        # been forwarded to the super class, so the parent sees it too.
        self.cmap = color_sequence(kwargs.pop('cmap', 'YlOrRd'))
        self.classes_ = classes

    def fit(self, X, y=None, **kwargs):
        """
        Fit via the super class and resolve the class names.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : keyword arguments passed to Scikit-Learn API.

        Returns
        -------
        self : instance
            Returns the visualizer itself.
        """
        super(ClassificationReport, self).fit(X, y, **kwargs)

        # Fall back to the estimator's classes when none were supplied.
        if self.classes_ is None:
            self.classes_ = self.estimator.classes_
        return self

    def score(self, X, y=None, **kwargs):
        """
        Computes per-class precision, recall and F1 and draws the report.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        Returns
        -------
        ax : the axis with the plotted figure
        """
        y_pred = self.predict(X)

        # Only the first three result arrays are reported here; each one
        # is keyed by class name, then grouped under its measure name.
        per_measure = precision_recall_fscore_support(y, y_pred)[0:3]
        self.scores = {
            measure: dict(zip(self.classes_, values))
            for measure, values in zip(self._MEASURES, per_measure)
        }
        return self.draw(y, y_pred)

    def draw(self, y, y_pred):
        """
        Renders the classification report as an annotated heatmap with
        one row per class and one column per measure.

        Parameters
        ----------
        y : ndarray or Series of length n
            An array or series of target or class values (not read here;
            the heatmap is built from ``self.scores``)

        y_pred : ndarray or Series of length n
            An array or series of predicted target values (not read here)

        Returns
        -------
        ax : the axis with the plotted figure
        """
        # Create the axis if it doesn't exist
        if self.ax is None:
            self.ax = plt.gca()

        self.matrix = [
            [self.scores[measure][cls] for measure in self._MEASURES]
            for cls in self.classes_
        ]

        # Annotate every cell. The column count is the number of
        # measures, independent of how many classes there are.
        for row, values in enumerate(self.matrix):
            for column, value in enumerate(values):
                self.ax.text(column, row, value, va='center', ha='center')

        # Draw on the visualizer's own axes and keep the image so that
        # finalize can attach the color bar to it.
        self._image = self.ax.imshow(
            self.matrix, interpolation='nearest', cmap=self.cmap,
            vmin=0, vmax=1
        )
        return self.ax

    def finalize(self, **kwargs):
        """
        Finalize executes any subclass-specific axes finalization steps.
        The user calls poof and poof calls finalize.

        Parameters
        ----------
        kwargs : generic keyword arguments.
        """
        # Set the title of the classification report
        self.set_title('{} Classification Report'.format(self.name))

        # Add the color bar for the heatmap drawn by draw(), if any
        image = getattr(self, '_image', None)
        if image is not None:
            self.ax.figure.colorbar(image, ax=self.ax)

        # One x tick per measure, one y tick per class
        self.ax.set_xticks(np.arange(len(self._MEASURE_LABELS)))
        self.ax.set_xticklabels(self._MEASURE_LABELS, rotation=45)
        self.ax.set_yticks(np.arange(len(self.classes_)))
        self.ax.set_yticklabels(self.classes_)

        # Set the labels for the two axes
        self.ax.set_ylabel('Classes')
        self.ax.set_xlabel('Measures')
def classification_report(model, X, y=None, ax=None, classes=None, **kwargs):
    """Quick method:

    Displays precision, recall, and F1 scores for the model.
    Integrates numerical scores as well color-coded heatmap.

    This helper function is a quick wrapper to utilize the
    ClassificationReport ScoreVisualizer for one-off analysis.

    Parameters
    ----------
    model : the Scikit-Learn estimator (should be a classifier)

    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features.

    y : ndarray or Series of length n
        An array or series of target or class values.

    ax : matplotlib axes
        The axes to plot the figure on.

    classes : list of strings
        The names of the classes in the target

    kwargs : keyword arguments forwarded both to the visualizer
        constructor and to its fit method.

    Returns
    -------
    ax : matplotlib axes
        Returns the axes that the classification report was drawn on.
    """
    # Build the visualizer first so an unsuitable model fails early
    report = ClassificationReport(model, ax=ax, classes=classes, **kwargs)

    # Hold out 20% of the data for scoring
    split = train_test_split(X, y, test_size=0.2)
    X_train, X_test, y_train, y_test = split

    # Fit on the training portion, then score (which calls draw)
    report.fit(X_train, y_train, **kwargs)
    report.score(X_test, y_test)

    # Hand back the axes the report was rendered on
    return report.ax
##########################################################################
## Receiver Operating Characteristics
##########################################################################
class ROCAUC(ClassificationScoreVisualizer):
    """
    Plot the ROC to visualize the tradeoff between the classifier's
    sensitivity and specificity.
    """

    def __init__(self, model, ax=None, **kwargs):
        """
        Wrap a classifier to generate a ROC curve.

        Parameters
        ----------
        model : the Scikit-Learn estimator
            Should be an instance of a classifier, else the __init__ of
            the parent class raises an error.

        ax : the axis to plot the figure on.

        roc_color : color of the ROC curve
            Specify the color as a matplotlib color: you can specify
            colors in many weird and wonderful ways, including full names
            ('green'), hex strings ('#008000'), RGB or RGBA tuples
            ((0,1,0,1)) or grayscale intensities as a string ('0.8').
            Defaults to None.

        diagonal_color : color of the diagonal
            Specify the color as a matplotlib color. Defaults to
            LINE_COLOR.

        kwargs : keyword arguments passed to the super class.

        These parameters can be influenced later on in the visualization
        process, but can and should be set as early as possible.
        """
        super(ROCAUC, self).__init__(model, ax=ax, **kwargs)

        ## hoisted to ScoreVisualizer base class
        self.name = get_model_name(self.estimator)

        # Color map defaults as follows:
        #   ROC color is the current color in the cycle
        #   Diagonal color is the default LINE_COLOR
        self.colors = {
            'roc': kwargs.pop('roc_color', None),
            'diagonal': kwargs.pop('diagonal_color', LINE_COLOR),
        }

    def score(self, X, y=None, **kwargs):
        """
        Generates the predicted target values using the Scikit-Learn
        estimator, computes the ROC curve and its AUC, and draws them.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        Returns
        -------
        ax : the axis with the plotted figure
        """
        # NOTE(review): the curve is computed from self.predict output;
        # if that yields hard class labels rather than scores the curve
        # will have very few points -- TODO confirm whether probability
        # or decision scores should be used instead.
        y_pred = self.predict(X)
        self.fpr, self.tpr, self.thresholds = roc_curve(y, y_pred)
        self.roc_auc = auc(self.fpr, self.tpr)
        return self.draw(y, y_pred)

    def draw(self, y, y_pred):
        """
        Renders ROC-AUC plot.
        Called internally by score, possibly more than once

        Parameters
        ----------
        y : ndarray or Series of length n
            An array or series of target or class values (not read here;
            the curve is drawn from ``self.fpr`` and ``self.tpr``)

        y_pred : ndarray or Series of length n
            An array or series of predicted target values (not read here)

        Returns
        -------
        ax : the axis with the plotted figure
        """
        # Create the axis if it doesn't exist
        if self.ax is None:
            self.ax = plt.gca()

        # Plot on the visualizer's own axes so that a user-supplied ax
        # receives the curve that finalize later decorates.
        self.ax.plot(
            self.fpr, self.tpr, c=self.colors['roc'],
            label='AUC = {:0.2f}'.format(self.roc_auc)
        )

        # Plot the line of no discrimination to compare the curve to.
        # The dash style is given separately from the color so the color
        # is specified exactly once.
        self.ax.plot(
            [0, 1], [0, 1], linestyle='--', c=self.colors['diagonal']
        )

        return self.ax

    def finalize(self, **kwargs):
        """
        Finalize executes any subclass-specific axes finalization steps.
        The user calls poof and poof calls finalize.

        Parameters
        ----------
        kwargs : generic keyword arguments.
        """
        # Set the title and add the legend
        self.set_title('ROC for {}'.format(self.name))
        self.ax.legend(loc='lower right')

        # Set the axis limits for the ROC/AUC plot
        self.ax.set_xlim([-0.02, 1.0])
        self.ax.set_ylim([ 0.00, 1.1])
def roc_auc(model, X, y=None, ax=None, **kwargs):
    """Quick method:

    Displays the tradeoff between the classifier's
    sensitivity and specificity.

    This helper function is a quick wrapper to utilize the ROCAUC
    ScoreVisualizer for one-off analysis.

    Parameters
    ----------
    model : the Scikit-Learn estimator (should be a classifier)

    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features.

    y : ndarray or Series of length n
        An array or series of target or class values.

    ax : matplotlib axes
        The axes to plot the figure on.

    kwargs : keyword arguments forwarded both to the visualizer
        constructor and to its fit method.

    Returns
    -------
    ax : matplotlib axes
        Returns the axes that the roc-auc curve was drawn on.
    """
    # Build the visualizer first so an unsuitable model fails early
    curve = ROCAUC(model, ax=ax, **kwargs)

    # Hold out 20% of the data for scoring
    split = train_test_split(X, y, test_size=0.2)
    X_train, X_test, y_train, y_test = split

    # Fit on the training portion, then score (which calls draw)
    curve.fit(X_train, y_train, **kwargs)
    curve.score(X_test, y_test)

    # Hand back the axes the curve was rendered on
    return curve.ax
##########################################################################
## Class Balance Chart
##########################################################################
class ClassBalance(ClassificationScoreVisualizer):
    """
    Class balance chart that shows the support for each class in the
    fitted classification model displayed as a bar plot. It is initialized
    with a model and generates a class balance chart on draw.

    Parameters
    ----------
    ax: axes
        the axis to plot the figure on.

    model: estimator
        Scikit-Learn estimator object. Should be an instance of a classifier,
        else ``__init__()`` will raise an exception.

    classes: list
        A list of class names for the legend. If classes is None the
        classes are taken from the estimator's ``classes_`` attribute
        when fit is called.

    kwargs: dict
        Keyword arguments passed to the super class. Here, the optional
        ``colors`` entry is used to colorize the bars.

    These parameters can be influenced later on in the visualization
    process, but can and should be set as early as possible.
    """

    def __init__(self, model, ax=None, classes=None, **kwargs):
        super(ClassBalance, self).__init__(model, ax=ax, **kwargs)

        # finalize reads self.name for the title; set it explicitly here
        # the same way the sibling visualizers in this module do.
        self.name = get_model_name(self.estimator)

        self.colors = color_palette(kwargs.pop('colors', None))
        self.classes_ = classes

    def fit(self, X, y=None, **kwargs):
        """
        Fit via the super class and resolve the class names.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs: keyword arguments passed to Scikit-Learn API.

        Returns
        -------
        self : instance
            Returns the instance of the classification score visualizer
        """
        super(ClassBalance, self).fit(X, y, **kwargs)

        # Fall back to the estimator's classes when none were supplied.
        if self.classes_ is None:
            self.classes_ = self.estimator.classes_
        return self

    def score(self, X, y=None, **kwargs):
        """
        Generates the Scikit-Learn precision_recall_fscore_support and
        keeps the per-class support for plotting.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        Returns
        -------
        ax : the axis with the plotted figure
        """
        y_pred = self.predict(X)
        self.scores = precision_recall_fscore_support(y, y_pred)

        # The last element of the result is used as the per-class support
        self.support = dict(zip(self.classes_, self.scores[-1]))
        return self.draw()

    def draw(self):
        """
        Renders the class balance chart across the axis.

        Returns
        -------
        ax : the axis with the plotted figure
        """
        # Create the axis if it doesn't exist
        if self.ax is None:
            self.ax = plt.gca()

        #TODO: Would rather not have to set the colors with this method.
        # Refactor to make better use of yb_palettes module?
        colors = self.colors[0:len(self.classes_)]

        # Materialize the dict view: a view is not a sequence and cannot
        # be used directly as bar heights on Python 3.
        heights = list(self.support.values())
        self.ax.bar(
            np.arange(len(heights)), heights,
            color=colors, align='center', width=0.5
        )

        return self.ax

    def finalize(self, **kwargs):
        """
        Finalize executes any subclass-specific axes finalization steps.
        The user calls poof and poof calls finalize.

        Parameters
        ----------
        kwargs: generic keyword arguments.
        """
        # Set the title
        self.set_title('Class Balance for {}'.format(self.name))

        # Set the x ticks with the class names, in the same order in
        # which draw() laid out the bars
        labels = list(self.support.keys())
        self.ax.set_xticks(np.arange(len(labels)))
        self.ax.set_xticklabels(labels)

        # Leave 10% headroom above the tallest bar
        cmax = max(self.support.values())
        self.ax.set_ylim(0, cmax + cmax * 0.1)
def class_balance(model, X, y=None, ax=None, classes=None, **kwargs):
    """Quick method:

    Displays the support for each class in the
    fitted classification model displayed as a bar plot.

    This helper function is a quick wrapper to utilize the ClassBalance
    ScoreVisualizer for one-off analysis.

    Parameters
    ----------
    model : the Scikit-Learn estimator (should be a classifier)

    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features.

    y : ndarray or Series of length n
        An array or series of target or class values.

    ax : matplotlib axes
        The axes to plot the figure on.

    classes : list of strings
        The names of the classes in the target

    kwargs : keyword arguments forwarded both to the visualizer
        constructor and to its fit method.

    Returns
    -------
    ax : matplotlib axes
        Returns the axes that the class balance plot was drawn on.
    """
    # Build the visualizer first so an unsuitable model fails early
    balance = ClassBalance(model, ax=ax, classes=classes, **kwargs)

    # Hold out 20% of the data for scoring
    split = train_test_split(X, y, test_size=0.2)
    X_train, X_test, y_train, y_test = split

    # Fit on the training portion, then score (which calls draw)
    balance.fit(X_train, y_train, **kwargs)
    balance.score(X_test, y_test)

    # Hand back the axes the chart was rendered on
    return balance.ax