# yellowbrick.bestfit
# Uses Scikit-Learn to compute a best fit function, then draws it in the plot.
#
# Author: Benjamin Bengfort <bbengfort@districtdatalabs.com>
# Created: Sun Jun 26 17:27:08 2016 -0400
#
# Copyright (C) 2016 District Data Labs
# For license information, see LICENSE.txt
#
# ID: bestfit.py [56236f3] benjamin@bengfort.com $
"""
Uses Scikit-Learn to compute a best fit function, then draws it in the plot.
"""
##########################################################################
## Imports
##########################################################################
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error as mse
from operator import itemgetter
from yellowbrick.style.palettes import LINE_COLOR
from yellowbrick.exceptions import YellowbrickValueError
##########################################################################
## Module Constants
##########################################################################
# Names of the various estimator functions
LINEAR = 'linear'
QUADRATIC = 'quadratic'
EXPONENTIAL = 'exponential'
LOG = 'log'
SELECT_BEST = 'select_best'
##########################################################################
## Draw Line of Best Fit
##########################################################################
[docs]def draw_best_fit(X, y, ax, estimator='linear', **kwargs):
"""
Uses Scikit-Learn to fit a model to X and y then uses the resulting model
to predict the curve based on the X values. This curve is drawn to the ax
(matplotlib axis) which must be passed as the third variable.
The estimator function can be one of the following:
'linear': Uses OLS to fit the regression
'quadratic': Uses OLS with Polynomial order 2
'exponential': Not implemented yet
'log': Not implemented yet
'select_best': Selects the best fit via MSE
The remaining keyword arguments are passed to ax.plot to define and
describe the line of best fit.
"""
# Estimators are the types of best fit lines that can be drawn.
estimators = {
LINEAR: fit_linear, # Uses OLS to fit the regression
QUADRATIC: fit_quadratic, # Uses OLS with Polynomial order 2
EXPONENTIAL: fit_exponential, # Not implemented yet
LOG: fit_log, # Not implemented yet
SELECT_BEST: fit_select_best, # Selects the best fit via MSE
}
# Check to make sure that a correct estimator value was passed in.
if estimator not in estimators:
raise YellowbrickValueError(
"'{}' not a valid type of estimator; choose from {}".format(
estimator, ", ".join(estimators.keys())
)
)
# Then collect the estimator function from the mapping.
estimator = estimators[estimator]
# Ensure that X and y are the same length
if len(X) != len(y):
raise YellowbrickValueError((
"X and y must have same length:"
" X len {} doesn't match y len {}!"
).format(len(X), len(y)))
# Ensure that X and y are np.arrays
X = np.array(X)
y = np.array(y)
# Verify that X is a two dimensional array for Scikit-Learn esitmators
# and that its dimensions are (n, 1) where n is the number of rows.
if X.ndim < 2:
X = X[:,np.newaxis] # Reshape X into the correct dimensions
if X.ndim > 2:
raise YellowbrickValueError(
"X must be a (1,) or (n,1) dimensional array not {}".format(x.shape)
)
# Verify that y is a (n,) dimensional array
if y.ndim > 1:
raise YellowbrickValueError(
"y must be a (1,) dimensional array not {}".format(y.shape)
)
# Uses the estimator to fit the data and get the model back.
model = estimator(X, y)
# Set the color if not passed in.
if 'c' not in kwargs and 'color' not in kwargs:
kwargs['color'] = LINE_COLOR
# Plot line of best fit onto the axes that were passed in.
# TODO: determine if xlim or X.min(), X.max() are better params
xr = np.linspace(*ax.get_xlim(), num=100)
ax.plot(xr, model.predict(xr[:,np.newaxis]), **kwargs)
return ax
##########################################################################
## Estimator Functions
##########################################################################
[docs]def fit_select_best(X, y):
"""
Selects the best fit of the estimators already implemented by choosing the
model with the smallest mean square error metric for the trained values.
"""
models = [fit(X,y) for fit in [fit_linear, fit_quadratic]]
errors = map(lambda model: mse(y, model.predict(X)), models)
return min(zip(models, errors), key=itemgetter(1))[0]
[docs]def fit_linear(X, y):
"""
Uses OLS to fit the regression.
"""
model = linear_model.LinearRegression()
model.fit(X, y)
return model
[docs]def fit_quadratic(X, y):
"""
Uses OLS with Polynomial order 2.
"""
model = make_pipeline(
PolynomialFeatures(2), linear_model.LinearRegression()
)
model.fit(X, y)
return model
[docs]def fit_exponential(X, y):
"""
Fits an exponential curve to the data.
"""
raise NotImplementedError("Exponential best fit lines are not implemented")
[docs]def fit_log(X, y):
"""
Fit a logrithmic curve to the data.
"""
raise NotImplementedError("Logrithmic best fit lines are not implemented")
if __name__ == '__main__':
import os
import pandas as pd
import matplotlib.pyplot as plt
path = os.path.join(os.path.dirname(__file__), "..", "examples", "data", "concrete.xls")
if not os.path.exists(path):
raise Exception("Could not find path for testing")
xkey = 'Fine Aggregate (component 7)(kg in a m^3 mixture)'
ykey = 'Coarse Aggregate (component 6)(kg in a m^3 mixture)'
data = pd.read_excel(path)
fig, axe = plt.subplots()
axe.scatter(data[xkey], data[ykey])
draw_best_fit(data[xkey], data[ykey], axe, 'select_best')
plt.show()