Source code for chempy.util.regression

# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division, print_function)

try:
    import numpy as np
except ImportError:
    np = None

import warnings


def least_squares(x, y, w=1):  # w == 1 => OLS, w != 1 => WLS
    """ Least-squares (w or w/o weights) fit to data series.

    Linear regression (unweighted or weighted).

    Parameters
    ----------
    x : array_like
    y : array_like
    w : array_like, optional

    Returns
    -------
    length 2 tuple : pair of parameter estimates (intercept and slope)
    2x2 array : variance-covariance matrix
    float : R-squared (goodness of fit)

    Examples
    --------
    >>> beta, vcv, R2 = least_squares([0, 1, 2], [1, 3, 5])
    >>> all(abs(beta - [1, 2]) < 1e-14), R2 == 1, (abs(vcv) < 1e-14).all()
    (True, True, True)
    >>> b1, v1, r2_1 = least_squares([1, 2, 3], [0, 1, 4], [1, 1, 1])
    >>> b2, v2, r2_2 = least_squares([1, 2, 3], [0, 1, 4], [1, 1, .2])
    >>> abs(b2[1] - 1) < abs(b1[1] - 1)
    True

    References
    ----------
    Wikipedia & standard texts on the least squares method.
    Comment regarding R2 in WLS:
        Willett, John B., and Judith D. Singer. "Another cautionary note
        about R2: Its use in weighted least-squares regression analysis."
        The American Statistician 42.3 (1988): 236-238.
    """
    sqrtw = np.sqrt(w)
    y = np.asarray(y) * sqrtw
    x = np.asarray(x)
    X = np.ones((x.size, 2))
    X[:, 1] = x
    if hasattr(sqrtw, 'ndim') and sqrtw.ndim == 1:
        sqrtw = sqrtw.reshape((sqrtw.size, 1))
    X *= sqrtw
    beta = np.linalg.lstsq(X, y)[0]
    eps = X.dot(beta) - y
    SSR = eps.T.dot(eps)  # sum of squared residuals
    vcv = SSR/(x.size - 2)*np.linalg.inv(X.T.dot(X))
    TSS = np.sum(np.square(y - np.mean(y)))  # total sum of squares
    R2 = 1 - SSR/TSS
    return beta, vcv, float(R2)
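
# A minimal usage sketch (added for illustration, not part of the original
# module). The data are made up: the last point is an outlier, and giving it
# a low weight should pull the WLS slope closer to 2 than the OLS slope.
def _demo_least_squares():
    x = np.array([0., 1., 2., 3.])
    y = np.array([1.1, 2.9, 5.2, 11.0])  # last point is an outlier
    beta_ols, vcv_ols, R2_ols = least_squares(x, y)
    beta_wls, vcv_wls, R2_wls = least_squares(x, y, w=[1, 1, 1, 1e-2])
    assert abs(beta_wls[1] - 2) < abs(beta_ols[1] - 2)  # outlier downweighted
    return beta_ols, beta_wls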
def irls(x, y, w_cb=lambda x, y, b, c: x**0, itermax=16, rmsdwtol=1e-8,
         full_output=False):
    """ Iteratively reweighted least squares

    Parameters
    ----------
    x : array_like
    y : array_like
    w_cb : callable
        signature (x, y, beta, cov) -> weight
    itermax : int
    rmsdwtol : float
    full_output : bool

    Returns
    -------
    beta : length-2 array
        parameters
    cov : 2x2 array
        variance-covariance matrix
    info : dict
        if ``full_output == True``, keys:
          - weights
          - niter
          - success
    """
    if itermax < 1:
        raise ValueError("itermax must be >= 1")
    weights = []
    w = np.ones_like(x)
    rmsdw = np.inf
    ii = 0
    while rmsdw > rmsdwtol and ii < itermax:
        weights.append(w)
        beta, cov, r2 = least_squares(x, y, w)
        old_w = w.copy()
        w = w_cb(x, y, beta, cov)
        rmsdw = np.sqrt(np.mean(np.square(w - old_w)))
        ii += 1
    if ii == itermax:
        warnings.warn('itermax reached')
    if full_output:
        return beta, cov, {'weights': weights, 'niter': ii,
                           'success': ii < itermax}
    else:
        return beta, cov
irls.ones = lambda x, y, b, c: 1
if np is not None:
    irls.exp = lambda x, y, b, c: np.exp(b[1]*x)
    irls.gaussian = lambda x, y, b, c: np.exp(-(b[1]*x)**2)  # gaussian weighting
    irls.abs_residuals = lambda x, y, b, c: np.abs(np.exp(b[0] + b[1]*x) - np.exp(y))
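
# A hypothetical sketch of irls with one of the predefined weighting
# callbacks (illustrative data; since y is exactly linear, the fitted
# parameters are the same under any positive weights and the reweighting
# converges after two iterations):
def _demo_irls():
    x = np.linspace(0, 1, 10)
    y = 3*x + 1
    beta, cov, info = irls(x, y, w_cb=irls.gaussian, full_output=True)
    assert info['success'] and abs(beta[1] - 3) < 1e-8
    return beta, info['niter']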
def plot_fit(x, y, beta, kw_data=None, kw_fit=None):
    """ Plot data points and the fitted line y = beta[0] + beta[1]*x. """
    import matplotlib.pyplot as plt
    kw_data, kw_fit = kw_data or {}, kw_fit or {}
    plt.plot(x, y, linestyle='None', **kw_data)
    plt.plot(x[[0, -1]], beta[0] + beta[1]*x[[0, -1]], marker='None', **kw_fit)
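
# Hypothetical plotting example (assumes a working matplotlib backend; note
# that plot_fit indexes x as x[[0, -1]], so x must be a numpy array, not a
# list):
def _demo_plot_fit():
    import matplotlib.pyplot as plt
    x, y = np.array([0., 1., 2.]), np.array([1., 3., 5.])
    beta, vcv, R2 = least_squares(x, y)
    plot_fit(x, y, beta, kw_data=dict(marker='o'), kw_fit=dict(label='fit'))
    plt.show()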
def avg_params(opt_params, cov_params, label_cb=None, plot=False):
    """ Calculates the variance-weighted average of parameter pairs.

    Each parameter pair is weighted by the reciprocal of its variance
    (taken from the diagonal of the respective covariance matrix).

    Parameters
    ----------
    opt_params : array_like
        of shape (nfits, 2)
    cov_params : array_like
        of shape (nfits, 2, 2)
    label_cb : callable, optional
        signature (avg_beta, var_avg_beta) -> str
    plot : bool

    Returns
    -------
    avg_beta : length-2 array
    var_avg_beta : length-2 array
    """
    var_beta = np.vstack((cov_params[:, 0, 0], cov_params[:, 1, 1])).T
    avg_beta, sum_of_weights = np.average(opt_params, axis=0,
                                          weights=1/var_beta, returned=True)
    var_avg_beta = np.sum(np.square(opt_params - avg_beta)/var_beta, axis=0)/(
        (avg_beta.shape[0] - 1) * sum_of_weights)
    if plot:
        import matplotlib.pyplot as plt
        if label_cb is not None:
            lbl = label_cb(avg_beta, var_avg_beta)
        else:
            lbl = None
        plt.errorbar(opt_params[:, 0], opt_params[:, 1], marker='s', ls='None',
                     xerr=var_beta[:, 0]**0.5, yerr=var_beta[:, 1]**0.5)
        plt.xlabel(r'$\beta_0$')
        plt.ylabel(r'$\beta_1$')
        plt.title(r'$y(x) = \beta_0 + \beta_1 \cdot x$')
        plt.errorbar(avg_beta[0], avg_beta[1], xerr=var_avg_beta[0]**0.5,
                     yerr=var_avg_beta[1]**0.5, marker='o', c='r',
                     linewidth=2, markersize=10, label=lbl)
        plt.legend(numpoints=1)
    return avg_beta, var_avg_beta
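
# A minimal sketch (illustrative data) of pooling two independent fits with
# avg_params; opt_params has shape (nfits, 2) and cov_params (nfits, 2, 2):
def _demo_avg_params():
    b1, c1, _ = least_squares([0, 1, 2], [1.0, 3.1, 4.9])
    b2, c2, _ = least_squares([0, 1, 2], [1.2, 2.9, 5.1])
    avg_beta, var_avg_beta = avg_params(np.array([b1, b2]),
                                        np.array([c1, c2]))
    return avg_beta, var_avg_beta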