Source code for rosetta.modeling.fitting

"""
Common functions for fitting regression and classification models.
"""
import numpy as np
from numpy import linalg
import pandas as pd

from rosetta import common_math


[docs]def get_relative_error(reality, estimate): """ Compares estimate to reality and returns the the mean-square error: |estimate - reality|_F / |reality|_F where F is the Frobenius norm. """ diff = estimate - reality return linalg.norm(diff) / linalg.norm(reality)
[docs]def get_R2(Y, Y_hat): """ Gets the coefficient of determination R^2. """ diff_Y_Ybar = Y - Y.mean() SStotal = diff_Y_Ybar.dot(diff_Y_Ybar) diff_Y_Yhat = Y - Y_hat SSerr = diff_Y_Yhat.dot(diff_Y_Yhat) return 1 - SSerr / float(SStotal)
[docs]def get_MSerr(Y, Y_hat): """ Gets the mean square error """ err = Y_hat - Y return err.dot(err) / len(Y)
[docs]def standardize(df, dont_standardize=None): """ Parameters ---------- df : pandas DataFrame Contains independent variables dont_standardize : List Names of variables to not standardize Returns ------- Tuple of DataFrames: standardized_df, stats_df standardized_df is the standardized version of df stats_df contains the mean and std of the variables """ dont_standardize = [] if dont_standardize is None else dont_standardize ## Get the mean and std and then the standardized dataframe mu = df.mean() sigma = df.std() standardized_df = (df - mu) / sigma ## If necessary, don't change some columns for name in dont_standardize: standardized_df[name] = df[name] return standardized_df
[docs]class CoefficientConverter(object): """ For [un]standardizing/winsorizing coefficients and data. CoefficientConverter is initialized with one dataset, from this the standardization/winsorization rules are learned. The functions can be applied to other datasets. Standardization part of module provides the fundamental relation: X.dot(self.unstandardize_params(w_st)) = self.standardize(X).dot(w_st) WORKFLOW 1 1) Initialize with a DataFrame. From this frame we learn the rules. 2) To fit, we use self.transform to transform a (possibly) new DataFrame. This fit results in a set of "transformed params" w_tr 3) To predict Y_hat corresponding to new input X, we first compute X_tr = self.transform(X), and then use X_tr.dot(w_tr) WORKFLOW 2 (standardization only!!) 1) Initialize with a dataframe. From this frame we learn the standardization rules. 2a) To fit, we use self.standardize to standardize a (possibly) new DataFrame. This fit results in a set of "standardized params" w_st. 2b) We obtain the "unstandardized params" w = self.unstandardized_params(w_st) 3) To predict Y_hat corresponding to new input X, we use X.dot(w) """ def __init__( self, df, ones_column=None, dont_standardize=[], dont_winsorize=[], lower_quantile=0, upper_quantile=1, max_std=np.inf): """ Parameters ---------- df : Pandas.DataFrame We learn the standardization rules from this df. ones_column : String Name of a column that is all ones. This is required to use self.unstandardize_params(). You can have more than one column of all ones, but only one can be specified here. dont_standardize : List Names of variables that we will not standardize. dont_winsorize : List Names of variables that we will not winsorize. upper_quantile : Real number in [0, 1] The upper quantile above which we trim lower_quantile : Real number in [0, 1] The lower quantile below which we trim max_std : Non-negative real Trim values that are more than max_std standard deviations away from the mean This is done after quantile trimming. """ self.known_columns = list(df.columns) self.dont_winsorize = dont_winsorize self.upper_quantile = upper_quantile self.lower_quantile = lower_quantile self.max_std = max_std self.dont_standardize = dont_standardize # List of coefficients that should be standardized/winsorized self._should_standardize = list(df.columns.diff(dont_standardize)) self._should_winsorize = list(df.columns.diff(dont_winsorize)) # Initialize the rules self.stats = self._get_stats(df) self.clip_levels = self._get_clip_levels(df) # Get a list of columns that were constant. Error check against # dont_standardize self._const_columns = self._get_const_columns() self._ones_column = self._verify_ones_column(ones_column, df) def _get_stats(self, df): """ Creates self.stats, a DataFrame holding mean and standard deviation for all fields in df. """ # Create self.stats stats = pd.DataFrame({'mu': df.mean(), 'sigma': df.std()}) return stats def _get_clip_levels(self, df): def func(s): return _get_clip_levels_series( s, self.lower_quantile, self.upper_quantile, self.max_std) items = common_math.get_item_names(df) sw = items.intersection(self._should_winsorize) levels = pd.Series( np.nan * np.ones(len(items)), index=items).astype('O') if len(sw) > 0: # This cast to float prevents a mixed data type frame...which can # cause apply to act in a funny manner levels[sw] = df[sw].astype('float').apply(func) return levels def _get_const_columns(self): """ Returns a list of columns that were constant. Note that the first of these is the only one that will be unstandardized in self.unstandardize_params """ _const_columns = list(self.stats[self.stats.sigma == 0].index) # Make sure any and all constant columns were specified as columns # to notstandardize for col in _const_columns: assert col in self.dont_standardize, ( "Variable %s is constant, but was not specified in the " "dont_standardize initialization kwarg" % col) return _const_columns def _verify_ones_column(self, ones_column, df): """ If ones_column is indeed a column of ones, returns True, otherwise raises ValueError. """ if ones_column is None: return None elif all(df[ones_column] == 1): return ones_column else: raise ValueError( "The initialization parameter ones_column = %s is not in fact " "a column of all ones!" % ones_column) def _check_compatible(self, data): """ Raises ValueError if the columns/index of the DataFrame/Series "data" are not contained in self.known_columns. In this case, we don't know how to standardize/unstandardize/winsorize data, so we must raise an exception. """ diff = common_math.get_item_names(data).diff(self.known_columns) if diff: raise ValueError( "Data contained items we don't know how to work with: %s" % diff)
[docs] def standardize(self, data): """ Returns a standardized version of data. Parameters ---------- data : pandas Series or DataFrame Notes ----- data is standardized according to the rules that self was initialized with, i.e. the rules implicit in self.stats. """ self._check_compatible(data) # Convenience stats = self.stats standardized = data.copy().astype('float') if self._should_standardize: ss = common_math.get_item_names(data).intersection( self._should_standardize) standardized[ss] = (data[ss] - stats.mu[ss]) / stats.sigma[ss] return standardized
[docs] def unstandardize_params(self, w_st): """ Returns "w", an unstandardized version of w_st so that X.dot(w) = self.standardize(X).dot(w_st) Parameters ---------- w_st : Pandas.Series Index is names of variables Values are the fitted parameter values """ self._check_compatible(w_st) assert self._ones_column, ( "Specify a ones_column during initialization if you want to " "unstandardize") ## We will return this Series w = w_st.copy().astype('float') # ss = "should standardize" ss = common_math.get_item_names(w_st).intersection( self._should_standardize) ## Unstandardize colums that were standardized if len(ss) > 0: w_st_part_only = w_st[ss] sigma = self.stats.sigma[ss] w[ss] = w_st_part_only / sigma # Unstandardize the constant. Add the "excess" to self._ones_column if len(ss) > 0: mu = self.stats.mu[ss] w[self._ones_column] -= (mu * w_st_part_only / sigma).sum() return w
[docs] def winsorize(self, data): """ Winsorize the data using the rules determined during initialization. """ self._check_compatible(data) def func(series): lower, upper = self.clip_levels[series.name] return np.maximum(lower, np.minimum(upper, series)) # sw = "should winsorize" sw = common_math.get_item_names(data).intersection( self._should_winsorize) winsorized = data.copy() if len(sw) > 0: winsorized[sw] = winsorized[sw].apply(func) return winsorized
[docs] def transform(self, data): """ Winsorize then standardize data. Returns a copy. """ return self.standardize(self.winsorize(data))
def _get_clip_levels_series(series, lower_quantile, upper_quantile, max_std): """ Gets clip levels for winsorization. """ # Quantile trimming upper_q_value = series.quantile(upper_quantile) lower_q_value = series.quantile(lower_quantile) # Std trimming mu = series.mean() sigma = series.std() upper_s_value = mu + max_std * sigma lower_s_value = mu - max_std * sigma return max(lower_q_value, lower_s_value), min(upper_q_value, upper_s_value)
[docs]def winsorize(series, lower_quantile=0, upper_quantile=1, max_std=np.inf): """ Truncate all items in series that are in extreme quantiles. Parameters ---------- series : pandas.Series. Real valued. upper_quantile : Real number in [0, 1] The upper quantile above which we trim lower_quantile : Real number in [0, 1] The lower quantile below which we trim max_std : Non-negative real Trim values that are more than max_std standard deviations away from the mean Returns ------- winsorized_series : pandas.Series Notes ----- Trimming according to max_std is done AFTER quantile trimming. I.e. the std is computed on the series that has already been trimmed by quantile. """ lower, upper = _get_clip_levels_series( series, lower_quantile, upper_quantile, max_std) return np.maximum(lower, np.minimum(upper, series))