Source code for rosetta.modeling.var_create

import scipy as sp
import numpy as np

from rosetta.modeling import eda


# Logit helpers: element-wise logit of a series, and logit of a series' mean.
def logit(series):
    """
    Element-wise logit (log-odds) of probabilities.

    logit(p) = log(p / (1 - p)) = log(p) - log(1 - p)

    Parameters
    ----------
    series : Pandas.Series, np.ndarray or scalar
        Values are meant to lie in (0, 1).

    Returns
    -------
    Same kind of object as `series`, holding the log-odds of each value.
    """
    # log(p) - log1p(-p) is log(p / (1 - p)) without the explicit division,
    # so a plain Python float of exactly 1.0 does not raise
    # ZeroDivisionError; the numpy functions yield +/-inf at the boundaries.
    return np.log(series) - np.log1p(-series)
def logit_of_mean(series):
    """
    Apply this module's `logit` to the mean of a series.

    Parameters
    ----------
    series : Pandas.Series
        Any object exposing a ``mean()`` method works.

    Returns
    -------
    The value of ``logit(series.mean())``.
    """
    return logit(series.mean())
def sigmoidize(x, scale=5, mid=None):
    """
    Returns y, a sigmoidal version of the variable x.

    y = sig((x - mid) / scale)
    sig(z) = exp(z) / (1 + exp(z)) = 1 / (1 + exp(-z))

    Parameters
    ----------
    x : np.ndarray, ndim=1
    scale : positive real number
    mid : real number
        If None, use the mean of x.  An explicit 0 is honoured.

    Returns
    -------
    y : same kind of object as x, with values in [0, 1]
    """
    # Compare against None explicitly: `not mid` would also be true for
    # mid=0 and silently replace a requested midpoint of zero by the mean.
    if mid is None:
        mid = x.mean()

    z = (x - mid) / float(scale)

    # 1 / (1 + exp(-z)) equals exp(z) / (1 + exp(z)) but saturates to 0 or 1
    # for large |z| instead of evaluating inf / inf = nan.  exp(-z) may still
    # overflow to inf for very negative z; that is harmless here (1 / inf is
    # 0), so the overflow warning is silenced.
    with np.errstate(over='ignore'):
        return 1.0 / (1.0 + np.exp(-z))
def standardize(x):
    """
    Center x on its mean and divide by its standard deviation.

    Parameters
    ----------
    x : Series or DataFrame
        Any object providing ``mean()`` and ``std()`` is handled the same
        way; the result uses whatever ``x.std()`` computes.

    Returns
    -------
    ``(x - x.mean()) / x.std()``
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread
def build_xy_for_linearize(
        x, y, bins=10, Y_reducer=np.mean, x_lims=None, endpoints=None):
    """
    Return x and y for use in linearization.  Use with var_create.interp.

    Parameters
    ----------
    x : Pandas.Series
    y : Pandas.Series
    bins : positive integer
        Number of bins for x
    Y_reducer : Function
        Used to reduce Y in each of the bins.  E.g np.mean, logit_of_mean.
    x_lims : 2-tuple, (xmin, xmax)
        Only points with xmin < x < xmax are binned.  Rescaled x will be
        constant outside of this range.  Choose xmin, xmax such that you
        have enough data in the interval (xmin, xmax)
    endpoints : Array-like
        [xmin, xmax, ymin, ymax].  Makes sure F(xmin) = ymin, etc...
        If None, the actual min/max of x are paired with the first and last
        reduced y values.

    Returns
    -------
    x : Array
        The bin midpoints (with adjustments at the ends)
    y : Array
        y reduced in the bins
    """
    # Record the true extremes before trimming: they become the outer
    # x endpoints even when x_lims restricts the binned range.
    x_actualmin = x.min()
    x_actualmax = x.max()

    # Trim the range of x used
    if x_lims:
        mask = (x > x_lims[0]) & (x < x_lims[1])
    else:
        mask = np.ones(len(x), dtype=bool)

    # reduced_Y is the reduced y values with an index equal to the x midpoints
    reduced_Y, _ = eda.reducedY_vs_binnedX(
        x[mask], y[mask], Y_reducer=Y_reducer, bins=bins)

    # If we don't convert to float, we get an object series...
    x_midpts = reduced_Y.index.values.astype('float')

    # Stick on the endpoints
    if endpoints is None:
        # .iloc picks the first/last row by position.  Plain [0] / [-1] on a
        # Series indexed by bin midpoints is a label lookup (or a deprecated
        # positional fallback), not what is meant here.
        endpoints = (
            x_actualmin, x_actualmax, reduced_Y.iloc[0], reduced_Y.iloc[-1])

    x_extended = np.r_[endpoints[0], x_midpts, endpoints[1]]
    y_extended = np.r_[endpoints[2], reduced_Y, endpoints[3]]

    return x_extended, y_extended
def interp(x, y, t=1, scaling=None):
    """
    Return interpolation helpers for x and y.
    See build_xy_for_linearize for use in linearization.

    Parameters
    ----------
    x : Array-like
    y : Array-like
    t : Real number in [0, 1]
        With F(x) the linearization function,
        re-set F(x) = t*F(x) + (1-t)*x
    scaling : String
        If None, the output is not rescaled.
        If 'standardize', then F(x) will have zero mean and unit variance
        If 'unit', then F(x) will be on the interval [0, 1]

    Returns
    -------
    F_x : Array or Pandas.Series
        A rescaled version of x, i.e. F(x).
    F : Function that will rescale x
        A closure, so it cannot be pickled.  It is built on a linear
        scipy.interpolate.interp1d with default bounds handling, which
        rejects values outside the range of the x passed in here.

    Raises
    ------
    ValueError
        If scaling is not None, 'standardize' or 'unit'.

    Examples
    --------
    x4linear, y4linear = vc.build_xy_for_linearize(y_score, y)
    F_x, F = interp(x4linear, y4linear)
    """
    # Import the submodule explicitly: a bare `import scipy` is not
    # guaranteed to expose `scipy.interpolate` on every SciPy release.
    from scipy.interpolate import interp1d

    # Interpolate to get our first try at F
    F_1 = interp1d(x, y, kind='linear')

    # Reshape: blend the interpolant with the identity map
    def F_2(values):
        return t * F_1(values) + (1 - t) * values

    # Scaling constants are taken from F_2 on the knots and computed once,
    # rather than on every call of the returned function.
    F_2_x = F_2(x)

    if scaling is None:
        F_3 = F_2
    elif scaling == 'standardize':
        center = F_2_x.mean()
        spread = F_2_x.std()

        def F_3(values):
            return (F_2(values) - center) / spread
    elif scaling == 'unit':
        low = F_2_x.min()
        high = F_2_x.max()

        def F_3(values):
            return (F_2(values) - low) / (high - low)
    else:
        raise ValueError("Unknown scaling passed: %s" % scaling)

    return F_3(x), F_3