# Source code for rosetta.modeling.var_create

import scipy as sp
import numpy as np

from rosetta.modeling import eda


# Define the logit function that acts on series to get the logit of the series
def logit(series):
    """
    Logit (log-odds) for Pandas.Series

    Computes log(p / (1 - p)) elementwise.

    Parameters
    ----------
    series : Pandas.Series, np.ndarray or real number
        Values p, expected to lie strictly between 0 and 1.

    Returns
    -------
    The elementwise log-odds, with the same shape as the input.
    """
    # log(p) - log1p(-p) equals log(p / (1 - p)); log1p keeps precision
    # when p is close to zero.
    return np.log(series) - np.log1p(-series)


def logit_of_mean(series):
    """
    Logit of the mean of a pandas series

    Parameters
    ----------
    series : Pandas.Series
        Anything exposing a .mean() method works.

    Returns
    -------
    The result of logit() applied to series.mean().
    """
    return logit(series.mean())


def sigmoidize(x, scale=5, mid=None):
    """
    Returns y, a sigmoidal version of the variable x.
    y = sig((x - mid) / scale)
    sig(z) = exp(z) / (1 + exp(z))

    Parameters
    ----------
    x : np.ndarray, ndim=1
    scale : positive real number
    mid : real number
        If None, use the mean

    Returns
    -------
    y : same shape as x, with values in [0, 1]
    """
    # Test against None explicitly so that an explicit mid=0 is honoured
    # instead of being replaced by the mean.
    if mid is None:
        mid = x.mean()

    z = (x - mid) / float(scale)

    # sig(z) = 1 / (1 + exp(-z)) = exp(-log(1 + exp(-z))).
    # logaddexp(0, -z) evaluates log(1 + exp(-z)) without overflowing, so
    # large |z| saturates at 0 or 1 rather than producing inf / inf = nan.
    return np.exp(-np.logaddexp(0, -z))


def standardize(x):
    """
    Standardizes a Series or DataFrame.

    Subtracts x.mean() and divides by x.std(), so the result depends on
    how the container defines those two methods.

    Parameters
    ----------
    x : Pandas.Series or Pandas.DataFrame

    Returns
    -------
    The centered and rescaled version of x.
    """
    center = x.mean()
    spread = x.std()
    return (x - center) / spread


def build_xy_for_linearize(
    x, y, bins=10, Y_reducer=np.mean, x_lims=None, endpoints=None):
    """
    Return x and y for use in linearization.  Use with var_create.interp.

    Parameters
    ----------
    x : Pandas.Series
    y : Pandas.Series
    bins : positive integer
        Number of bins for x
    Y_reducer : Function
        Used to reduce Y in each of the bins.  E.g np.mean, logit_of_mean.
    x_lims : 2-tuple, (xmin, xmax)
        Rescaled x will be constant outside of this range.
        Choose xmin, xmax such that you have enough data in the interval
        (xmin, xmax)
    endpoints : Array-like
        [xmin, xmax, ymin, ymax].  Makes sure F(xmin) = ymin, etc...
        If None, the actual min/max of x are paired with the first and
        last reduced y values.

    Returns
    -------
    x : Array
        The bin midpoints (with adjustments at the ends)
    y : Array
        y reduced in the bins
    """
    # Record the untrimmed range first; it supplies the default outer
    # endpoints even when x_lims trims the data used for binning.
    x_actualmin = x.min()
    x_actualmax = x.max()
    if x_lims:
        # Only points strictly inside (xmin, xmax) take part in the binning
        mask = (x > x_lims[0]) & (x < x_lims[1])
    else:
        mask = np.ones(len(x), dtype=bool)

    # reduced_Y is the reduced y values with an index equal to the x midpoints
    reduced_Y, _ = eda.reducedY_vs_binnedX(
        x[mask], y[mask], Y_reducer=Y_reducer, bins=bins)

    # If we don't convert to float, we get an object series...
    x_midpts = reduced_Y.index.values.astype('float')

    # Stick on the endpoints
    if endpoints is None:
        # reduced_Y is indexed by bin midpoints, so plain [0] / [-1] could be
        # read as labels; .iloc always selects the first / last bin.
        endpoints = (
            x_actualmin, x_actualmax, reduced_Y.iloc[0], reduced_Y.iloc[-1])
    x_extended = np.r_[endpoints[0], x_midpts, endpoints[1]]
    y_extended = np.r_[endpoints[2], reduced_Y, endpoints[3]]

    return x_extended, y_extended


def interp(x, y, t=1, scaling=None):
    """
    Return interpolation helpers for x and y.
    See build_xy_for_linearize for use in linearization.

    Parameters
    ----------
    x : Array-like
    y : Array-like
    t : Real number in [0, 1]
        With F(x) the linearization function, re-set F(x) = t*F(x) + (1-t)*x
    scaling : String
        If None, the output is not rescaled and Y_reducer(bin_j) = x_j where
            x_j is the midpoint of bin_j.
        If 'standardize', then output will have zero mean and unit variance
        If 'unit', then output will be on the interval [0, 1]

    Returns
    -------
    F_x : Array-like
        A rescaled version of x.
    F : Function that will rescale x
        Cannot be pickled... :(
        Built on scipy's linear interp1d over (x, y), so it inherits that
        interpolator's handling of values outside the range of x.

    Raises
    ------
    ValueError
        If scaling is not None, 'standardize' or 'unit'.

    Examples
    --------
    x4linear, y4linear = vc.build_xy_for_linearize(y_score, y)
    F_x, F = interp(x4linear, y4linear)
    """
    # Imported here so the interpolate submodule is guaranteed to be loaded
    # even if only the top-level scipy package was imported.
    from scipy.interpolate import interp1d

    def _as_arithmetic(values):
        # Lists and tuples do not support elementwise arithmetic
        # ((1 - t) * list repeats or fails), so promote them to arrays.
        # Arrays, Series and scalars are passed through untouched.
        if isinstance(values, (list, tuple)):
            return np.asarray(values)
        return values

    x = _as_arithmetic(x)

    # Interpolate to get our first try at F
    F_1 = interp1d(x, y, kind='linear')

    # Reshape: blend the interpolant with the identity according to t
    def F_2(values):
        values = _as_arithmetic(values)
        return t * F_1(values) + (1 - t) * values

    # Scaling statistics are taken from F_2 at the knots x, computed once
    # here rather than on every call of the returned function.
    F_2_x = F_2(x)
    if scaling is None:
        F_3 = F_2
    elif scaling == 'standardize':
        center = F_2_x.mean()
        spread = F_2_x.std()

        def F_3(values):
            return (F_2(values) - center) / spread
    elif scaling == 'unit':
        low = F_2_x.min()
        span = F_2_x.max() - low

        def F_3(values):
            return (F_2(values) - low) / span
    else:
        raise ValueError("Unknown scaling passed: %s" % scaling)

    return F_3(x), F_3