Source code for rosetta.modeling.categorical_fitter

"""
Various functions for fitting categorical models.
Put functions specific to logistic regression in multinomial_fitter
"""
import copy

import numpy as np

from sklearn.cross_validation import StratifiedKFold

from .. import common_math


[docs]def predict_proba_cv(clf, X, y, n_folds=5): """ Returns an out-of-sample clf.predict_proba(X, y). Parameters ---------- clf : sklearn classifier with a predict_proba method X : 2-D numpy array or DataFrame y : 1-D numpy array or Series Use this along with StratifiedKFold to determine splits. n_folds: int Returns ------- probas : np.ndarray or series Note ---- After folds are created, each training sets need to be large enough to contain at least one member of each class. This happens iff the number of members of each class is less than or equal to n_folds. """ X, _ = common_math.pandas_to_ndarray_wrap(X) y, _ = common_math.pandas_to_ndarray_wrap(y) n_classes = len(np.unique(y)) # We don't want to re-fit our original classifier (changing its coeff_). clf = copy.deepcopy(clf) cv = StratifiedKFold(y, n_folds=n_folds) probas = np.nan * np.ones((len(y), n_classes)) for i, (train, test) in enumerate(cv): if len(np.unique(y[train])) < n_classes: raise ValueError( "Training set did not contain samples from all classes." "Try decreasing the number of folds or use more data") probas[test, :] = clf.fit(X[train], y[train]).predict_proba(X[test]) return probas