Source code for rosetta.common_math
"""
Common math functions.
"""
import numpy as np
import pandas as pd
from numpy.random import choice
from scipy.sparse import isspmatrix
[docs]def pandas_to_ndarray_wrap(X, copy=True):
"""
Converts X to a ndarray and provides a function to help convert back
to pandas object.
Parameters
----------
X : Series/DataFrame/ndarray
copy : Boolean
If True, return a copy.
Returns
-------
Xvals : ndarray
If X is a Series/DataFrame, then Xvals = X.values,
if ndarray, Xvals = X
F : Function
F(Xvals) = X
"""
if copy:
X = X.copy()
if isinstance(X, pd.Series):
return X.values, lambda Z: pd.Series(np.squeeze(Z), index=X.index)
elif isinstance(X, pd.DataFrame):
return X.values, lambda Z: pd.DataFrame(
Z, index=X.index, columns=X.columns)
elif isinstance(X, np.ndarray) or isspmatrix(X):
return X, lambda Z: Z
else:
raise ValueError("Unhandled type: %s" % type(X))
[docs]def subsample_arr(arr, N=None, frac_keep=None):
"""
Subsample a Series, DataFrame, or ndarray along axis 0.
Parameters
----------
arr : Series, DataFrame, or ndarray
N : Integer
Number of samples to keep
frac_keep : Real in [0, 1]
Fraction of samples to keep
Returns
-------
subsampled : Series, DataFrame, or ndarray
A copy
"""
# Input checking
assert ((N is None) and (frac_keep is not None)) \
or ((N is not None) and (frac_keep is None))
#
if N is None:
N = int(len(arr) * frac_keep)
if isinstance(arr, np.ndarray):
index = choice(range(len(arr)), size=N, replace=False)
return arr[np.ix_(index)]
elif isinstance(arr, pd.Series) or isinstance(arr, pd.DataFrame):
index = choice(arr.index, size=N, replace=False)
return arr.ix[index]
else:
raise ValueError("arr of unhandled type: %s" % type(arr))
[docs]def get_item_names(data):
"""
If DataFrame, return columns, if Series, return index.
"""
if isinstance(data, pd.Series):
items = data.index
elif isinstance(data, pd.DataFrame):
items = data.columns
else:
raise TypeError("Argument type %s is a type not handled" % type(data))
return items
[docs]def series_to_frame(data):
"""
If length(N) Series, return an N x 1 Frame with name equal to the series
name. If frame, passthrough.
Parameters
----------
data : pandas Series or DataFrame.
"""
if isinstance(data, pd.Series) or isinstance(data, pd.DataFrame):
data = pd.DataFrame(data)
else:
raise ValueError("type(data) = %s is not handled" % type(data))
return data