# Source code for rosetta.modeling.eda

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl

import statsmodels.api as sm
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform


def plot_corr_grid(
        corr, cluster=True, cluster_method='weighted', distance_fun=None,
        ax=None, **fig_kwargs):
    """
    Draw a correlation matrix as a grid, optionally reordered by clustering.

    When clustering is requested, the correlation is first turned into a
    distance with distance_fun, the distances are fed to
    scipy.cluster.hierarchy.linkage, and the rows/columns are reordered to
    follow the leaf order of the resulting dendrogram.  The default
    distance_fun gives highly correlated points a small distance.

    Parameters
    ----------
    corr : numpy ndarray or pandas DataFrame
        corr[i, j] is the correlation (should be between -1 and 1) of
        samples i and j.
    cluster : Boolean
        If True, reorder the matrix putting correlated entries nearby.
    cluster_method : String
        Passed to linkage as ``method``.  Either 'single', 'complete',
        'average', or 'weighted'.
        See scipy.cluster.hierarchy.linkage for details.
    distance_fun : Function
        inter-variable distance = distance_fun(corr).
        If None, use (1 - corr) / 2.
    ax : matplotlib AxesSubplot instance
        If None, use pl.gca()
    fig_kwargs : Additional kwargs
        Only 'figsize' is read; it defaults to (8, 8).

    Returns
    -------
    fig : matplotlib figure instance
        The figure returned by statsmodels' plot_corr, resized to figsize.
    """
    figsize = fig_kwargs.setdefault('figsize', (8, 8))

    if distance_fun is None:
        def distance_fun(c):
            return (1 - c) / 2.

    # Work with a labelled DataFrame from here on.
    if isinstance(corr, pd.DataFrame):
        names = corr.index.tolist()
    else:
        names = range(len(corr))
        corr = pd.DataFrame(corr, index=names, columns=names)

    if cluster:
        # Reorder rows and columns to follow the dendrogram's leaf order.
        corr = corr.copy()
        dist = distance_fun(corr)
        Z = linkage(squareform(dist.values), method=cluster_method)
        leaf_order = dendrogram(Z, no_plot=True)['leaves']
        names = [names[int(pos)] for pos in leaf_order]
        corr = corr.reindex(index=names, columns=names)

    target_ax = pl.gca() if ax is None else ax

    fig = sm.graphics.plot_corr(
        corr, xnames=names, ynames=names, ax=target_ax)
    fig.set_size_inches(figsize)

    return fig
def plot_corr_dendrogram(
        corr, cluster_method='weighted', distance_fun=None,
        **dendrogram_kwargs):
    """
    Plot a correlation matrix as a dendrogram (on the current axes).

    Uses scipy.cluster.hierarchy.linkage to compute clusters based on
    distance between samples.  Since correlation is passed in, it is first
    converted to a distance (using distance_fun).  The default distance_fun
    makes highly correlated points have low distance, and vice versa.

    Parameters
    ----------
    corr : numpy ndarray or pandas DataFrame
        corr[i, j] is the correlation (should be between -1 and 1) of
        samples i and j.  For a DataFrame the index is used as leaf labels;
        otherwise the labels are 0, 1, ..., len(corr) - 1.
    cluster_method : String
        Method to use to amalgamate clusters.
        Either 'single', 'complete', 'average', or 'weighted'.
        See scipy.cluster.hierarchy.linkage for details.
    distance_fun : Function, optional
        inter-variable distance = distance_fun(corr), called on the
        correlation values as a numpy array.  If None, use (1 - corr) / 2.
    dendrogram_kwargs : Additional kwargs
        Pass to the call of scipy.cluster.hierarchy.dendrogram().
        A 'labels' entry given here overrides the default labels.

    Returns
    -------
    The value returned by scipy.cluster.hierarchy.dendrogram().
    """
    if distance_fun is None:
        def distance_fun(c):
            return (1 - c) / 2.

    if isinstance(corr, pd.DataFrame):
        names = corr.index.tolist()
        corr = corr.values
    else:
        # Accept nested lists as well as arrays.
        corr = np.asarray(corr)
        names = list(range(len(corr)))

    dist = np.asarray(distance_fun(corr))
    Z = linkage(squareform(dist), method=cluster_method)

    # setdefault so an explicit labels= from the caller is not a
    # duplicate-keyword error.
    dendrogram_kwargs.setdefault('labels', names)
    return dendrogram(Z, **dendrogram_kwargs)
def plot_scatterXY(x, y, stride=1, plot_XequalsY=False, ax=None, **plt_kwargs):
    """
    Plot a XY scatter plot of two Series.

    Parameters
    ----------
    x, y : Pandas.Series
        Other sliceable sequences work too.  The axis labels are taken from
        the ``name`` attribute when present and truthy, else 'X' / 'Y'.
    stride : Positive integer
        If stride == n, then plot only every nth point
    plot_XequalsY : Boolean
        If True, plot the line X = Y in red.
    ax : matplotlib axes, optional
        Axes to draw on.  If None, draw through the pyplot interface
        (i.e. on the current axes).
    plt_kwargs : Additional kwargs to pass to plt.scatter
        If 'c' is a sequence with one entry per point it is thinned with the
        same stride; any other 'c' is passed through unchanged.
    """
    # DataFrames (and plain arrays) have no .name, so fall back gracefully.
    xname = getattr(x, 'name', None) or 'X'
    yname = getattr(y, 'name', None) or 'Y'

    n_points = len(x)
    x = x[::stride]
    y = y[::stride]

    if 'c' in plt_kwargs:
        colors = plt_kwargs['c']
        # Only per-point colour sequences are thinned; slicing a single
        # colour spec such as 'red' would corrupt it.
        if np.ndim(colors) >= 1 and len(colors) == n_points:
            plt_kwargs['c'] = colors[::stride]

    if ax is None:
        plot, scatter = plt.plot, plt.scatter
        set_xlabel, set_ylabel = plt.xlabel, plt.ylabel
    else:
        plot, scatter = ax.plot, ax.scatter
        set_xlabel, set_ylabel = ax.set_xlabel, ax.set_ylabel

    if plot_XequalsY:
        plot(x, x, 'r-')

    scatter(x, y, **plt_kwargs)
    set_xlabel(xname)
    set_ylabel(yname)
def reducedY_vs_binnedX(
        x, y, Y_reducer=np.mean, X_reducer='midpoint', bins=10,
        quantiles=False, labels=None):
    """
    Group X into bins and aggregate the Y values falling in each bin.

    Parameters
    ----------
    x : Pandas.Series with numeric data
    y : Pandas.Series with numeric data
    Y_reducer : function
        Applied (via groupby/agg) to the Y values in every bin
    X_reducer : function or 'midpoint'
        Applied to the X values in every bin; the results replace the bin
        labels in the index of both outputs.  If 'midpoint', the bin labels
        are kept as they are.
    bins : Positive Integer, optional
        Forwarded to get_labels when labels is None
    quantiles : Boolean, optional
        Forwarded to get_labels when labels is None
    labels : List-like, with len(labels) = len(x), optional
        If given, group by these labels instead of calling get_labels.

    Returns
    -------
    y_reduced : Series
        The reduced y values, indexed by the (possibly relabelled) bins and
        named after y.
    count_X : Series
        The number of X values in each bin, named '#[X=x]'.

    Examples
    --------
    Suppose Y is binary.  Then to compute P[Y=1|X=x] (for x inside the bins),
    as well as #[X=x], use:
    P_Y_g_X, count_X = eda.reducedY_vs_binnedX(x, y, Y_reducer=np.mean)
    """
    if labels is None:
        labels = get_labels(x, bins=bins, quantiles=quantiles)
    group_key = [labels]

    y_reduced = y.groupby(group_key).agg(Y_reducer)
    count_X = x.groupby(group_key).size()

    if X_reducer != 'midpoint':
        # Swap every bin label for the reduced X value of that bin.
        x_reduced = x.groupby(group_key).agg(X_reducer)
        relabel = dict((old, x_reduced[old]) for old in x_reduced.index)
        y_reduced = y_reduced.rename(relabel)
        count_X = count_X.rename(relabel)

    for result, result_name in ((count_X, '#[X=x]'), (y_reduced, y.name)):
        result.index.name = x.name
        result.name = result_name

    return y_reduced, count_X
def plot_reducedY_vs_binnedX(
        x, y, Y_reducer=np.mean, X_reducer=np.mean, bins=10, quantiles=False,
        plot_count_X=False, **plt_kwargs):
    """
    Bin X and, inside every bin, apply Y_reducer to the Y values.  Then plot.

    Parameters
    ----------
    x : Pandas.Series with numeric data
    y : Pandas.Series with numeric data
    Y_reducer : function
        Used to aggregate the Y values in every bin
    X_reducer : function
        Used to aggregate the X values in every bin.  This gives us the bin
        labels that are used as the indices.
    bins : Positive Integer, optional
        Number of bins to divide series
    quantiles : Boolean, optional
        If True, bin data using quantiles rather than an evenly divided range
    plot_count_X : Boolean, optional
        If True, plot count_X versus x in a separate subplot
    **plt_kwargs : Extra keyword args passed to the pandas ``plot`` calls.
        'figsize' defaults to (10, 5); 'subplots' is always forced to False.

    Examples
    --------
    Suppose Y is binary.  Then to plot P[Y=1|X=x] (for x inside the bins),
    as well as #[X=x], use:
    eda.plot_reducedY_vs_binnedX(x, y, Y_reducer=np.mean, plot_count_X=True)
    """
    y_reduced, count_X = reducedY_vs_binnedX(
        x, y, Y_reducer, X_reducer, bins, quantiles)

    # Set a default figure size
    plt_kwargs.setdefault('figsize', (10, 5))

    # We handle the subplots ourselves
    plt_kwargs['subplots'] = False

    if not plot_count_X:
        y_reduced.plot(**plt_kwargs)
        return

    if quantiles:
        # Function-call form prints identically on Python 2 and 3.
        print("Warning! plot_count_X is meaningless if quantiles==True")

    # Side-by-side axes: reduced Y on the left, bin counts on the right.
    fig, axes = plt.subplots(1, 2, figsize=plt_kwargs['figsize'])
    y_reduced.plot(ax=axes[0], **plt_kwargs)
    count_X.plot(ax=axes[1], title=count_X.name, **plt_kwargs)
def get_labels(series, bins=10, quantiles=False):
    """
    Divides series into bins and returns labels corresponding to midpoints of
    bins.

    Parameters
    ----------
    series : Pandas.Series of numeric data
    bins : Positive Integer, optional
        Number of bins to divide series.  Passed unchanged as the second
        argument of pandas.cut (or pandas.qcut).
    quantiles : Boolean, optional
        If True, bin data using quantiles (pandas.qcut) rather than an evenly
        divided range (pandas.cut)

    Returns
    -------
    labels : numpy ndarray of floats, with len(labels) = len(series)
        labels[i] is the midpoint of the bin holding series[i], or NaN when
        the binning produced NaN for that entry.
    """
    cutfun = pd.qcut if quantiles else pd.cut
    levels = cutfun(series, bins)

    labels = np.zeros(len(levels))
    for i, lev in enumerate(levels):
        if isinstance(lev, float):
            # NaN label occurs sometimes, just use as-is
            assert np.isnan(lev)
            labels[i] = lev
        elif hasattr(lev, 'mid'):
            # Interval objects expose their midpoint directly.
            labels[i] = lev.mid
        else:
            # String labels of the form '(start, end]'.
            parts = lev.split(',')
            start = parts[0][1:]
            end = parts[1][:-1]
            labels[i] = (float(start) + float(end)) / 2

    return labels
def hist_cols(
        df, cols_to_plot, num_cols, num_rows, figsize=None, **kwargs):
    """
    Plots histograms of columns of a DataFrame as subplots in one big plot.
    Handles nans and extreme values in a "graceful" manner by removing them
    and reporting their occurrence.

    A new figure is opened every num_rows * num_cols columns, so long column
    lists are spread over several figures titled 'Histograms 0',
    'Histograms 1', ...

    Parameters
    ----------
    df : Pandas DataFrame
    cols_to_plot : List
        Column names of df that will be plotted
    num_cols, num_rows : Positive integers
        Number of columns and rows in the plot
    figsize : (x, y) tuple, optional
        Size of the figure
    **kwargs : Keyword args
        Accepted for backward compatibility; currently not forwarded to any
        plotting call.
    """
    plots_per_figure = num_rows * num_cols

    for item_index, col_name in enumerate(cols_to_plot):
        # divmod keeps both indices integral on Python 2 and 3.
        figure_index, subplot_index = divmod(item_index, plots_per_figure)

        if subplot_index == 0:
            # First cell of a fresh grid: start a new figure.
            plt.figure(figsize=figsize)
            plt.clf()
            plt.suptitle('Histograms %d' % figure_index)

        # matplotlib subplot positions are 1-based.
        plt.subplot(num_rows, num_cols, subplot_index + 1)

        hist_one_col(df[col_name])
def hist_one_col(col):
    """
    Draw a histogram of a single column on the current axes.

    NaN entries and "extreme" entries (more than 10 standard deviations from
    the mean) are left out of the histogram; the fraction of each is written
    into the plot title together with the column name.
    """
    is_nan = np.isnan(col)
    center = col.mean()
    spread = col.std()
    is_extreme = np.fabs(col - center) > 10 * spread
    keep = np.logical_not(is_extreme) * np.logical_not(is_nan)

    n_total = float(len(col))
    nan_frac = is_nan.sum() / n_total
    extreme_frac = is_extreme.sum() / n_total

    if keep.sum() > 0:
        # NOTE(review): newer matplotlib replaced `normed` with `density`;
        # confirm against the supported matplotlib version.
        col[keep].hist(bins=50, normed=True)

    title = '%s. extreme: %.3f, nan: %.3f' % (
        col.name, extreme_frac, nan_frac)
    plt.title(title)