Source code for jmpy.plotting.scatter

import numpy as np

import scipy.interpolate as spi
import matplotlib as mpl
import matplotlib.backends.backend_agg as mbb

from jmpy import common
from jmpy.plotting import components


def scatter(x, y, data=None, legend=None, marker='o', alpha=.5,
            xscale='linear', yscale='linear', cmap='default',
            figsize=(12, 6), fit=None, fitparams=None, table=True,
            fig=None, axes=None, cgrid=None, **kwargs):
    """
    Scatter plots with regression lines

    :param x: str or ndarray
    :param y: str or ndarray
    :param data: pandas.Dataframe
    :param legend: str or ndarray, color/fit by this column
    :param marker: matplotlib marker style
    :param alpha: float, matplotlib alpha
    :param xscale: default == linear, any of matplotlib scale types
    :param yscale: default == linear, any of matplotlib scale types
    :param cmap: any of matplotlib cmaps
    :param figsize: default == (12, 6); only used when a new figure is created
    :param fit: [linear, quadratic, smooth, interpolate]
    :param fitparams: params to pass to fitting function
    :param table: show the regression table
    :param fig: existing figure to draw into; its axes are fetched with
        components.get_axes. Takes precedence over ``axes``.
    :param axes: existing axis to draw the points on; no legend axis or
        regression table is created by this call in that case
    :param cgrid: color lookup indexed by row position of the NaN-dropped
        data; built with common.colors.colormap when None
    :param kwargs: forwarded to the axis ``scatter`` call
    :return: the main axis if ``axes`` was supplied, otherwise the figure
    """
    # if no dataframe is supplied, create one
    if data is None:
        (x, y, _, legend, _, _), data = components.create_df(x, y, legend)

    if not fitparams:
        fitparams = {}

    df = data.copy()
    df = df[[i for i in (x, y, legend) if i]]

    # The fitting routines don't work with NaN, so drop those rows here.
    # Row order is deliberately left untouched: the previous
    # ``df.sort_values(x)`` discarded its result (so it never sorted), the
    # per-fit ordering is done by ``_medianify``, and reordering rows here
    # would change how a caller-supplied ``cgrid`` lines up with the rows.
    df = df.dropna()
    df = df.reset_index()

    # fit axis is for the regression equations
    makefitaxis = fit in ('linear', 'quadratic')

    # True only when we draw on a caller-supplied axis and therefore have
    # no legend axis (axl) or table axis (axt) of our own.
    axes_only = False

    if fig:
        canvas = mbb.FigureCanvasAgg(fig)
        axm, axc, axl, axt = components.get_axes(fig)
    elif axes:
        axm = axes
        axes_only = True
    else:
        fig = mpl.figure.Figure(figsize=figsize, tight_layout=True)
        canvas = mbb.FigureCanvasAgg(fig)
        axm, axc, axl, axt = components.create_axes(
            False, legend, table and makefitaxis, fig=fig)

    if legend:
        # colormap is supposed to be the goto function to get all colormaps
        # should return a colorgrid that maps each point to a set of colors
        if cgrid is None:
            cgrid = common.colors.colormap(df[legend], kind='discrete',
                                           cmap=cmap)

        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        # When a key occurs on several rows the last row's color wins.
        legend_color = {key: cgrid[i] for i, key in df[legend].items()}

        # if the axis is supplied, we do not want to create a legend axis
        if not axes:
            components.legend(sorted(legend_color.items()), axl)
            axl.set_title(legend, loc='left')

        equations = []
        for level in sorted(set(df[legend])):
            t = df[df[legend] == level]
            axm.scatter(x=t[x], y=t[y], c=legend_color[level],
                        marker=marker, alpha=alpha, **kwargs)

            if fit:
                xs, ys, fn = _get_fit(x, y, t, fit, fitparams)
                axm.plot(xs, ys, c=legend_color[level])

                if makefitaxis and table:
                    equations.append(
                        '${}: {}$\n'.format(str(level).strip(), fn))

        if makefitaxis and table and not axes:
            components.regressiontable(''.join(equations), axt, fig)
            axt.axis('off')

    else:
        axm.scatter(x=df[x], y=df[y], marker=marker, alpha=alpha, **kwargs)

        if fit:
            xs, ys, fn = _get_fit(x, y, df, fit, fitparams)
            axm.plot(xs, ys)

            # Without a table axis of our own (axes-only call) there is
            # nowhere to draw the table; previously this raised NameError.
            if makefitaxis and table and not axes_only:
                components.regressiontable('{}'.format(fn), axt, fig)

    axm.set_xlim(np.min(df[x]), np.max(df[x]))
    axm.set_ylim(np.min(df[y]), np.max(df[y]))
    axm.set_yscale(yscale)
    axm.set_xscale(xscale)
    axm.set_xlabel(x)
    axm.set_ylabel(y)

    if axes:
        return axm

    return canvas.figure
def _get_fit(x, y, df, fit, fitparams):
    """
    Internal method to return fitted data given an x and y and datatable

    :param x: name of the x column in df
    :param y: name of the y column in df
    :param df: data table
    :param fit: type of fit: linear, quadratic, smooth or interpolate
    :param fitparams: dict of keyword arguments forwarded to the fitting
        routine (np.polyfit, UnivariateSpline or interp1d); None means none
    :return: (xhat, yhat, eq): 100 evenly spaced x values spanning the data,
        the fitted values at those points, and the equation text for
        linear/quadratic fits (None for smooth/interpolate)
    :raises ValueError: if fit is not one of the supported kinds
    """
    if fit not in ('linear', 'quadratic', 'smooth', 'interpolate'):
        # Returning None here made the caller fail with an opaque
        # "cannot unpack" TypeError; name the bad value instead.
        raise ValueError(
            "unknown fit {!r}; expected one of 'linear', 'quadratic', "
            "'smooth', 'interpolate'".format(fit))

    fitparams = fitparams or {}

    xhat = np.linspace(df[x].min(), df[x].max(), num=100)
    # every fit works on one (median) y per distinct x, ordered by x
    xs, ys = _medianify(df, x, y)

    if fit == 'linear':
        mb = np.polyfit(xs, ys, 1, **fitparams)
        fit_fn = np.poly1d(mb)
        # Format from the raw polyfit output: poly1d drops leading zero
        # coefficients, so fit_fn.coeffs can be shorter than expected.
        # TODO: make this handle precision correctly
        eq = 'f(x) = {:.4f}x + {:.4f}'.format(mb[0], mb[1])
        return xhat, fit_fn(xhat), eq

    if fit == 'quadratic':
        mb = np.polyfit(xs, ys, 2, **fitparams)
        fit_fn = np.poly1d(mb)
        # TODO: make this handle precision correctly...
        eq = 'f(x) = {:.4f}x^2 + {:.4f}x + {:.4f}'.format(
            mb[0], mb[1], mb[2])
        return xhat, fit_fn(xhat), eq

    if fit == 'smooth':
        spl = spi.UnivariateSpline(xs, ys, **fitparams)
        return xhat, spl(xhat), None

    # fit == 'interpolate'
    f = spi.interp1d(xs, ys, **fitparams)
    return xhat, f(xhat), None


def _medianify(df, x, y):
    """
    Collapse repeated x values to a single row holding the median of y

    :param df: data table
    :param x: name of the x column
    :param y: name of the y column
    :return: (xs, ys) as two Series ordered by ascending x
    """
    # univariate spline chokes if there are multiple values per "x" so
    # we will take the median of all the doubled up x values.
    summ = df[[x, y]].groupby(x, sort=True)[y].median()
    summ = summ.reset_index()
    summ = summ.sort_values(x)
    return summ[x], summ[y]