# Source code for jmpy.plotting.scatter

import numpy as np

import scipy.interpolate as spi
import matplotlib as mpl
import matplotlib.backends.backend_agg as mbb

from jmpy import common
from jmpy.plotting import components


def scatter(x, y, data=None, legend=None, marker='o', alpha=.5,
            xscale='linear', yscale='linear', cmap='default', figsize=(12, 6),
            fit=None, fitparams=None, table=True, fig=None, axes=None, cgrid=None,
            **kwargs):
    """
    Scatter plots with regression lines

    :param x:  str or ndarray
    :param y: str or ndarray
    :param data: pandas.Dataframe
    :param legend: str or ndarray, color/fit by this column
    :param marker: matplotlib marker style
    :param alpha: float, matplotlib alpha
    :param xscale: default == linear, any of matplotlib scale types
    :param yscale: default == linear, any of matplotlib scale types
    :param cmap: any of matplotlib cmaps
    :param figsize: default == (12, 6); only used when a new figure is created
    :param fit: [linear, quadratic, smooth, interpolate]
    :param fitparams: dict of keyword params to pass to fitting function
    :param table:  show the regression table (linear and quadratic fits only)
    :param fig: existing figure to draw into; its axes are looked up with
        components.get_axes
    :param axes: existing axis to draw on; no legend axis or regression
        table is produced in that case
    :param cgrid: colors indexed by row position (after rows containing nan
        have been dropped); built from cmap when not supplied
    :param kwargs: forwarded to the scatter call on the main axis
    :return: the axis when ``axes`` is supplied, otherwise the figure
    """

    # if no dataframe is supplied, create one
    if data is None:
        (x, y, _, legend, _, _), data = components.create_df(x, y, legend)

    if not fitparams:
        fitparams = {}

    # Only the plotted columns are kept.  Rows with nan are dropped because
    # the fitting routines cannot handle them.  Row order is deliberately
    # left untouched: the fit helper orders its own input by x, and cgrid
    # is indexed by row position below.
    df = data[[i for i in (x, y, legend) if i]].dropna()
    df = df.reset_index(drop=True)

    # fit axis is for the regression equations
    makefitaxis = fit == 'linear' or fit == 'quadratic'

    if fig:
        canvas = mbb.FigureCanvasAgg(fig)
        axm, axc, axl, axt = components.get_axes(fig)
    elif axes:
        # caller owns the axis: no figure, canvas, legend or table axis here
        axm = axes
    else:
        fig = mpl.figure.Figure(figsize=figsize, tight_layout=True)
        canvas = mbb.FigureCanvasAgg(fig)
        axm, axc, axl, axt = components.create_axes(False, legend, table and makefitaxis, fig=fig)

    if legend:
        # colormap is supposed to be the goto function to get all colormaps
        # should return a colorgrid that maps each point to a set of colors
        if cgrid is None:
            cgrid = common.colors.colormap(df[legend],
                                           kind='discrete', cmap=cmap)

        # one color per legend value; the last row seen for a value wins
        legend_color = {}
        for i, key in df[legend].items():
            legend_color[key] = cgrid[i]

        # if the axis is supplied, we do not want to create a legend axis
        if not axes:
            components.legend(sorted(list(legend_color.items())), axl)
            axl.set_title(legend, loc='left')

        equations = []
        for l in sorted(set(df[legend])):
            t = df[df[legend] == l]
            axm.scatter(x=t[x], y=t[y], c=legend_color[l],
                        marker=marker, alpha=alpha, **kwargs)

            if fit:
                xs, ys, fn = _get_fit(x, y, t, fit, fitparams)
                axm.plot(xs, ys, c=legend_color[l])

                if makefitaxis and table:
                    equations.append('${}:  {}$\n'.format(str(l).strip(), fn))

        if makefitaxis and table and not axes:
            components.regressiontable(''.join(equations), axt, fig)
            axt.axis('off')

    else:
        axm.scatter(x=df[x], y=df[y],
                    marker=marker, alpha=alpha, **kwargs)
        if fit:
            xs, ys, fn = _get_fit(x, y, df, fit, fitparams)
            axm.plot(xs, ys)

            # no table axis exists when the caller supplied the axis
            if makefitaxis and table and not axes:
                components.regressiontable('{}'.format(fn), axt, fig)

    axm.set_xlim(np.min(df[x]), np.max(df[x]))
    axm.set_ylim(np.min(df[y]), np.max(df[y]))
    axm.set_yscale(yscale)
    axm.set_xscale(xscale)
    axm.set_xlabel(x)
    axm.set_ylabel(y)

    if axes:
        return axm

    return canvas.figure


def _get_fit(x, y, df, fit, fitparams):
    """
    Internal method to return a fitted line given an x and y and datatable

    The data is first reduced with _medianify, then fitted and evaluated on
    100 evenly spaced points spanning the range of x.

    :param x: x param (column name in df)
    :param y: y param (column name in df)
    :param df: data table
    :param fit: type of fit: linear, quadratic, smooth or interpolate
    :param fitparams: dict of keyword arguments forwarded to the fitter
        (np.polyfit, scipy UnivariateSpline or scipy interp1d)
    :return: (xhat, yhat, equation); equation is a string for linear and
        quadratic fits and None for smooth and interpolate
    :raises ValueError: if fit is not one of the supported types
    """
    supported = ('linear', 'quadratic', 'smooth', 'interpolate')
    if fit not in supported:
        # fail loudly instead of returning None, which the caller would
        # only notice as an unpacking error
        raise ValueError('unknown fit {!r}; expected one of: {}'.format(
            fit, ', '.join(supported)))

    xs, ys = _medianify(df, x, y)
    # xs holds the distinct x values, so this grid never leaves the range
    # the fitted/interpolating function is defined on
    xhat = np.linspace(xs.min(), xs.max(), num=100)

    if fit == 'linear' or fit == 'quadratic':
        if fit == 'linear':
            degree = 1
            template = 'f(x) = {:.4f}x + {:.4f}'
        else:
            degree = 2
            template = 'f(x) = {:.4f}x^2 + {:.4f}x + {:.4f}'

        coeffs = np.polyfit(xs, ys, degree, **fitparams)
        # Format from the raw polyfit output: np.poly1d trims leading zero
        # coefficients, which would leave too few terms for the template.
        # TODO: make this handle precision correctly
        eq = template.format(*coeffs)

        return xhat, np.polyval(coeffs, xhat), eq

    if fit == 'smooth':
        spl = spi.UnivariateSpline(xs, ys, **fitparams)
        return xhat, spl(xhat), None

    # fit == 'interpolate'
    f = spi.interp1d(xs, ys, **fitparams)
    return xhat, f(xhat), None


def _medianify(df, x, y):
    """
    Collapse repeated x values to a single point using the median of y

    :param df: data table
    :param x: name of the x column
    :param y: name of the y column
    :return: (xs, ys) pair of Series ordered by ascending x, with one entry
        per distinct x; ys holds the median y for that x
    """
    # univariate spline chokes if there are multiple values per "x" so
    # we will take the median of all the doubled up x values.
    summ = df[[x, y]].groupby(x)[y].median().reset_index()
    # groupby already orders by key; the explicit sort keeps the ordering
    # guarantee visible at the point where the fitters rely on it
    summ = summ.sort_values(x)

    return summ[x], summ[y]