Source code for metaseq.plotutils

"""
Module with handy utilities for plotting genomic signal
"""
import itertools
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import mlab
import numpy as np
from matplotlib.ticker import MaxNLocator
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib import gridspec
import colormap_adjust
from scipy import stats


[docs]def ci_plot(x, arr, conf=0.95, ax=None, line_kwargs=None, fill_kwargs=None):
    """
    Plots the mean and 95% ci for the given array on the given axes

    Parameters
    ----------
    x : 1-D array-like
        x values for the plot

    arr : 2-D array-like
        The array to calculate mean and std for

    conf : float [.5 - 1]
        Confidence interval to use

    ax : matplotlib.Axes
        The axes object on which to plot

    line_kwargs : dict
        Additional kwargs passed to Axes.plot

    fill_kwargs : dict
        Additiona kwargs passed to Axes.fill_between
    """
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)

    line_kwargs = line_kwargs or {}
    fill_kwargs = fill_kwargs or {}

    m, lo, hi = ci(arr, conf)
    ax.plot(x, m, **line_kwargs)
    ax.fill_between(x, lo, hi, **fill_kwargs)
    return ax


[docs]def imshow(arr, x=None, ax=None, vmin=None, vmax=None, percentile=True,
           strip=False, features=None, conf=0.95, sort_by=None,
           line_kwargs=None, fill_kwargs=None, imshow_kwargs=None, figsize=(5, 12),
           width_ratios=(4, 1), height_ratios=(4, 1),
           subplot_params=dict(wspace=0.1, hspace=0.1),
           subset_by=None, subset_order=None,):
    """
    Do-it-all function to help with plotting heatmaps

    Parameters
    ----------
    arr : array-like

    x : 1D array
        X values to use.  If None, use range(arr.shape[1])

    ax : matplotlib.Axes
        If not None, then only plot the array on the provided axes.  This will
        ignore any additional arguments provided that apply to figure-level
        configuration or to the average line plot.  For example, `figsize`,
        `width_ratios`, `height_ratios`, `subplot_params`, `line_kwargs`, and
        `fill_kwargs` will all be ignored.

    vmin, vmax : float

    percentile : bool
        If True, then treat values for `vmin` and `vmax` as percentiles rather
        than absolute values.

    strip : bool
        Include a strip plot alongside the array

    features : pybedtools.BedTool or string filename
        Features used to construct the array

    conf : float
        Confidence interval to use in line plot.

    sort_by : array-like
        Use the provided array to sort the array (e.g., an array of expression
        values).  This array will be argsorted to get the proper order.

    line_kwargs, fill_kwargs : dict
        Passed directly to `ci_plot`.

    figsize : tuple
        (Width, height) of the figure to create.

    imshow_kwargs : dict
        Passed directly to matplotlib.pyplot.imshow.  By default, arguments
        used are `origin='lower'`, `aspect="auto"` and a colormap from
        colormap_adjust.smart_colormap generated using the provided `vmin` and
        `vmax`.

    width_ratios, height_ratios: tuple
        These tuples are passed to the `new_shell` function.  The default
        values set up a 2x2 configuration of panels for heatmap, line plot,
        colorbar axes, and optional strip plot.  However modifying
        `width_ratios` or `height_ratios` can be used to create more or fewer panels.

    subplot_params : dict
        Passed to Figure.subplots_adjust

    subset_by : array
        An array of any type (but usually int or str) that contains a class
        label for each row in the heatmap array.  For example, to subset by
        expression, an array the values of "up", "down", or "unchanged" at each
        of the positions could be provided.

        Note that the heatmap array is first sorted by `sort_by` and then split
        into groups according to `subset_by`, so each subset remains sorted by
        `sort_by`.

    subset_order : list-like
        This provides the order in which the subsets are plotted.  Since the
        default imshow arguments contain `origin="lower"`, these will be
        plotted in order starting at the bottom of the heatmap.

    """
    if ax is None:
        fig = new_shell(
            figsize=figsize,
            strip=strip,
            subplot_params=subplot_params,
            width_ratios=width_ratios,
            height_ratios=height_ratios)

    if x is None:
        x = np.arange(arr.shape[1] + 1)

    if percentile:
        if vmin is None:
            vmin = arr.min()
        else:
            vmin = mlab.prctile(arr.ravel(), vmin)
        if vmax is None:
            vmax = arr.max()
        else:
            vmax = mlab.prctile(arr.ravel(), vmax)
    else:
        if vmin is None:
            vmin = arr.min()
        if vmax is None:
            vmax = arr.max()

    cmap = colormap_adjust.smart_colormap(vmin, vmax)
    _imshow_kwargs = dict(origin='lower', cmap=cmap, vmin=vmin, vmax=vmax,
                          aspect='auto')
    if imshow_kwargs is not None:
        _imshow_kwargs.update(imshow_kwargs)

    # previously we did an argsort first; with subsetting we don't want to do
    # that yet....
    #if sort_by is not None:
    #    ind = np.argsort(sort_by)
    #else:
    #    ind = np.arange(arr.shape[0])

    if sort_by is None:
        sort_by = np.arange(arr.shape[0])

    if ax is None:
        array_ax = fig.array_axes
    else:
        array_ax = ax

    # If not provided, assume all in the same subset.
    if subset_by is None:
        subset_by = np.zeros(arr.shape[0])

    # Ensure always array, since we're doing indexing tricks
    if not isinstance(subset_by, np.ndarray):
        subset_by = np.array(subset_by)

    # If not provided, use sorted order
    if subset_order is None:
        subset_order = sorted(np.unique(subset_by))

    inds = []
    for cls in subset_order:
        subset_ind = np.nonzero(subset_by == cls)[0]
        subset_sort_by = sort_by[subset_ind]
        subset_argsort_by = np.argsort(subset_sort_by)
        inds.append(subset_ind[subset_argsort_by])
    ind = np.concatenate(inds)

    mappable = array_ax.imshow(
        arr[ind, :],
        extent=(x.min(), x.max(), 0, arr.shape[0]),
        **_imshow_kwargs
    )

    if line_kwargs is None:
        line_kwargs = {}
    if fill_kwargs is None:
        fill_kwargs = {}

    if isinstance(line_kwargs, dict):
        line_kwargs = [line_kwargs]
    if isinstance(fill_kwargs, dict):
        fill_kwargs = [fill_kwargs]

    _line_kwargs = itertools.cycle(line_kwargs)
    _fill_kwargs = itertools.cycle(fill_kwargs)

    if ax is None:
        plt.colorbar(mappable, fig.cax)
        for subset_ind, label, _lkw, _fkw in zip(inds, subset_order, _line_kwargs, _fill_kwargs):
            ci_plot(
                x,
                arr[subset_ind],
                ax=fig.line_axes,
                line_kwargs=_lkw,
                fill_kwargs=_fkw,
            )
        return fig
    else:
        return ax.figure


[docs]def add_labels_to_subsets(ax, subset_by, subset_order, text_kwargs=None,
                          add_hlines=True, hline_kwargs=None):
    """
    Helper function for adding labels to subsets within a heatmap.

    Assumes that imshow() was called with `subsets` and `subset_order`.

    Parameters
    ----------
    ax : matplotlib.Axes
        The axes to label.  Generally you can use `fig.array_axes` attribute of
        the Figure object returned by `metaseq.plotutils.imshow`.

    subset_by, subset_order : array, list
        See `metaseq.plotutils.imshow()` docstring; these should be the same
        `subsets` and `subset_order` that were provided to that function.
    """

    _text_kwargs = dict(transform=ax.get_yaxis_transform())
    if text_kwargs:
        _text_kwargs.update(text_kwargs)

    _hline_kwargs = dict(color='k')
    if hline_kwargs:
        _hline_kwargs.update(hline_kwargs)
    pos = 0
    for label in subset_order:
        ind = subset_by == label
        last_pos = pos
        pos += sum(ind)
        if add_hlines:
            ax.axhline(pos, **_hline_kwargs)
        ax.text(
            1.1,
            last_pos + (pos - last_pos)/2.0,
            label,
            **_text_kwargs)


[docs]def calculate_limits(array_dict, method='global', percentiles=None, limit=()):
    """
    Calculate limits for a group of arrays in a flexible manner.

    Returns a dictionary of calculated (vmin, vmax), with the same keys as
    `array_dict`.

    Useful for plotting heatmaps of multiple datasets, and the vmin/vmax values
    of the colormaps need to be matched across all (or a subset) of heatmaps.

    Parameters
    ----------
    array_dict : dict of np.arrays

    method : {'global', 'independent', callable}
        If method="global", then use the global min/max values across all
        arrays in array_dict.  If method="independent", then each array will
        have its own min/max calcuated.  If a callable, then it will be used to
        group the keys of `array_dict`, and each group will have its own
        group-wise min/max calculated.


    percentiles : None or list
        If not None, a list of (lower, upper) percentiles in the range [0,100].
    """
    if percentiles is not None:
        for percentile in percentiles:
            if not 0 <= percentile <= 100:
                raise ValueError("percentile (%s) not between [0, 100]")

    if method == 'global':
        all_arrays = np.concatenate(
            [i.ravel() for i in array_dict.itervalues()]
        )
        if percentiles:
            vmin = mlab.prctile(
                all_arrays, percentiles[0])
            vmax = mlab.prctile(
                all_arrays, percentiles[1])

        else:
            vmin = all_arrays.min()
            vmax = all_arrays.max()
        d = dict([(i, (vmin, vmax)) for i in array_dict.keys()])

    elif method == 'independent':
        d = {}
        for k, v in array_dict.iteritems():
            d[k] = (v.min(), v.max())

    elif hasattr(method, '__call__'):
        d = {}
        sorted_keys = sorted(array_dict.keys(), key=method)
        for group, keys in itertools.groupby(sorted_keys, method):
            keys = list(keys)
            all_arrays = np.concatenate([array_dict[i] for i in keys])
            if percentiles:
                vmin = mlab.prctile(
                    all_arrays, percentiles[0])
                vmax = mlab.prctile(
                    all_arrays, percentiles[1])
            else:
                vmin = all_arrays.min()
                vmax = all_arrays.max()
            for key in keys:
                d[key] = (vmin, vmax)
    return d


[docs]def ci(arr, conf=0.95):
    """
    Column-wise confidence interval.

    Parameters
    ----------
    arr : array-like

    conf : float
        Confidence interval

    Returns
    -------
    m : array
        column-wise mean
    lower : array
        lower column-wise confidence bound
    upper : array
        upper column-wise confidence bound
    """
    m = arr.mean(axis=0)
    n = len(arr)
    se = arr.std(axis=0) / np.sqrt(n)
    h = se * stats.t._ppf((1 + conf) / 2., n - 1)
    return m, m - h, m + h


[docs]def nice_log(x):
    """
    Uses a log scale but with negative numbers.

    :param x: NumPy array
    """
    neg = x < 0
    xi = np.log2(np.abs(x) + 1)
    xi[neg] = -xi[neg]
    return xi


[docs]def tip_zscores(a):
    """
    Calculates the "target identification from profiles" (TIP) zscores
    from Cheng et al. 2001, Bioinformatics 27(23):3221-3227.

    :param a: NumPy array, where each row is the signal for a feature.
    """
    weighted = a * a.mean(axis=0)
    scores = weighted.sum(axis=1)
    zscores = (scores - scores.mean()) / scores.std()
    return zscores


def fdrcorrection(pvals, alpha=0.05, method='indep'):
    '''
    NOTE: This function was copied from
    statsmodels.sandbox.stats.multicomp.fdrcorrection0, from statsmodels
    version 0.5.0.

    This is to avoid requiring all of statsmodels to be a dependency for
    metaseq, just for this function.




    pvalue correction for false discovery rate

    This covers Benjamini/Hochberg for independent or positively correlated and
    Benjamini/Yekutieli for general or negatively correlated tests. Both are
    available in the function multipletests, as method=`fdr_bh`, resp.
    `fdr_by`.

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'indep', 'negcorr')

    Returns
    -------
    rejected : array, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : array
        pvalues adjusted for multiple hypothesis testing to limit FDR

    Notes
    -----

    If there is prior information on the fraction of true hypothesis, then
    alpha should be set to alpha * m/m_0 where m is the number of tests, given
    by the p-values, and m_0 is an estimate of the true hypothesis.  (see
    Benjamini, Krieger and Yekuteli)

    The two-step method of Benjamini, Krieger and Yekutiel that estimates the
    number of false hypotheses will be available (soon).

    Method names can be abbreviated to first letter, 'i' or 'p' for fdr_bh and
    'n' for fdr_by.

    '''
    pvals = np.asarray(pvals)

    pvals_sortind = np.argsort(pvals)
    pvals_sorted = pvals[pvals_sortind]
    sortrevind = pvals_sortind.argsort()

    if method in ['i', 'indep', 'p', 'poscorr']:
        ecdffactor = _ecdf(pvals_sorted)
    elif method in ['n', 'negcorr']:
        cm = np.sum(1./np.arange(1, len(pvals_sorted)+1))  # corrected this
        ecdffactor = _ecdf(pvals_sorted) / cm
#    elif method in ['n', 'negcorr']:
#        cm = np.sum(np.arange(len(pvals)))
#        ecdffactor = ecdf(pvals_sorted)/cm
    else:
        raise ValueError('only indep and necorr implemented')
    reject = pvals_sorted <= ecdffactor*alpha
    if reject.any():
        rejectmax = max(np.nonzero(reject)[0])
        reject[:rejectmax] = True

    pvals_corrected_raw = pvals_sorted / ecdffactor
    pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
    pvals_corrected[pvals_corrected > 1] = 1
    return reject[sortrevind], pvals_corrected[sortrevind]


[docs]def tip_fdr(a, alpha=0.05):
    """
    Returns adjusted TIP p-values for a particular `alpha`.

    (see :func:`tip_zscores` for more info)

    :param a: NumPy array, where each row is the signal for a feature
    :param alpha: False discovery rate

    """
    zscores = tip_zscores(a)
    pvals = stats.norm.pdf(zscores)
    rejected, fdrs = fdrcorrection(pvals)
    return fdrs


[docs]def prepare_logged(x, y):
    """
    Transform `x` and `y` to a log scale while dealing with zeros.

    This function scales `x` and `y` such that the points that are zero in one
    array are set to the min of the other array.

    When plotting expression data, frequently one sample will have reads in
    a particular feature but the other sample will not.  Expression data also
    tends to look better on a log scale, but log(0) is undefined and therefore
    cannot be shown on a plot.  This function allows these points to be shown,
    piled up along one side of the plot.

    :param x,y: NumPy arrays
    """
    xi = np.log2(x)
    yi = np.log2(y)

    xv = np.isfinite(xi)
    yv = np.isfinite(yi)

    global_min = min(xi[xv].min(), yi[yv].min())
    global_max = max(xi[xv].max(), yi[yv].max())

    xi[~xv] = global_min
    yi[~yv] = global_min

    return xi, yi


def new_shell(figsize=(5, 12), strip=False, height_ratios=(4, 1),
              width_ratios=(4, 1), subplot_params=None):
    if subplot_params is None:
        subplot_params = {}
    fig = plt.figure(figsize=figsize)
    gs = gridspec.GridSpec(
        len(height_ratios),
        len(width_ratios),
        height_ratios=height_ratios,
        width_ratios=width_ratios,
        **subplot_params)
    fig.array_axes = plt.subplot(gs[0, 0])
    if strip:
        fig.strip_axes = plt.subplot(gs[0, 1], sharey=fig.array_axes)
    fig.line_axes = plt.subplot(gs[1, 0], sharex=fig.array_axes)
    fig.cax = plt.subplot(gs[1, 1])
    fig.gs = gs
    return fig


[docs]def matrix_and_line_shell(figsize=(5, 12), strip=False):
    """
    Helper function to construct an empty figure that has space for a matrix,
    a summary line plot directly below it, a colorbar axis, and an optional
    "strip" axis that parallels the matrix (and shares its y-axis) where data
    can be added to create callbacks.

    Returns a tuple of (fig, matrix_ax, line_ax, strip_ax, colorbar_ax) that
    can then be used to plot upon.

    :param figsize: Tuple of (width, height), in inches, for the figure to be
    :param strip: If `strip` is False, then the returned `strip_ax` will be
        None and no strip axes will be created.
    """
    fig = plt.figure(figsize=figsize)

    # Constants to keep track
    if strip:
        STRIP_COLS = 1
    else:
        STRIP_COLS = 0
    ROWS = 4
    COLS = 8 + STRIP_COLS
    MAT_COLS = 7
    MAT_ROWS = 3
    LINE_ROWS = ROWS - MAT_ROWS

    mat_ax = plt.subplot2grid(
        shape=(ROWS, COLS),
        loc=(0, STRIP_COLS),
        rowspan=MAT_ROWS,
        colspan=MAT_COLS,
    )

    line_ax = plt.subplot2grid(
        shape=(ROWS, COLS),
        loc=(MAT_ROWS, STRIP_COLS),
        rowspan=LINE_ROWS,
        colspan=MAT_COLS,
        sharex=mat_ax)

    if strip:
        strip_ax = plt.subplot2grid(
            shape=(ROWS, COLS),
            loc=(0, 0),
            rowspan=MAT_ROWS,
            colspan=STRIP_COLS,
            sharey=mat_ax,
        )
    else:
        strip_ax = None

    cax = plt.subplot2grid(
        shape=(ROWS, COLS),
        loc=(ROWS - MAT_ROWS, MAT_COLS + STRIP_COLS),
        rowspan=1,
        colspan=1,
    )

    fig.subplots_adjust(hspace=0.1, wspace=0.2, right=0.88, left=0.23)
    return fig, mat_ax, line_ax, strip_ax, cax


[docs]def clustered_sortind(x, k=10, scorefunc=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows is then sorted by `scorefunc` -- by default, the max
    peak height when all rows in a cluster are averaged, or
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param scorefunc: Optional function for sorting rows within clusters.  Must
        accept a single argument of a NumPy array.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikits.learn for '
                          'clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        best_k = sorted(mean_dists.items(), key=lambda x: x[1])[-1][0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if not scorefunc:
        def scorefunc(x):
            return x.mean(axis=0).max()

    for label in range(k):
        ind = labels == label
        score = scorefunc(x[ind, :])
        scores[ind] = score

    pos = 0
    breaks = []
    ind = np.argsort(scores)
    for k, g in itertools.groupby(labels[ind]):
        pos += len(list(g))
        breaks.append(pos)

    return ind, breaks


def new_clustered_sortind(x, k=10, row_key=None, cluster_key=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows is then sorted by `scorefunc` -- by default, the max
    peak height when all rows in a cluster are averaged, or
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param row_key:
        Optional function to act as a sort key for sorting rows within
        clusters.  Signature should be `scorefunc(a)` where `a` is a 1-D NumPy
        array.
    :param cluster_key:
        Optional function for sorting clusters.  Signature is `clusterfunc(a)`
        where `a` is a NumPy array containing all rows of `x` for cluster `i`.
        It must return a single value.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikits.learn for '
                          'clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        best_k = sorted(mean_dists.items(), key=lambda x: x[1])[-1][0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if cluster_key:
        # It's easier for calling code to provide something that operates on
        # a cluster level, but here it's converted to work on a label level
        # that looks in to the array `x`.
        def _cluster_key(i):
            return cluster_key(x[labels == i, :])
        sorted_labels = sorted(range(k), key=_cluster_key)
    else:
        # Otherwise just use them as-is.
        sorted_labels = range(k)

    if row_key:
        # Again, easier to provide a function to operate on a row.  But here we
        # need it to accept an index
        def _row_key(i):
            return row_key(x[i, :])

    final_ind = []
    breaks = []
    pos = 0
    for label in sorted_labels:
        # which rows in `x` have this label
        label_inds = np.nonzero(labels == label)[0]
        if row_key:
            label_sort_ind = sorted(label_inds, key=_row_key)
        else:
            label_sort_ind = label_inds
        for li in label_sort_ind:
            final_ind.append(li)
        pos += len(label_inds)
        breaks.append(pos)

    return np.array(final_ind), np.array(breaks)


[docs]def input_ip_plots(iparr, inputarr, diffed, x, sort_ind,
                   prefix=None, limits1=(None, None), limits2=(None, None),
                   hlines=None, vlines=None):

    """
    All-in-one plotting function to make a 5-panel figure.

    Panels are IP, input, and diffed; plus 2 line plots showing averages.

    :param iparr, inputarr: NumPy arrays constructed by a genomic_signal object
    :param diffed: Difference of `iparr` and `inputarr`, but can be some other
                   transformation.
    :param x: Extent to use -- for TSSs, maybe something like
        np.linspace(-1000, 1000, bins), or for just bin IDs, something like
        `np.arange(bins)`.

    :param sort_ind: row order for each of the 3 panels -- usually interesting
        to use `clustered_sortind` or `tip_zscores`

    :param prefix: Used to prefix plot titles with '%(prefix)s IP", etc
    :param limits1: Tuple passed to the Normalize function for IP and input.
    :param limits2: Tuple passed tot he Normalize function for the diffed array
    :param hlines: List of (position, kwarg) tuples for plotting horizontal
        lines.  Kwargs are passed directly to axhline. Useful for delimiting
        clusters, if you used `clustered_sortind` and have both `row_order` and
        `breaks`.
    :param vlines: List of (position, kwargs) tuples.  A vertical line will be
        plotted at each position using kwargs.
    """

    # global min and max
    gmin = min(iparr.min(), inputarr.min())
    gmax = max(iparr.max(), inputarr.max())

    fig = plt.figure(figsize=(10, 10))

    # 3 arrays, 2 line plots, a gene strip, and 2 colorbars.  Plots share the
    # axes that make sense
    #
    # 3 arrays
    ax1 = plt.subplot2grid(
        (9, 9), (0, 0), colspan=3, rowspan=6)
    ax2 = plt.subplot2grid(
        (9, 9), (0, 3), colspan=3, rowspan=6, sharex=ax1, sharey=ax1)
    ax3 = plt.subplot2grid(
        (9, 9), (0, 6), colspan=3, rowspan=6, sharex=ax1, sharey=ax1)

    # 2 line plots
    ax4 = plt.subplot2grid((9, 9), (6, 3), colspan=3, rowspan=3, sharex=ax1)
    ax5 = plt.subplot2grid((9, 9), (6, 6), colspan=3, rowspan=3, sharex=ax1)

    # 2 colorbars
    cax1 = plt.Axes(fig, rect=(0.05, 0.25, 0.25, 0.025))
    cax2 = plt.Axes(fig, rect=(0.05, 0.15, 0.25, 0.025))

    # For nice imshow axes
    extent = (min(x), max(x), 0, diffed.shape[0])

    cm = matplotlib.cm.gist_gray
    cm.set_bad('k')
    cm.set_over('r')
    cm.set_under('b')

    limits1 = list(limits1)
    limits2 = list(limits2)

    all_base = np.column_stack((iparr.ravel(), inputarr.ravel())).ravel()

    if limits1[0] is None:
        limits1[0] = mlab.prctile(
            all_base, 1. / all_base.size)
    if limits1[1] is None:
        limits1[1] = mlab.prctile(
            all_base, 100 - 1. / all_base.size)
    if limits2[0] is None:
        limits2[0] = mlab.prctile(
            diffed.ravel(), 1. / all_base.size)
    if limits2[1] is None:
        limits2[1] = mlab.prctile(
            diffed.ravel(), 100 - 1. / all_base.size)

    del all_base

    imshow_kwargs = dict(
        interpolation='nearest',
        aspect='auto',
        cmap=cm,
        norm=matplotlib.colors.Normalize(*limits1),
        extent=extent,
        origin='lower')

    # modify kwargs for diffed (by changing the normalization)
    diffed_kwargs = imshow_kwargs.copy()
    diffed_kwargs['norm'] = matplotlib.colors.Normalize(*limits2)

    # IP
    mappable1 = ax1.imshow(iparr[sort_ind, :], **imshow_kwargs)

    # input
    mappable2 = ax2.imshow(inputarr[sort_ind, :], **imshow_kwargs)

    # diffed
    mappable3 = ax3.imshow((diffed)[sort_ind, :], **diffed_kwargs)

    # IP and input line plot with vertical line
    ax4.plot(x, inputarr.mean(axis=0), color='k', linestyle='--',
             label='input')
    ax4.plot(x, iparr.mean(axis=0), color='k', label='ip')
    ax4.axvline(0, color='k', linestyle=':')

    # Diffed line plot with vertical line
    ax5.plot(x, diffed.mean(axis=0), 'k', label='enrichment')
    ax5.axvline(0, color='k', linestyle=':')

    # Colorbars
    cbar1 = fig.colorbar(mappable1, cax1, orientation='horizontal')
    cbar2 = fig.colorbar(mappable3, cax2, orientation='horizontal')
    fig.add_axes(cax1)
    fig.add_axes(cax2)

    # labeling...
    ax1.set_ylabel('features')
    plt.setp(ax2.get_yticklabels(), visible=False)
    plt.setp(ax3.get_yticklabels(), visible=False)
    ax4.set_xlabel('bp')
    ax4.set_ylabel('mean reads per million mapped reads')
    ax5.set_xlabel('bp')
    cax1.set_xlabel('Reads per million mapped reads')
    cax2.set_xlabel('Enrichment (RPMMR)')

    if prefix is None:
        prefix = ""
    ax1.set_title('%s IP' % prefix)
    ax2.set_title('%s input' % prefix)
    ax3.set_title('Difference')

    # diffed line plot should have y ax on right
    ax5.yaxis.set_ticks_position('right')
    ax5.yaxis.set_label_position('right')
    ax5.set_ylabel('enriched reads per million mapped reads')

    # Legends
    ax4.legend(loc='best', frameon=False)
    ax5.legend(loc='best', frameon=False)

    # Make sure everybody snaps to xmin/xmax
    for ax in [ax1, ax2, ax3, ax4, ax5]:
        ax.axis(xmin=extent[0], xmax=extent[1])

    if not hlines:
        hlines = []
    if not vlines:
        vlines = []

    for ax in [ax1, ax2, ax3]:
        for pos, kwargs in hlines:
            ax.axhline(pos, **kwargs)
        for pos, kwargs in vlines:
            ax.axvline(pos, **kwargs)

    fig.subplots_adjust(bottom=0.05, top=0.95, hspace=0.75, wspace=0.9)

    return fig


def _updatecopy(orig, update_with, keys=None, override=False):
    """
    Update a copy of dest with source.  If `keys` is a list, then only update
    with those keys.
    """
    d = orig.copy()
    if keys is None:
        keys = update_with.keys()
    for k in keys:
        if k in update_with:
            if k in d and not override:
                continue
            d[k] = update_with[k]
    return d


def _clean(z):
    """
    Return a version of z that only has finite values
    """
    return z[np.isfinite(z)]


[docs]class MarginalHistScatter(object):
[docs]    def __init__(self, ax, hist_size=0.6, pad=0.05):
        """
        Class to enable incremental appending of scatterplots, each of which
        generate additional marginal histograms.
        """
        self.scatter_ax = ax
        self.fig = ax.figure

        self.divider = make_axes_locatable(self.scatter_ax)

        self.top_hists = []
        self.right_hists = []
        self.hist_size = hist_size
        self.pad = pad
        self.xfirst_ax = None
        self.yfirst_ax = None

        # will hold histogram data
        self.hxs = []
        self.hys = []

    @property
    def xmax(self):
        return self.scatter_ax.dataLim.xmax

    @property
    def ymax(self):
        return self.scatter_ax.dataLim.ymax

    @property
    def xmin(self):
        return self.scatter_ax.dataLim.xmin

    @property
    def ymin(self):
        return self.scatter_ax.dataLim.ymin

    @property
    def limits(self):
        return (self.xmin, self.xmax, self.ymin, self.ymax)

[docs]    def append(self, x, y, scatter_kwargs, hist_kwargs=None, xhist_kwargs=None,
               yhist_kwargs=None, num_ticks=3, labels=None, hist_share=False,
               marginal_histograms=True):
        """
        Adds a new scatter to self.scatter_ax as well as marginal histograms
        for the same data, borrowing addtional room from the axes.

        Parameters
        ----------

        x, y : array-like
            Data to be plotted

        scatter_kwargs : dict
            Keyword arguments that are passed directly to scatter().

        hist_kwargs : dict
            Keyword arguments that are passed directly to hist(), for both the
            top and side histograms.

        xhist_kwargs, yhist_kwargs : dict
            Additional, margin-specific kwargs for the x or y histograms
            respectively.  These are used to update `hist_kwargs`

        num_ticks : int
            How many tick marks to use in each histogram's y-axis

        labels : array-like
            Optional NumPy array of labels that will be set on the collection
            so that they can be accessed by a callback function.

        hist_share : bool
            If True, then all histograms will share the same frequency axes.
            Useful for showing relative heights if you don't want to use the
            hist_kwarg `normed=True`

        marginal_histograms : bool
            Set to False in order to disable marginal histograms and just use
            as a normal scatterplot.
        """
        scatter_kwargs = scatter_kwargs or {}
        hist_kwargs = hist_kwargs or {}
        xhist_kwargs = xhist_kwargs or {}
        yhist_kwargs = yhist_kwargs or {}
        yhist_kwargs.update(dict(orientation='horizontal'))

        # Plot the scatter
        coll = self.scatter_ax.scatter(x, y, **scatter_kwargs)
        coll.labels = labels

        if not marginal_histograms:
            return

        xhk = _updatecopy(hist_kwargs, xhist_kwargs)
        yhk = _updatecopy(hist_kwargs, yhist_kwargs)

        axhistx = self.divider.append_axes(
            'top', size=self.hist_size,
            pad=self.pad, sharex=self.scatter_ax, sharey=self.xfirst_ax)

        axhisty = self.divider.append_axes(
            'right', size=self.hist_size,
            pad=self.pad, sharey=self.scatter_ax, sharex=self.yfirst_ax)

        axhistx.yaxis.set_major_locator(
            MaxNLocator(nbins=num_ticks, prune='both'))

        axhisty.xaxis.set_major_locator(
            MaxNLocator(nbins=num_ticks, prune='both'))

        if not self.xfirst_ax and hist_share:
            self.xfirst_ax = axhistx

        if not self.yfirst_ax and hist_share:
            self.yfirst_ax = axhisty

        # Keep track of which axes are which, because looking into fig.axes
        # list will get awkward....
        self.top_hists.append(axhistx)
        self.right_hists.append(axhisty)

        # Scatter will deal with NaN, but hist will not.  So clean the data
        # here.
        hx = _clean(x)
        hy = _clean(y)

        self.hxs.append(hx)
        self.hys.append(hy)

        # Only plot hists if there's valid data
        if len(hx) > 0:
            if len(hx) == 1:
                _xhk = _updatecopy(orig=xhk, update_with=dict(bins=[hx[0], hx[0]]), keys=['bins'])
                axhistx.hist(hx, **_xhk)
            else:
                axhistx.hist(hx, **xhk)
        if len(hy) > 0:
            if len(hy) == 1:
                _yhk = _updatecopy(orig=yhk, update_with=dict(bins=[hy[0], hy[0]]), keys=['bins'])
                axhisty.hist(hy, **_yhk)
            else:
                axhisty.hist(hy, **yhk)

        # Turn off unnecessary labels -- for these, use the scatter's axes
        # labels
        for txt in axhisty.get_yticklabels() + axhistx.get_xticklabels():
            txt.set_visible(False)

        for txt in axhisty.get_xticklabels():
            txt.set_rotation(-90)

[docs]    def add_legends(self, xhists=True, yhists=False, scatter=True, **kwargs):
        """
        Add legends to axes.
        """
        axs = []
        if xhists:
            axs.extend(self.hxs)
        if yhists:
            axs.extend(self.hys)
        if scatter:
            axs.extend(self.ax)

        for ax in axs:
            ax.legend(**kwargs)
Navigation

Source code for metaseq.plotutils

Quick search

Navigation