Source code for metaseq.integration.signal_comparison

"""
Module for working with comparisons between two genomic_signal objects, e.g.,
genome-wide correlation
"""
import sys
import pybedtools
import itertools
import numpy as np


[docs]def compare(signal1, signal2, features, outfn, comparefunc=np.subtract,
        batchsize=5000, array_kwargs=None, verbose=False):
    """
    Compares two genomic signal objects and outputs results as a bedGraph file.
    Can be used for entire genome-wide comparisons due to its parallel nature.

    Typical usage would be to create genome-wide windows of equal size to
    provide as `features`::

        windowsize = 10000
        features = pybedtools.BedTool().window_maker(
           genome='hg19', w=windowsize)

    You will usually want to choose bins for the array based on the final
    resolution you would like. Say you would like 10-bp bins in the final
    bedGraph; using the example above you would use array_kwargs={'bins':
    windowsize/10}.  Or, for single-bp resolution (beware: file will be large),
    use {'bins': windowsize}.

    Here's how it works.  This function:

        * Takes `batchsize` features at a time from `features`

        * Constructs normalized (RPMMR) arrays in parallel for each input
          genomic signal object for those `batchsize` features

        * Applies `comparefunc` (np.subtract by default) to the arrays to get
          a "compared" (e.g., difference matrix by default) for the `batchsize`
          features.

        * For each row in this matrix, it outputs each nonzero column as
          a bedGraph format line in `outfn`

    `comparefunc` is a function with the signature::

        def f(x, y):
            return z

    where `x` and `y` will be arrays for `signal1` and `signal2` (normalized to
    RPMMR) and `z` is a new array.  By default this is np.subtract, but another
    common `comparefunc` might be a log2-fold-change function::

        def lfc(x, y):
            return np.log2(x / y)

    :param signal1: A genomic_signal object
    :param signal2: Another genomic_signal object
    :param features: An iterable of pybedtools.Interval objects. A list will be
        created for every `batchsize` features, so you need enough memory for
        this.
    :param comparefunc: Function to use to compare arrays (default is
        np.subtract)
    :param outfn: String filename to write bedGraph file
    :param batchsize: Number of features (each with length `windowsize` bp) to
        process at a time
    :param array_kwargs: Kwargs passed directly to genomic_signal.array.  Needs
        `processes` and `chunksize` if you want parallel processing
    :param verbose: Be noisy
    """
    fout = open(outfn, 'w')
    fout.write('track type=bedGraph\n')

    i = 0
    this_batch = []
    for feature in features:
        if i <= batchsize:
            this_batch.append(feature)
            i += 1
            continue

        if verbose:
            print 'working on batch of %s' % batchsize
            sys.stdout.flush()

        arr1 = signal1.array(this_batch, **array_kwargs).astype(float)
        arr2 = signal2.array(this_batch, **array_kwargs).astype(float)
        arr1 /= signal1.million_mapped_reads()
        arr2 /= signal2.million_mapped_reads()
        compared = comparefunc(arr1, arr2)

        for feature, row in itertools.izip(this_batch, compared):
            start = feature.start
            bins = len(row)
            binsize = len(feature) / len(row)

            # Quickly move on if nothing here.  speed increase prob best for
            # sparse data
            if sum(row) == 0:
                continue

            for j in range(0, len(row)):
                score = row[j]
                stop = start + binsize
                if score != 0:
                    fout.write('\t'.join([
                        feature.chrom,
                        str(start),
                        str(stop),
                        str(score)]) + '\n')
                start = start + binsize
        this_batch = []
        i = 0
    fout.close()


if __name__ == "__main__":
    import metaseq
    ip_bam = metaseq.genomic_signal(
            metaseq.example_filename(
                'wgEncodeUwTfbsK562CtcfStdAlnRep1.bam'), 'bam')
    control_bam = metaseq.genomic_signal(
            metaseq.example_filename(
                'wgEncodeUwTfbsK562InputStdAlnRep1.bam'), 'bam')

    BINSIZE = 10
    WINDOWSIZE = 10000
    BINS = WINDOWSIZE / BINSIZE
    features = pybedtools.BedTool()\
            .window_maker(genome='hg19', w=WINDOWSIZE)\
            .filter(lambda x: x.chrom == 'chr19')

    result = compare(
            signal1=ip_bam,
            signal2=control_bam,
            features=features,
            outfn='diffed.bedgraph',
            array_kwargs=dict(bins=BINS, processes=6, chunksize=50),
            verbose=True)
Navigation

Source code for metaseq.integration.signal_comparison

Quick search

Navigation