Source code for metaseq.integration.signal_comparison

"""
Module for working with comparisons between two genomic_signal objects, e.g.,
genome-wide correlation
"""
import sys
import pybedtools
import itertools
import numpy as np


[docs]def compare(signal1, signal2, features, outfn, comparefunc=np.subtract, batchsize=5000, array_kwargs=None, verbose=False): """ Compares two genomic signal objects and outputs results as a bedGraph file. Can be used for entire genome-wide comparisons due to its parallel nature. Typical usage would be to create genome-wide windows of equal size to provide as `features`:: windowsize = 10000 features = pybedtools.BedTool().window_maker( genome='hg19', w=windowsize) You will usually want to choose bins for the array based on the final resolution you would like. Say you would like 10-bp bins in the final bedGraph; using the example above you would use array_kwargs={'bins': windowsize/10}. Or, for single-bp resolution (beware: file will be large), use {'bins': windowsize}. Here's how it works. This function: * Takes `batchsize` features at a time from `features` * Constructs normalized (RPMMR) arrays in parallel for each input genomic signal object for those `batchsize` features * Applies `comparefunc` (np.subtract by default) to the arrays to get a "compared" (e.g., difference matrix by default) for the `batchsize` features. * For each row in this matrix, it outputs each nonzero column as a bedGraph format line in `outfn` `comparefunc` is a function with the signature:: def f(x, y): return z where `x` and `y` will be arrays for `signal1` and `signal2` (normalized to RPMMR) and `z` is a new array. By default this is np.subtract, but another common `comparefunc` might be a log2-fold-change function:: def lfc(x, y): return np.log2(x / y) :param signal1: A genomic_signal object :param signal2: Another genomic_signal object :param features: An iterable of pybedtools.Interval objects. A list will be created for every `batchsize` features, so you need enough memory for this. :param comparefunc: Function to use to compare arrays (default is np.subtract) :param outfn: String filename to write bedGraph file :param batchsize: Number of features (each with length `windowsize` bp) to process at a time :param array_kwargs: Kwargs passed directly to genomic_signal.array. Needs `processes` and `chunksize` if you want parallel processing :param verbose: Be noisy """ fout = open(outfn, 'w') fout.write('track type=bedGraph\n') i = 0 this_batch = [] for feature in features: if i <= batchsize: this_batch.append(feature) i += 1 continue if verbose: print 'working on batch of %s' % batchsize sys.stdout.flush() arr1 = signal1.array(this_batch, **array_kwargs).astype(float) arr2 = signal2.array(this_batch, **array_kwargs).astype(float) arr1 /= signal1.million_mapped_reads() arr2 /= signal2.million_mapped_reads() compared = comparefunc(arr1, arr2) for feature, row in itertools.izip(this_batch, compared): start = feature.start bins = len(row) binsize = len(feature) / len(row) # Quickly move on if nothing here. speed increase prob best for # sparse data if sum(row) == 0: continue for j in range(0, len(row)): score = row[j] stop = start + binsize if score != 0: fout.write('\t'.join([ feature.chrom, str(start), str(stop), str(score)]) + '\n') start = start + binsize this_batch = [] i = 0 fout.close()
if __name__ == "__main__": import metaseq ip_bam = metaseq.genomic_signal( metaseq.example_filename( 'wgEncodeUwTfbsK562CtcfStdAlnRep1.bam'), 'bam') control_bam = metaseq.genomic_signal( metaseq.example_filename( 'wgEncodeUwTfbsK562InputStdAlnRep1.bam'), 'bam') BINSIZE = 10 WINDOWSIZE = 10000 BINS = WINDOWSIZE / BINSIZE features = pybedtools.BedTool()\ .window_maker(genome='hg19', w=WINDOWSIZE)\ .filter(lambda x: x.chrom == 'chr19') result = compare( signal1=ip_bam, signal2=control_bam, features=features, outfn='diffed.bedgraph', array_kwargs=dict(bins=BINS, processes=6, chunksize=50), verbose=True)