Source code for railroadtracks.rnaseq

# Copyright 2014-2015 Novartis Institutes for Biomedical Research

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Simple model for the expression analysis using RNA-Seq
"""
import os, sys, shutil
import re
import warnings
import csv
import zlib
import collections
import subprocess
import tempfile
import logging
logger = logging.getLogger(__name__)
import argparse
from . import environment
from railroadtracks.model.diffexp import RSOURCES_DIR
from . import core
import gzip
from railroadtracks.model import files
from railroadtracks.model.files import (File,
                                        FASTAFile,
                                        FASTQFile,
                                        GFFFile,
                                        FASTQPossiblyGzipCompressed, 
                                        BAMFile,
                                        SAMFile,
                                        BAMFileSortedByID,
                                        CSVFile,
                                        TSVFile,
                                        FilePattern,
                                        CSVFileSequence,
                                        SAMFileSequence,
                                        SamtoolsSamToBam,
                                        SamtoolsBamToSam,
                                        SamtoolsSorterByID,
                                        BEDFile,
                                        BedtoolsBamToFastq,
                                        BedtoolsBamToFastqPE)
from railroadtracks.model import simulate
from railroadtracks.model.simulate import (FluxsimulatorExpression,
                                           FluxsimulatorDerivedExpression,
                                           FluxsimulatorSequencing,
                                           FluxSimulatorParameters,
                                           FluxSimulatorParametersAndPro)
from railroadtracks.model import aligners
from railroadtracks.model.aligners import (BWAIndex,
                                           BWAIndexFiles,
                                           BWA,
                                           Bowtie,
                                           SavedBowtieIndex,
                                           Bowtie2,
                                           SavedBowtie2Index,
                                           BowtieBuild,
                                           Bowtie2Build,
                                           StarIndex,
                                           StarAlign,
                                           GsnapIndex,
                                           GsnapAlign,
                                           SailfishIndex,
                                           SalmonIndex,
                                           TopHat,
                                           TopHat2,
                                           SamtoolsExtractUnaligned)
from railroadtracks.model import diffexp
from railroadtracks.model.diffexp import (EdgeR,
                                          DESeq,
                                          DESeq2,
                                          LimmaVoom,
                                          EBSeq)

from railroadtracks.model import quantify
from railroadtracks.model.quantify import (SailfishQuant,
                                           SalmonAlignQuant,
                                           FeatureCount,
                                           HTSeqCount)

from railroadtracks.model import misc
from railroadtracks.model.misc import Anyscript

from railroadtracks.unifex import UnifexError, _cmdfromuei
from abc import ABCMeta, abstractproperty
            

[docs]class ACTIVITY(simulate.ACTIVITY, 
               aligners.ACTIVITY, 
               quantify.ACTIVITY, 
               diffexp.ACTIVITY,
               files.ACTIVITY,
               misc.ACTIVITY):
    """ Activities that can be performed by the different steps modeled.
    (note: steps can combine several activities - the most obvious example is
    existing monolithic pipelines) """
    __order__ = 'SIMULATE INDEX ALIGN SORT QUANTIFY NORMALIZE DIFFEXP UTILITY MISC'
    NORMALIZE = 'Normalize'



[docs]class AssetsCRCHeadTail(core.AssetsStep):
    """
    Assets for :class:`CRCHeadTail`

    """
    Source = core.assetfactory('Source', [core.AssetAttr('file', File, '')])
    Target = core.assetfactory('Target', [core.AssetAttr('crc', CSVFile, '')])

[docs]class CRCHeadTail(core.StepAbstract):
    """
    Compute a CRC32-based checksum on the beginning (head) and end (tail) of a file
    as a cheap way to check whether 2 files contain identical data.
    Might be useful with large files, however its main purpose is testing.
    """
    _name = 'crc-file'
    _default_execpath = None
    Assets = AssetsCRCHeadTail
    activities = (ACTIVITY.UTILITY, )

    def __init__(self, executable=None):
        if executable is None:
            executable = type(self)._default_execpath
        self._execpath = executable
        # no third-party executable involved
        if self._execpath is not None:
            raise ValueError('No third-party executable involved. The executable should be None')


    @property
    def version(self):
        from railroadtracks import __version__
        return __version__

    def run(self, assets, parameters = tuple()):
        # FIXME: should these checks be moved to the Assets class ?
        out_counts = assets.target.crc
        out_fh = open(out_counts.name, 'w')
        csv_w = csv.writer(out_fh)
        csv_w.writerow(['name', 'crc'])
        for name in assets.source.file:
            with open(name, 'rb') as in_fh:
                # begining of the file
                data = in_fh.read(10000)
                crc = zlib.crc32(data)
                # end of the file
                in_fh.seek(0, 2)
                n = in_fh.tell()
                if n < 10000:
                    in_fh.seek(0)
                else:
                    in_fh.seek(-10000, 2)
                data = in_fh.read(10000)
                crc = zlib.crc32(data, crc)                
            csv_w.writerow((name, crc))
        out_fh.flush()
        out_fh.close()
        cmd = None
        returncode = 0
        return (cmd, returncode)


# patch ngs_plumbing with class Gzip-friendly
try:
    import ngs_plumbing.fastq
    hasngsp = True
except:
    hasngsp = False
if hasngsp:
    import io
    #FIXME: This should ideally be fixed upstream in ngs_plumbing
    class GzipFastqFilePair(ngs_plumbing.fastq.FastqFilePair):
        """ Pair of FASTQ files
        The default iterator will go through the paired entries in the file
        (not the rows), assuming that they are in the same order.
        No check that this is the case (using IDs) is performed.
        """

        def __init__(self, filename1, filename2, **kwargs):
            Cls = type(self)
            if ngs_plumbing.fastq._python2:
                fh1 = gzip.open(filename=filename1, **kwargs)
                fh2 = gzip.open(filename=filename2, **kwargs)
            else:
                stream = io.FileIO(filename1)
                fh1 = gzip.open(stream, **kwargs)
                stream = io.FileIO(filename2)
                fh2 = gzip.open(stream, **kwargs)
            self._fh1 = fh1
            self._fh2 = fh2

        def __iter__(self):
            return ngs_plumbing.fastq.iter_entry_pairs(self._fh1, self._fh2)

        def iter_seqqual_pairs(self):
            """ Iterate over the pairs of (sequence+quality) in 
            the file. """
            return ngs_plumbing.fastq.iter_seqqual_pairs(self._fh1, self._fh2)

        def readcount(self):
            pos = self.tell()
            for i, entry in enumerate(self):
                pass
            return i+1
    




        

class AssetsColumnMerger(core.AssetsStep):
    Source = core.assetfactory('Source', [core.AssetAttr('counts', CSVFileSequence, '')])
    Target = core.assetfactory('Target', [core.AssetAttr('counts', CSVFile, '')])

class ColumnMerger(core.StepAbstract):
    Assets = AssetsColumnMerger
    _name = 'column-merge'
    _default_execpath = None
    activities = (ACTIVITY.UTILITY, )
    #activity = property(lambda x: x.activities, None, None)

    def __init__(self,executable=None):
        """ First parameter is the column used as ID. If None, no column used as ID.
        Second parameter is the column to merge."""
        if executable is None:
            executable = type(self)._default_execpath
        self._execpath = executable # no third-party executable involved
        if  self._execpath is not None:
            raise ValueError('The executable cannot be set. This is a fully internal step.')

    @property
    def version(self):
        from railroadtracks import __version__
        return __version__

    def run(self, assets, parameters = ('0','1')):
        # FIXME: should these checks be moved to the Assets class ?
        out_counts = assets.target.counts
        l = len(tuple(out_counts))
        if l > 1:
            raise ValueError('Merging columns can only have one target file.')
        else:
            import resource
            max_fh_soft, max_fh_hard = resource.getrlimit(resource.RLIMIT_NOFILE)
            if l > max_fh_hard:
                raise ValueError('On this system the maximum number of columns is'
                                 ' %i (and you are trying to merge %i columns)'
                                 ' [note: opened file handles should be taken out of the'
                                  ' maximum number]' % (max_fh_hard-1, l))
        in_counts = assets.source.counts
        # open all
        it_in_counts = in_counts.iteritems()
        cls, value = next(it_in_counts)
        head = cls(value)
        head_iter = head.iter_entry()
        tail = tuple(cls(value) for cls, value in it_in_counts)
        tail_iter = tuple(x.iter_entry() for x in tail)
        # merge
        if len(parameters) != 2:
            raise ValueError('There should be 2 parameters: the index of the columns with IDs, the index of the column with values.')
        try:
            idcol_i = int(parameters[0])
        except ValueError:
            if parameters[0] == 'None':
                idcol_i = None
            else:
                ValueError('The first parameter should be string representation of an integer or "None".')
        valcol_i = int(parameters[1])
        with open(out_counts.name, 'w') as out_fh:
            csv_w = csv.writer(out_fh)                    
            if idcol_i is None:
                for row in head_iter:
                    counts = [row[valcol_i]]
                    for col_i, h in enumerate(tail_iter):
                        row = next(h)
                        counts.append(row[valcol_i].rstrip())
                    csv_w.writerow(counts)
            else:
                for row in head_iter:
                    identifier_head = row[idcol_i]
                    counts = [row[valcol_i].rstrip()]
                    for col_i, h in enumerate(tail_iter):
                        row = next(h)
                        identifier = row[idcol_i]
                        if identifier == identifier_head:
                            counts.append(row[valcol_i].rstrip())
                        else:
                            raise ValueError('Mismatching IDs (first columns has "%s"'
                                             ' while column %i has "%s")' % (identifier_head,
                                                                             col_i+1, identifier))
                    csv_w.writerow([identifier_head, ] + counts)
        
        cmd = None
        returncode = 0
        return (cmd, returncode)

class AssetsNormalizer(core.AssetsStep):
    Source = core.assetfactory('Source', [core.AssetAttr('count', CSVFile, '')])
    Target = core.assetfactory('Target', [core.AssetAttr('normalizedcounts', CSVFile, '')])

class Normalizer(core.StepAbstract):
    Assets = AssetsNormalizer


def _picard_getversion(execpath):
    cmd = [execpath, '--version']
    try:
        logger.debug(subprocess.list2cmdline(cmd))
        res = subprocess.check_output(cmd)
    except OSError as ose:
        raise UnifexError("""Command: %s
        %s""" % (' '.join(cmd), ose))

    m = re.match('^([0-9].+)$', res)
    if m is None:
        raise RuntimeError('Could not find the version number.')
    return m.groups()[0]



class PicardCollectAlignmentSummaryMetrics(core.StepAbstract):
    activities = (ACTIVITY.UTILITY, )
    _name = 'picard-collect-alignment-summary-metrics'
    _default_execpath = 'picard-collect-alignment-summary-metrics.sh'

    class Assets(core.AssetsStep):
        Source = core.assetfactory('Source', [core.AssetAttr('alignedreads', BAMFileSortedByID, ''),
                                              core.AssetAttr('reference', FASTAFile, '')])
        Target = core.assetfactory('Target', [core.AssetAttr('metrics', FilePattern, '')])

    parser = argparse.ArgumentParser(_name)
    parser.add_argument('ASSUME_SORTED',
                        default = 'false',
                        choices = ('true', 'false'))

    def __init__(self, execpath=None):
        """        
        :param executable: path to the executable
        :type param: :class:`str`
        """
        if execpath is None:
            execpath = type(self)._default_execpath
        self._execpath = execpath
        self._version = None
    
    @property 
    def version(self):
        if self._version is None:
            self._version = _picard_getversion(self.execpath)
        return self._version

    def run(self, assets, parameters = tuple()):
        options, unknown = self.parser.parse_known_args()
        # FIXME: check that input is sorted BAM. If not, forbid the use of ASSUME_SORTED ?

        # build command line
        cmd = [self._execpath, ]
        cmd.extend(parameters)
        cmd.extend(['INPUT_FILE=%s', assets.source.annotationfile.name,
                    'REFERENCE_SEQUENCE=%s', assets.source.reference.name,
                    'OUTPUT=%s', assets.target.metrics.name])

        logger.debug(subprocess.list2cmdline(cmd))
        with open(os.devnull, "w") as fnull:
            returncode = subprocess.check_call(cmd)

        return (cmd, returncode)