Source code for seriesoftubes.converters.bamtofastq

"""Convert BAM/SAM to FASTQ format*
@name
sequence
+name
quality score (phred33)

files may be SAM or BAM (autodetected)
If the file(s) contain paired-end sequences, we will write to two files
(in the current working directory)
If the files contain single end sequences, we will write to stdout by default
Output is always to stdout (err goes to stderr, redirect it if you need to)
"""
import pysam
import os
import select
from scripter import path_to_executable 
from argparse import ArgumentParser
from copy import copy
from subprocess import Popen, PIPE
from os import mkfifo, getcwd, devnull
from os.path import join, exists, abspath
import subprocess
from sys import argv, stdin, stdout, stderr, exit, executable
from gzip import GzipFile
from .discover import PATH_TO_GZIP, gzip_class_factory

[docs]class UnpairedBAMToFastqConverter(object): """Works with unpaired SAM/BAM file""" def __init__(self, file_, wd=None, stderr=None, logger=None): self.require_bam(file_) if wd is None: wd = getcwd() fifofile = join(wd, '._sot_fifo') if exists(fifofile): raise RuntimeError('%s already exists' % fifofile) mkfifo(fifofile) self.fifo = abspath(fifofile) self._args = [executable, '-m', 'seriesoftubes.converters.bamtofastq', file_, '--single-stdout', fifofile] self._logger = logger self._stderr = stderr def launch(self): if self._logger is not None: self._logger.info('Launching %s', ' '.join(self._args)) self.subprocess = Popen(self._args, stdout=open(devnull, 'w'), stderr=self._stderr, bufsize=-1) def get_fifo_readers(self): return (open(self.fifos[0], 'r'), open(self.fifos[1], 'r')) def require_bam(self, filename): with open(filename, 'rb') as f: head = f.read(3) # check magic words for compression if head == '\x1f\x8b\x08': open_func = GzipFile else: open_func = open uncompressed = open_func(filename) head2 = uncompressed.read(4) # check for BAM if head2 == 'BAM\x01': return # check for SAM if head2 == '@HD\t': return else: raise ValueError('Not a SAM/BAM file')
[docs]def main(): """ what to do if we execute the module as a script bamtofastq can only convert files (not stdin) because of the paired-end problem """ parser = ArgumentParser(description=__doc__) parser.add_argument('files', nargs='+', help='List of input files') parser.add_argument('--no-gzip', action='store_true', default=False, help='Do not compress output') parser.add_argument('--single-stdout', help='Save single-end reads to here (default: stdout)', default=None) args = parser.parse_args() context = vars(args) read_files(**context)
def pair_writer(out1, out2): def writer(read1, read2): out1.write('@%s\n%s\n+\n%s\n' % (read1.qname, read1.seq, read1.qual)) out2.write('@%s\n%s\n+\n%s\n' % (read2.qname, read2.seq, read2.qual)) return writer
[docs]def read_files(files=None, no_gzip=False, single_stdout=stdout): """ actually reads the SAM/BAM files """ for file in files: if file is None: continue f = pysam.Samfile(file) #check if first read is paired aread = f.next() f.close() f = pysam.Samfile(file) if aread.is_paired: if no_gzip: print 'Detected paired-end reads, redirecting output to text files' file1 = file + '_1.txt' file2 = file + '_2.txt' fh1 = open(file1, 'w') fh2 = open(file2, 'w') elif PATH_TO_GZIP is not None: print 'Detected paired-end reads, redirecting output to .gz text files (using system gzip)' file1 = file + '_1.txt.gz' file2 = file + '_2.txt.gz' open_func = gzip_class_factory(PATH_TO_GZIP) fh1 = open_func(file1, 'wb') fh2 = open_func(file2, 'wb') else: print 'Detected paired-end reads, redirecting output to .gz text files' file1 = file + '_1.txt.gz' file2 = file + '_2.txt.gz' fh1 = GzipFile(file1, 'wb') fh2 = GzipFile(file2, 'wb') is_paired = False write = pair_writer(fh1, fh2) incomplete_pairs = [] for aread in f: is_paired = False qname = aread.qname for i in xrange(len(incomplete_pairs)): if incomplete_pairs[i].qname == qname: mate_read = incomplete_pairs.pop(i) # figure out order if aread.flag & 0x4 == 0x4: write(aread, mate_read) else: write(mate_read, aread) is_paired = True break if not is_paired: incomplete_pairs.append(aread) unpaired = len(incomplete_pairs) if not unpaired == 0: raise RuntimeError('%d unpaired reads remaining' % unpaired) else: if no_gzip: open_func = open elif PATH_TO_GZIP is not None: open_func = gzip_class_factory(PATH_TO_GZIP) else: open_func = GzipFile if single_stdout is None: fh1 = stdout else: fh1 = open_func(single_stdout, 'wb') for aread in f: qname = aread.qname or '' seq = aread.seq or '' qual = aread.qual or '' rec = '@%s\n%s\n+\n%s\n' % (qname, seq, qual) fh1.write(rec) fh1.close() exit(0)
if __name__ == '__main__': main()