Source code for riboplot.ribocore

"""Common functions. """
import pysam
import logging
import subprocess
from contextlib import contextmanager


# create logger for the entire program
log = logging.getLogger('riboplot')
log.setLevel(logging.DEBUG)

# create console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(module)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
ch.setFormatter(formatter)
log.addHandler(ch)


[docs]class ArgumentError(Exception): """Raised when invalid arguments are sent in the command line.""" pass
[docs]class BamFileError(Exception): """Errors related to BAM file""" pass
[docs]class RiboPlotError(Exception): """General errors relating to riboplot.""" pass
[docs]class RiboCountError(Exception): """General errors relating to ribocount.""" pass
[docs]class RNACountsError(Exception): """For errors related to RNA Coverage generation using bedtools. """ pass
[docs]def lengths_offsets(value): """Split the given comma separated value to multiple integer values. """ values = [] for item in value.split(','): item = int(item) values.append(item) return values
@contextmanager
[docs]def open_pysam_file(fname, ftype): """Open a BAM or FASTA file with pysam (for use with "with" statement)""" try: if ftype == 'bam': fpysam = pysam.AlignmentFile(fname, 'rb') elif ftype == 'fasta': fpysam = pysam.FastaFile(fname) yield fpysam except: raise else: fpysam.close()
[docs]def is_bam_valid(bam_file): """Check if bam file is valid. Raises a ValueError if pysam cannot read the file. #TODO: pysam does not differentiate between BAM and SAM """ try: f = pysam.AlignmentFile(bam_file) except ValueError: raise except: raise else: f.close() return True
[docs]def bam_has_index(bam_file): """Check if bam file has an index. Returns True/False.""" has_index = None with pysam.AlignmentFile(bam_file, 'rb') as bam_fileobj: try: bam_fileobj.fetch(bam_fileobj.references[0]) except ValueError as err: if err.message == 'fetch called on bamfile without index': bam_fileobj.close() has_index = False else: has_index = True return has_index
[docs]def create_bam_index(bam_file): """Create an index for the given BAM file.""" pysam.index(bam_file)
[docs]def is_fasta_valid(fasta_file): """Check if fasta file is valid. Raises a ValueError if pysam cannot read the file. #TODO: pysam does not differentiate between BAM and SAM """ try: f = pysam.FastaFile(fasta_file) except IOError: raise else: f.close() return True
[docs]def get_first_transcript_name(fasta_file): """Return the first FASTA sequence from the given FASTA file. Keyword arguments: fasta_file -- FASTA format file of the transcriptome """ with open_pysam_file(fname=fasta_file, ftype='fasta') as f: transcript_name = f.references[0] return transcript_name
[docs]def get_fasta_record(fasta_file, transcript_name): """Return a single transcript from a valid fasta file as a record. record[transcript_name] = sequence Keyword arguments: fasta_file -- FASTA format file of the transcriptome transcript_name -- Name of the transcript as in the FASTA header """ with open_pysam_file(fname=fasta_file, ftype='fasta') as f: sequence = f.fetch(transcript_name) return {transcript_name: sequence}
[docs]def get_fasta_records(fasta, transcripts): """Return list of transcript records from the given fasta file. Each record will be of the form {'sequence_id': {'sequence': 'AAA', 'length': 3}} trascripts should be provided as a list of sequence id's. """ records = {} f = pysam.FastaFile(fasta) for transcript in transcripts: try: sequence, length = f.fetch(transcript), f.get_reference_length(transcript) except KeyError: msg = 'Transcript "{}" does not exist in transcriptome FASTA file'.format(transcript) log.error(msg) raise ArgumentError(msg) records[transcript] = {'sequence': sequence, 'length': length} f.close() return records
[docs]def get_three_frame_orfs(sequence, starts=None, stops=None): """Find ORF's in frames 1, 2 and 3 for the given sequence. Positions returned are 1-based (not 0) Return format [{'start': start_position, 'stop': stop_position, 'sequence': sequence}, ] Keyword arguments: sequence -- sequence for the transcript starts -- List of codons to be considered as start (Default: ['ATG']) stops -- List of codons to be considered as stop (Default: ['TAG', 'TGA', 'TAA']) """ if not starts: starts = ['ATG'] if not stops: stops = ['TAG', 'TGA', 'TAA'] # Find ORFs in 3 frames orfs = [] for frame in range(3): start_codon = None orf = '' for position in range(frame, len(sequence), 3): codon = sequence[position:position + 3] if codon in starts: # We have found a start already, so add codon to orf and # continue. This is an internal MET if start_codon is not None: orf += codon continue # New orf start start_codon = position orf = codon else: # if sequence starts with ATG, start_codon will be 0 if start_codon is None: # We haven't found a start codon yet continue orf += codon if codon in stops: # orfs[start_codon + 1] = orf orfs.append({'start': start_codon + 1, 'stop': position + 3, 'sequence': orf}) # Reset start_codon = None orf = '' return orfs
[docs]def get_longest_orf(orfs): """Find longest ORF from the given list of ORFs.""" sorted_orf = sorted(orfs, key=lambda x: len(x['sequence']), reverse=True)[0] return sorted_orf
[docs]def filter_ribo_counts(counts, orf_start=None, orf_stop=None): """Filter read counts and return only upstream of orf_start or downstream of orf_stop. Keyword arguments: counts -- Ribo-Seq read counts obtained from get_ribo_counts. orf_start -- Start position of the longest ORF. orf_stop -- Stop position of the longest ORF. """ filtered_counts = dict.copy(counts) for position in counts: if orf_start and orf_stop: # if only upstream and downstream reads are required, check if # current position is upstream or downstream of the ORF start/stop # if not, remove from counts if (position > orf_start and position < orf_stop): filtered_counts.pop(position) elif orf_start: # check if current position is upstream of ORF start. if not, remove if position >= orf_start: filtered_counts.pop(position) elif orf_stop: # check if current position is downstream of ORF stop. If not, # remove if position <= orf_stop: filtered_counts.pop(position) # calculate total reads for this transcript total_reads = sum(sum(item.values()) for item in filtered_counts.values()) return filtered_counts, total_reads
[docs]def get_ribo_counts(ribo_fileobj, transcript_name, read_lengths, read_offsets): """For each mapped read of the given transcript in the BAM file (pysam AlignmentFile object), return the position (+1) and the corresponding frame (1, 2 or 3) to which it aligns. Keyword arguments: ribo_fileobj -- file object - BAM file opened using pysam AlignmentFile transcript_name -- Name of transcript to get counts for read_length (optional) -- If provided, get counts only for reads of this length. """ read_counts = {} total_reads = 0 for record in ribo_fileobj.fetch(transcript_name): query_length = record.query_length position_ref = record.pos + 1 for index, read_length in enumerate(read_lengths): position = position_ref # reset position if read_length == 0 or read_length == query_length: # if an offset is specified, increment position by that offset. position += read_offsets[index] else: # ignore other reads/lengths continue total_reads += 1 try: read_counts[position] except KeyError: read_counts[position] = {1: 0, 2: 0, 3: 0} # calculate the frame of the read from position rem = position % 3 if rem == 0: read_counts[position][3] += 1 else: read_counts[position][rem] += 1 log.debug('Total read counts: {}'.format(total_reads)) log.debug('RiboSeq read counts for transcript: {0}\n{1}'.format(transcript_name, read_counts)) return read_counts, total_reads
[docs]def check_required_arguments(ribo_file, transcriptome_fasta, transcript_name=None): """Check required arguments of both riboplot and ribocount.""" # Is this a valid BAM file? i.e., can pysam read it? try: is_bam_valid(ribo_file) except ValueError: log.error('The given RiboSeq BAM file is not valid') raise # Does the BAM file have an index? If not, create it. if not bam_has_index(ribo_file): log.info('Creating an index for the BAM file...') create_bam_index(ribo_file) if not bam_has_index(ribo_file): msg = ('Could not create an index for this BAM file. Is this a valid BAM file ' 'and/or is the BAM file sorted by chromosomal coordinates?') log.error(msg) raise BamFileError(msg) # Is FASTA file valid? fasta_valid = False try: fasta_valid = is_fasta_valid(transcriptome_fasta) except IOError: log.error('Transcriptome FASTA file is not valid') raise if fasta_valid: if transcript_name: try: get_fasta_records(transcriptome_fasta, [transcript_name]) except IOError: log.error('Could not get FASTA sequence of "{}" from transcriptome FASTA file'.format(transcript_name)) raise else: # ribocount doesn't have a transcript option so we get the first # sequence name from the fasta file transcript_name = get_first_transcript_name(transcriptome_fasta) # check if transcript also exists in BAM with pysam.AlignmentFile(ribo_file, 'rb') as bam_file: if transcript_name not in bam_file.references: msg = 'Transcript "{}" does not exist in BAM file'.format(transcript_name) log.error(msg) raise ArgumentError(msg)
[docs]def check_rna_file(rna_file): """Check if bedtools is available and if the given RNA-Seq bam file is valid. """ try: subprocess.check_output(['bedtools', '--version']) except OSError: log.error('Could not find bedtools in PATH. bedtools is required ' 'for generating RNA coverage plot.') raise # Is this a valid BAM file? i.e., can pysam read it? try: is_bam_valid(rna_file) except ValueError: log.error('The given RNASeq BAM file is not valid') raise
[docs]def check_read_lengths(ribo_file, read_lengths): """Check if read lengths are valid (positive). """ # check if there are any valid read lengths to check i.e., not equal to 0 valid_lengths = list(set(read_lengths)) # if read length is 0, all read lengths are requested so we skip further # checks. if len(valid_lengths) == 1 and valid_lengths[0] == 0: return for read_length in valid_lengths: if read_length < 0: msg = 'Read length must be a positive value' log.error(msg) raise ArgumentError(msg)
[docs]def check_read_offsets(read_offsets): """Check if read offsets are valid (positive).""" for read_offset in read_offsets: if read_offset < 0: msg = 'Read offset must be 0 or greater' log.error(msg) raise ArgumentError(msg)
[docs]def check_read_lengths_offsets(read_lengths, read_offsets): """Check if read length has corresponding read offset for all read lengths. """ if not len(read_lengths) == len(read_offsets): raise ArgumentError('Each read length should have a corresponding offset value') else: return True