Source code for fastAQ.fastqInfo

# -*- coding: utf-8 -*-
#	Parsing a fastq file.
#	Author - Janu Verma
#	jv367@cornell.edu

import sys
from trimming import Trimming
from sequenceOperations import SequenceManipulation

[docs]class FastqParser: """ Parses a FASTQ file to extract the sequences and the base qualities. Parameters ---------- fasta_file : Fastq file to be parsed. Example ------- >>> import sys >>> input_file = sys.argv[1] >>> out = FastqParser(input_file) >>> seqDict = out.sequenceDict() >>> print len(seqDict.keys()) """ def __init__(self, fastq_file): self.ff = fastq_file
[docs] def readFastq(self, fastqFile): """ Reads and parser the FASTQ file. Parameters ---------- fastqFile - A FASTQ file. Returns ------ Generator object containing sequences. """ i = 1 name, seq, baseQ = None, [], [] for line in fastqFile: if (line.startswith("@")) and (i%4 != 0): if name: yield (name, ''.join(seq), ''.join(baseQ)) name, seq, baseQ = line, [], [] if (line[0] in ['A', 'G', 'T', 'C', 'N']): seq.append(line) if (i%4 == 0): baseQ.append(line) i += 1 if name: yield (name, ''.join(seq), ''.join(baseQ))
[docs] def sequenceDict(self): """ Creates a dictionary of sequences with their header. Returns ------- A dictionary of sequences. """ with open(self.ff) as fastaFile: sequences = {} for name,seq,baseQ in self.readFastq(fastaFile): sequences[name.strip()] = seq.strip() return sequences
[docs] def baseQualities(self): """ Creates a dictionary of base qualities of the sequences. Returns ------- A dictionary of base qualities. """ with open(self.ff) as fastaFile: qualities = {} for name,seq,baseQ in self.readFastq(fastaFile): qualities[name.strip()] = baseQ.strip() return qualities
[docs] def seqNames(self): """ Names/Headers of all the sequences. Returns ------- A list of names of all the sequences in the FASTQ file. """ seqDict = self.sequenceDict() return seqDict.keys()
[docs] def trimSeq(self, name, qualityCutOff=0, byInterval=False, interval=None, mott=False, limitValue=None): """ Trims the sequence. Parameters ---------- name : Name/header of the sequence to be trimmed. qualityCutOff : Threshold value of the quality for trimming sequence based on removing low quality bases. byInterval : If True, the sequence will be trimmed by removing bases according to the given interval. interval : The interval containing the number of bp's to be trimmed from left and right side respectively. Need byInterval to be True. mott : If True, the sequence will be trimmed according to the Mott's algorithm. limitValue : Numerical value of the limit to be used in Mott's algorithm. Requires mott to be True. Returns ------- Trimmed sequence. """ seqDict = self.sequenceDict() qualDict = self.baseQualities() sequence = seqDict[name] quality = qualDict[name] trimmer = Trimming(sequence, quality) if (byInterval): return trimmer.trimSequence(interval) elif (mott): return trimmer.mott(limitValue) else: return trimmer.lowQualTrim(qualityCutOff)
[docs] def maskSeq(self, name, interval, toLower=False, maskingChar='N'): """ Masks the sequence based on the given interval. Parameters --------- name: Name/header of the sequence. interval: A tuple containing the start and end positions for the masking. toLower: If True, the sequence in the interval is converted to lower case bases. Default is False. maskingChar : Masking character. Default is 'N'. Returns ------- Masked sequence. """ seqDict = self.sequenceDict() sequence = seqDict[name] masker = SequenceManipulation(sequence) return masker.maskSequence(interval, toLower=False, maskingChar='N')
[docs] def maskAll(self, intervals, toLower=False, maskingChar='N'): """ Masks the sequences in the FASTA file based on the given intervals. Parameters --------- intervals: A list of tuples containing the start and end positions for the masking. toLower: If True, the sequence in the interval is converted to lower case bases. Default is False. maskingChar : Masking character. Default is 'N'. Returns ------- Masked sequences. """ seqDict = self.sequenceDict() for i in range(len(seqDict.keys())): x = seqDict.keys()[i] interval = intervals[i] print self.maskSeq(x, interval, toLower=False, maskingChar='N') return ''
[docs] def trimAll(self, qualityCutOff=0, byInterval=False, intervals=None, mott=False, limitValue=None): """ Trims all the sequence in the FASTA file from both sides based on the intervals. Parameters ---------- interval : A list of tuples containing the number of bp's to be trimmed from left and right side respectively. Returns ------- Trimmed sequences. """ seqDict = self.sequenceDict() for i in range(len(seqDict.keys())): x = seqDict.keys()[i] if (byInterval) and (intervals != None): interval = intervals[i] else: interval = None print self.trimSeq(x, qualityCutOff, byInterval, interval, mott, limitValue) return ''
[docs] def reverseComplement(self, nameSeq): """ Compute the reverse complement of a given sequence. Parameters ---------- sequence: Name of the sequence whose reverse complement is to be computed. Returns ------- sequence which is the reverse complement of the input sequence. """ seqDict = self.sequenceDict() sequence = seqDict[nameSeq] new_seq = SequenceManipulation(sequence) return new_seq.reverseComplement()
[docs] def reverseComplementAll(self): """ Compute the reverse complements of all the sequences in the given FASTA file. Parameters ---------- sequence: Name of the sequence whose reverse complement is to be computed. Returns ------- Prints the reverse complements. """ seqDict = self.sequenceDict() for i in range(len(seqDict.keys())): x = seqDict.keys()[i] print self.reverseComplement(x) return ''