# Source code for fastAQ.fastqInfo

# -*- coding: utf-8 -*-
#	Parsing a fastq file.
#	Author - Janu Verma
#	jv367@cornell.edu

import sys
from trimming import Trimming
from sequenceOperations import SequenceManipulation

class FastqParser:
    """
    Parses a FASTQ file to extract the sequences and the base qualities.

    Every record is expected to span exactly four lines: the '@' header,
    the sequence, the '+' separator and the quality string.

    Parameters
    ----------
    fastq_file : Path of the FASTQ file to be parsed.


    Example
    -------
    >>> import sys
    >>> input_file = sys.argv[1]
    >>> out = FastqParser(input_file)
    >>> seqDict = out.sequenceDict()
    >>> print(len(seqDict))
    """

    def __init__(self, fastq_file):
        # Only the path is stored; the file is (re)opened by each query.
        self.ff = fastq_file

    def readFastq(self, fastqFile):
        """
        Reads and parses the FASTQ file.

        Fields are identified by their position inside the four-line
        record rather than by their first character, because a quality
        string may legitimately start with '@' or with a base letter.

        Parameters
        ----------
        fastqFile - An iterable of lines, e.g. an open FASTQ file.

        Returns
        ------
        Generator object yielding (header, sequence, qualities) tuples.
        The values are the raw lines, line endings included. A record
        that is cut short by the end of the input is still yielded, with
        '' in place of the missing fields.
        """
        header, bases = None, None
        # 0: expecting header, 1: sequence, 2: '+' separator, 3: qualities
        position = 0
        for line in fastqFile:
            if position == 0:
                # Blank or stray lines between records are skipped until
                # the next header shows up.
                if line.startswith("@"):
                    header, bases = line, None
                    position = 1
            elif position == 1:
                bases = line
                position = 2
            elif position == 2:
                # The separator line carries no information we keep.
                position = 3
            else:
                yield (header, bases, line)
                header, bases = None, None
                position = 0
        if header is not None:
            # Input ended in the middle of a record.
            yield (header, bases or '', '')

    def _loadRecords(self):
        """
        Reads the whole file once.

        Returns
        -------
        A tuple (names, sequences, qualities): the list of stripped
        headers in order of first appearance, and two dictionaries
        mapping each header to its stripped sequence / quality string.
        When a header occurs more than once the last record wins.
        """
        names, sequences, qualities = [], {}, {}
        with open(self.ff) as handle:
            for header, bases, quals in self.readFastq(handle):
                key = header.strip()
                if key not in sequences:
                    names.append(key)
                sequences[key] = bases.strip()
                qualities[key] = quals.strip()
        return names, sequences, qualities

    def _maskOne(self, sequence, interval, toLower, maskingChar):
        """Masks one sequence string by delegating to SequenceManipulation.maskSequence."""
        masker = SequenceManipulation(sequence)
        return masker.maskSequence(interval, toLower=toLower, maskingChar=maskingChar)

    def _trimOne(self, sequence, quality, qualityCutOff, byInterval, interval, mott, limitValue):
        """Trims one sequence; byInterval takes precedence over mott, quality trimming is the fallback."""
        trimmer = Trimming(sequence, quality)
        if byInterval:
            return trimmer.trimSequence(interval)
        if mott:
            return trimmer.mott(limitValue)
        return trimmer.lowQualTrim(qualityCutOff)

    def sequenceDict(self):
        """
        Creates a dictionary of sequences with their header.

        Returns
        -------
        A dictionary mapping each stripped header line (leading '@'
        included) to its sequence.
        """
        return self._loadRecords()[1]

    def baseQualities(self):
        """
        Creates a dictionary of base qualities of the sequences.

        Returns
        -------
        A dictionary mapping each stripped header line to its quality string.
        """
        return self._loadRecords()[2]

    def seqNames(self):
        """
        Names/Headers of all the sequences.

        Returns
        -------
        A list of names of all the sequences in the FASTQ file, in the
        order in which they first appear.
        """
        return self._loadRecords()[0]

    def trimSeq(self, name, qualityCutOff=0, byInterval=False, interval=None, mott=False, limitValue=None):
        """
        Trims the sequence.

        Parameters
        ----------
        name : Name/header of the sequence to be trimmed.
        qualityCutOff : Threshold value of the quality for trimming sequence based on removing low quality bases.
        byInterval : If True, the sequence will be trimmed by removing bases according to the given interval.
        interval : The interval containing the number of bp's to be trimmed from left and right side respectively.
                    Need byInterval to be True.
        mott : If True, the sequence will be trimmed according to the Mott's algorithm.
        limitValue : Numerical value of the limit to be used in Mott's algorithm.
                    Requires mott to be True.

        Returns
        -------
        Trimmed sequence.

        Raises
        ------
        KeyError if no sequence with the given name exists in the file.
        """
        _, sequences, qualities = self._loadRecords()
        return self._trimOne(sequences[name], qualities[name], qualityCutOff,
                             byInterval, interval, mott, limitValue)

    def maskSeq(self, name, interval, toLower=False, maskingChar='N'):
        """
        Masks the sequence based on the given interval.

        Parameters
        ---------
        name: Name/header of the sequence.
        interval: A tuple containing the start and end positions for the masking.
        toLower: If True, the sequence in the interval is converted to lower case bases.
                        Default is False.
        maskingChar :  Masking character. Default is 'N'.

        Returns
        -------
        Masked sequence.

        Raises
        ------
        KeyError if no sequence with the given name exists in the file.
        """
        sequence = self.sequenceDict()[name]
        # Forward the caller's options instead of hard-coded defaults.
        return self._maskOne(sequence, interval, toLower, maskingChar)

    def maskAll(self, intervals, toLower=False, maskingChar='N'):
        """
        Masks the sequences in the FASTQ file based on the given intervals
        and prints each masked sequence.

        Parameters
        ---------
        intervals: A list of tuples containing the start and end positions for the masking.
                        intervals[i] is applied to the i-th sequence of the file.
        toLower: If True, the sequence in the interval is converted to lower case bases.
                        Default is False.
        maskingChar :  Masking character. Default is 'N'.

        Returns
        -------
        An empty string; the masked sequences are printed.
        """
        names, sequences, _ = self._loadRecords()
        for i, name in enumerate(names):
            print(self._maskOne(sequences[name], intervals[i], toLower, maskingChar))
        return ''

    def trimAll(self, qualityCutOff=0, byInterval=False, intervals=None, mott=False, limitValue=None):
        """
        Trims all the sequences in the FASTQ file and prints each result.

        Parameters
        ----------
        qualityCutOff : Quality threshold used when neither byInterval nor mott is set.
        byInterval : If True, trim by the per-sequence intervals.
        intervals : A list of tuples containing the number of bp's to be trimmed from left and right side respectively.
                        intervals[i] is applied to the i-th sequence of the file.
        mott : If True (and byInterval is False), trim with Mott's algorithm.
        limitValue : Limit passed to Mott's algorithm.

        Returns
        -------
        An empty string; the trimmed sequences are printed.
        """
        names, sequences, qualities = self._loadRecords()
        useIntervals = byInterval and intervals is not None
        for i, name in enumerate(names):
            interval = intervals[i] if useIntervals else None
            print(self._trimOne(sequences[name], qualities[name], qualityCutOff,
                                byInterval, interval, mott, limitValue))
        return ''

    def reverseComplement(self, nameSeq):
        """
        Compute the reverse complement of a given sequence.

        Parameters
        ----------
        nameSeq: Name of the sequence whose reverse complement is to be computed.

        Returns
        -------
        sequence which is the reverse complement of the input sequence.

        Raises
        ------
        KeyError if no sequence with the given name exists in the file.
        """
        sequence = self.sequenceDict()[nameSeq]
        return SequenceManipulation(sequence).reverseComplement()

    def reverseComplementAll(self):
        """
        Compute and print the reverse complements of all the sequences
        in the given FASTQ file.

        Returns
        -------
        An empty string; the reverse complements are printed.
        """
        names, sequences, _ = self._loadRecords()
        for name in names:
            print(SequenceManipulation(sequences[name]).reverseComplement())
        return ''
# Navigation
#
# Source code for fastAQ.fastqInfo
#
# Quick search
#
# Navigation