Source code for fastAQ.fastaInfo

# -*- coding: utf-8 -*-
#	Parsing a fasta file.
#	Author - Janu Verma
#	jv367@cornell.edu

import sys
from collections import OrderedDict

from sequenceOperations import SequenceManipulation
from trimming import Trimming


class FastaParser:
    """
    Parses a FASTA file to extract the sequences and header information, if any.

    Parameters
    ----------
    fasta_file : Path of the FASTA file to be parsed (it is passed to ``open``).

    Notes
    -----
    Nothing is cached: every method call re-reads the file.
    Sequence names are the complete header lines, including the leading '>'.

    Example
    -------
    >>> import sys
    >>> input_file = sys.argv[1]
    >>> out = FastaParser(input_file)
    >>> seqDict = out.sequenceDict()
    >>> print(len(seqDict))
    """

    def __init__(self, fasta_file):
        self.ff = fasta_file

    def readFasta(self, fastaFile):
        """
        Reads and parses FASTA formatted lines.

        Parameters
        ----------
        fastaFile : An open FASTA file, or any iterable of lines.

        Returns
        -------
        Generator of (header, sequence) tuples. The header is the full header
        line (with the leading '>'), the sequence is the concatenation of all
        lines up to the next header. Lines that appear before the first header
        are discarded.
        """
        name, seq = None, []
        for line in fastaFile:
            # rstrip() removes the line terminator (including '\r\n') and
            # any other trailing whitespace.
            line = line.rstrip()
            if line.startswith(">"):
                if name is not None:
                    yield (name, ''.join(seq))
                name, seq = line, []
            else:
                seq.append(line)
        # Emit the last record, which has no following header to trigger it.
        if name is not None:
            yield (name, ''.join(seq))

    def sequenceDict(self):
        """
        Creates a dictionary of sequences keyed by their header.

        Returns
        -------
        An ordered dictionary mapping each header to its sequence, in the
        order the records appear in the file. If a header occurs more than
        once, the last sequence read for it is kept.
        """
        # An OrderedDict keeps file order on every Python version; maskAll and
        # trimAll rely on that order to pair sequences with intervals.
        sequences = OrderedDict()
        with open(self.ff) as fastaFile:
            for name, seq in self.readFasta(fastaFile):
                sequences[name] = seq
        return sequences

    def seqNames(self):
        """
        Names/Headers of all the sequences.

        Returns
        -------
        A list of names of all the sequences in the FASTA file, in file order.
        """
        # list(...) yields a real list on Python 3 as well, where keys() is
        # only a view.
        return list(self.sequenceDict())

    def seqFromName(self, name):
        """
        Extract the sequence corresponding to the given name.

        Parameters
        ----------
        name : Name (full header line) of the sequence to be retrieved.

        Returns
        -------
        Sequence corresponding to the input name.

        Raises
        ------
        KeyError : If no sequence with that name is in the file.
        """
        return self.sequenceDict()[name]

    def reverseComplement(self, nameSeq):
        """
        Compute the reverse complement of a given sequence.

        Parameters
        ----------
        nameSeq : Name (full header line) of the sequence whose reverse
            complement is to be computed.

        Returns
        -------
        The result of SequenceManipulation.reverseComplement() for the
        named sequence.

        Raises
        ------
        KeyError : If no sequence with that name is in the file.
        """
        sequence = self.sequenceDict()[nameSeq]
        return SequenceManipulation(sequence).reverseComplement()

    def maskSeq(self, name, interval, toLower=False, maskingChar='N'):
        """
        Masks the sequence based on the given interval.

        Parameters
        ----------
        name : Name/header of the sequence.
        interval : A tuple containing the start and end positions for the masking.
        toLower : If True, the sequence in the interval is converted to lower
            case bases. Default is False.
        maskingChar : Masking character. Default is 'N'.

        Returns
        -------
        Masked sequence, as returned by SequenceManipulation.maskSequence().

        Raises
        ------
        KeyError : If no sequence with that name is in the file.
        """
        sequence = self.sequenceDict()[name]
        masker = SequenceManipulation(sequence)
        # Forward the caller's options; they used to be silently replaced by
        # the defaults.
        return masker.maskSequence(interval, toLower=toLower,
                                   maskingChar=maskingChar)

    def trimSeq(self, name, interval, quality=None):
        """
        Trims the sequence from both sides based on the interval.

        Parameters
        ----------
        name : Name/header of the sequence to be trimmed.
        interval : The interval containing the number of bp's to be trimmed
            from left and right side respectively.
        quality : Passed through unchanged to Trimming. Default is None.

        Returns
        -------
        Trimmed sequence, as returned by Trimming.trimSequence().

        Raises
        ------
        KeyError : If no sequence with that name is in the file.
        """
        sequence = self.sequenceDict()[name]
        return Trimming(sequence, quality).trimSequence(interval)

    def _checkIntervalCount(self, seqDict, intervals):
        """Raise IndexError unless there is at least one interval per sequence."""
        if len(intervals) < len(seqDict):
            raise IndexError(
                "expected at least %d intervals (one per sequence), got %d"
                % (len(seqDict), len(intervals)))

    def maskAll(self, intervals, toLower=False, maskingChar='N'):
        """
        Masks the sequences in the FASTA file based on the given intervals.

        Parameters
        ----------
        intervals : A list of tuples containing the start and end positions
            for the masking. The i-th interval is applied to the i-th sequence
            in the file; surplus intervals are ignored.
        toLower : If True, the sequence in the interval is converted to lower
            case bases. Default is False.
        maskingChar : Masking character. Default is 'N'.

        Returns
        -------
        An empty string. The masked sequences are printed, one per line.

        Raises
        ------
        IndexError : If there are fewer intervals than sequences.
        """
        # Parse the file once instead of once per sequence.
        seqDict = self.sequenceDict()
        self._checkIntervalCount(seqDict, intervals)
        for sequence, interval in zip(seqDict.values(), intervals):
            masker = SequenceManipulation(sequence)
            print(masker.maskSequence(interval, toLower=toLower,
                                      maskingChar=maskingChar))
        return ''

    def trimAll(self, intervals, quality=None):
        """
        Trims all the sequences in the FASTA file from both sides based on the intervals.

        Parameters
        ----------
        intervals : A list of tuples containing the number of bp's to be
            trimmed from left and right side respectively. The i-th interval
            is applied to the i-th sequence in the file; surplus intervals
            are ignored.
        quality : Passed through unchanged to Trimming for every sequence.
            Default is None.

        Returns
        -------
        An empty string. The trimmed sequences are printed, one per line.

        Raises
        ------
        IndexError : If there are fewer intervals than sequences.
        """
        # Parse the file once instead of once per sequence.
        seqDict = self.sequenceDict()
        self._checkIntervalCount(seqDict, intervals)
        for sequence, interval in zip(seqDict.values(), intervals):
            print(Trimming(sequence, quality).trimSequence(interval))
        return ''

    def reverseComplementAll(self):
        """
        Compute the reverse complements of all the sequences in the given FASTA file.

        Returns
        -------
        An empty string. The reverse complements are printed, one per line,
        in file order.
        """
        # Parse the file once instead of once per sequence.
        for sequence in self.sequenceDict().values():
            print(SequenceManipulation(sequence).reverseComplement())
        return ''