Source code for bioplus.fasta

'''meta-tools for large FASTA files'''
import random
import itertools
import os.path
from Bio.SeqIO import parse, write
import Bio.Seq

def count(foo):
    '''
    count the records in a FASTA file

    foo is the path of a FASTA file. Returns the number of sequence records
    that Bio.SeqIO.parse yields from it (this is a record count, not a count
    of text lines).
    '''
    # Plain text mode replaces the old 'rU' flag, which Python 3.11+ rejects
    # with ValueError; the with-block closes the handle that used to leak.
    with open(foo) as handle:
        return sum(1 for _ in parse(handle, 'fasta'))
def random_seq(foo, n):
    '''
    pick n random records from a FASTA file

    foo is the path of a FASTA file and n the number of records wanted.
    Returns a list of the selected records in the order they appear in the
    file. Records are chosen without replacement; if n exceeds the number of
    records in the file every record is returned, and n <= 0 gives [].
    '''
    max_n = count(foo)
    # Draw distinct 1-based record numbers. The previous code repeated one
    # single random number n times inside an iterator that the membership
    # test then exhausted, so it could never return more than one record.
    how_many = max(0, min(n, max_n))
    wanted = set(random.sample(range(1, max_n + 1), how_many))
    with open(foo) as handle:
        return [rec
                for i, rec in enumerate(parse(handle, 'fasta'), 1)
                if i in wanted]
def reader(foo):
    '''
    generator yielding the .seq attribute of each record parsed from the
    FASTA file at path foo

    The file is opened when iteration starts and closed when the generator
    is exhausted or closed.
    '''
    # Plain text mode replaces 'rU', which Python 3.11+ no longer accepts.
    with open(foo) as handle:
        for record in parse(handle, 'fasta'):
            yield record.seq
def writer(foo, iterable):
    '''
    writes SeqRecord objects from iterable to FASTA file foo.

    Warning: overwrites foo, does not append
    '''
    # Close the handle explicitly so the output is flushed to disk before
    # returning, instead of relying on garbage collection to do it.
    with open(foo, 'w') as handle:
        write(iterable, handle, 'fasta')
def random_files(foo, n, R):
    '''
    takes a FASTA file foo and creates (in the current directory) R random
    files, each containing n random sequences from foo

    Files are named <basename of foo>_random<k>.fa for k in 0..R-1, with k
    zero-padded to the number of digits in R.
    '''
    # Only the file name is used, so output lands in the current directory.
    prefix = os.path.basename(foo)
    # R is a count: iterate range(R) (iterating the int itself raised
    # TypeError), and sample with random_seq (the old code tried to call the
    # random module).
    for filenumber in range(R):
        boo = prefix + '_random' + pad(filenumber, R) + '.fa'
        writer(boo, random_seq(foo, n))
def pad(x, y):
    '''
    zero-pad x to the printed width of y

    Both arguments are converted with str(); the text of x is left-filled
    with '0' until it is as long as the text of y. If x is already at least
    that long it is returned unchanged.
    '''
    target_width = len(str(y))
    text = str(x)
    return text.zfill(target_width)
def truncate_lines(f, n):
    '''
    truncate_lines(f,n) truncates lines in a file to at most n characters

    Reads the text file at path f and writes the result to a new file named
    '<f>.<n>'. Every output line ends with a newline, including the last one.

    See truncate_seqs to truncate sequences instead of lines
    '''
    with open(f, 'r') as seqs, open('{!s}.{!s}'.format(f, n), 'w') as tseqs:
        # Drop the line's own newline before slicing; otherwise a line
        # shorter than n keeps it and gains a second one, which inserted
        # blank lines into the output.
        tseqs.writelines(seq.rstrip('\n')[0:n] + '\n' for seq in seqs)
def truncate_seqs(f, n):
    '''
    truncate FASTA seqs

    truncates sequences in a FASTA file named f to at most n bases, writing
    a new FASTA file named f + str(n) + '.fa'

    NOTE(review): earlier documentation described the output as 'f.n.fa',
    but no dot is inserted between f and n; the existing file name is kept
    so that current callers are not broken.
    '''
    # USE FILENAME CORRECTION SCHEME
    foo = f + str(n) + '.fa'
    # Keep the input open only while writer consumes the lazy generator,
    # then close it (the handle used to leak). Plain text mode replaces
    # 'rU', which Python 3.11+ rejects.
    with open(f) as handle:
        writer(foo, (rec[0:n] for rec in parse(handle, 'fasta')))
def permute_fasta(f):
    '''
    takes a FASTA file and writes a new FASTA file, f + '_permuted.fa',
    with each sequence randomly permuted (separately, such that its
    % A,T,G,C doesn't change)

    Returns None; the permuted records are only written to disk.
    '''
    with open(f + '_permuted.fa', 'w') as output:
        # Plain text mode replaces 'rU', which Python 3.11+ rejects.
        with open(f) as fobj:
            for seq_rec in parse(fobj, 'fasta'):
                # Shuffle a list of the residues and rebuild a Seq, rather
                # than going through Seq.tomutable, which current Biopython
                # releases no longer provide.
                bases = list(str(seq_rec.seq))
                random.shuffle(bases)
                seq_rec.seq = Bio.Seq.Seq(''.join(bases))
                write(seq_rec, output, 'fasta')