Source code for bioplus.seqtools

import collections
import operator
from itertools import izip, imap
from numpy.core.fromnumeric import std 

[docs]def get_first(seq1, seq2, ignore_case=True): """ returns the alphabetically prior of seq1 and seq2 if they are the same or we fail to order, return seq1 if ignore_case=True, we apply str.upper to each letter """ if ignore_case: upper = str.upper for a, b in izip(seq1, seq2): A = upper(a) B = upper(b) if A == B: continue elif A > B: return seq1 elif B > A: return seq2 else: for a, b in izip(seq1, seq2): if a == b: continue elif a > b: return seq1 elif b > a: return seq2 return seq1
[docs]def count_seqs(iterable, reverse_complement=True, ignore_case=True): ''' count_seqs takes as input an iterable that yields sequences, and then returns the counts of each sequence as a defaultdict (similar to dict, can be recast as dict type). Sorting: Use dictSort to sort, if needed. Reverse complements: if rc=True, count_seqs will count reverse complements as the same sequence and use the alphabetically prior sequence as the key. if rc=False, reverse complements will be treated as different sequences. ''' seq_counts = collections.defaultdict(lambda: 0) if reverse_complement: orient_seq = lambda s: get_first(s, rc(s), ignore_case=ignore_case) oriented_seqs = imap(orient_seq, iterable) for seq in oriented_seqs: seq_counts[seq]+=1 else: # if not rc for seq in iterable: seq_counts[seq]+=1 return seq_counts
[docs]def count_compare(a, b, rc=False): """ count_compare takes two default dictionaries (defaultdict objects), a and b, each with default factory "lambda: 0" and key-value pairs that specify the counts (value) of each sequence (key), and returns a new dictionary whose values are tuples (value_a,value_b), the values from a and b. Reverse complements: if rc=True, count_seqs will count reverse complements as the same sequence and use the alphabetically prior sequence as the key. if rc=False, reverse complements will be treated as different sequences. If you have already ensured that the keys meet this condition, you should use rc=False, but rc=True is also safe. note: defaultdict is in the collections module of the standard library """ if not (type(a)==collections.defaultdict and type(b)==collections.defaultdict): raise ValueError('a and b must be default dictionaries (collections.defaultdict)') # proceed comparedCounts = {} for k in set( a.keys(), b.keys() ): comparedCounts[k] = (a[k],b[k]) return comparedCounts
[docs]def analyze_sites(some_list): ''' requires the following input format: seq score cdist cons cdist = distance to center, cons = conservation value ''' # column definitions for reference / calling x0 = 0 # x0 is sequence x1 = 1 # x1 is score (Ri) x2 = 2 # x2 is peaklength x3 = 3 # x3 is cdist x4 = 4 # x4 is conservation score # columns are seq, score, peaklength, cdist, cons # build an array scalar with the correct column types for x in some_list: # forces sequences to uppercase x[x0] = x[x0].upper() # forces length to int x[x2] = int(x[x2]) # forces cdist to int x[x3] = int(x[x3]) # forces cons score to floating point # and correct nan if x[x4]=='nan': x[x4] = 0.0 else: x[x4] = float(x[x4]) # sort the list by sequence (arbitrarily) sorted_list = sorted(some_list, key=operator.itemgetter(x0)) # Part 3: continue with analysis analyzedSeqs = [] counter = len(sorted_list) while counter > 0: site = sorted_list.pop() counter -= 1 seq = site[x0] score = site[x1] # start a list of dists, absdists, cscores, we'll sum later i = 1 plengths = [ site[x2] ] dists = [ site[x3] ] absdists = [ abs(site[x3]) ] cscores = [ site[x4] ] while True: # grab sequences that match try: y = sorted_list.pop() except IndexError: break counter -= 1 # look if we match the last entry if y[x0] == seq: i += 1 plengths.append(y[x2]) dists.append(y[x3]) absdists.append( abs( y[x3] ) ) cscores.append(y[x4]) # if we don't, escape to previous loop else: # append back sorted_list.append(y) break # append seq, sum of scores, # of instances avgPlength = float(sum(plengths))/i avgDist = float(sum(dists))/i avgAbsdist = float(sum(absdists))/i distErr = std(absdists) analyzedSeqs.append([seq, score, i, sum(cscores), avgPlength, avgDist, avgAbsdist,distErr]) analyzedSeqs.reverse() return sorted(analyzedSeqs, key=operator.itemgetter(x2))
[docs]def complement(nucleotide): '''returns the complement of a single nucleotide''' complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a'} return complement.setdefault(nucleotide,'N')
[docs]def rc(seq): '''returns the reverse complement of a sequence''' L = [complement(s) for s in seq] L.reverse() return ''.join(L)
reversecomplement = rc