Example with C. elegans:
import rpy2.robjects as robjects
import bioc.biostrings as bs
import bioc.bsgenome as bg
import rpy2.rlike.container as rlc
from rpy2.robjects.packages import importr
bsgenome_celegans = importr('BSgenome.Celegans.UCSC.ce2')
celegans = bsgenome_celegans.Celegans
Listing the sequence names:
>>> tuple(celegans.seqnames)
['chrI', 'chrII', 'chrIII', 'chrIV', 'chrV', 'chrX', 'chrM']
Extracting a chromosome sequence is working the same way as it would with R:
>>> chrI = celegans['chrI']
>>> chrI
<DNAString - Python:0x9ca44ac / R:0xbc1a8c0>
import urllib
sanger_ftp = 'ftp://ftp.sanger.ac.uk/pub/databases/'
celegans_path = 'C.elegans_sequences/REPEATS/'
#'http://www.sanger.ac.uk/Projects/C_elegans/REPEATS/elegans.lib'
#'http://www.sanger.ac.uk/Projects/C_elegans/REPEATS/briggsae.lib'
repeats_dir = urllib.urlopen(sanger_ftp + celegans_path)
import re
fasta_items = []
for row in repeats_dir.readlines():
m = re.match('.+?([^ ]+.fasta)\r\n$', row)
if m is not None:
fasta_item = urllib.urlopen(sanger_ftp + celegans_path + m.groups(0)[0])
line = fasta_item.readline()
while not line.startswith('>'):
line = fasta_item.readline()
if not line.startswith('>'):
print('Error: not fasta header in %s' %m.groups(0)[0])
continue
fasta_header = line.strip()[1:]
fasta_sequence = ''
for line in fasta_item.readlines():
fasta_sequence += line.strip()
fasta_items.append((fasta_header, fasta_sequence))
#-- subset of the repeated sequences
cerepdna_items = [x for x in fasta_items \
if x[0].startswith('CEREP53') \
or x[0].startswith('CEREP55')]
For speed, we only consider a subset of the sequences for now.
cerepdna_items = [x for x in fasta_items \
if x[0].startswith('CEREP53') \
or x[0].startswith('CEREP55')]
matchpattern = bs.biostrings.matchPattern
matches = []
for header, sequence in cerepdna_items:
print('matching: %s' %header)
matches.append([])
for chromosome_name in celegans.seqnames:
print(' chromosome %s' %chromosome_name)
chromosome = celegans[chromosome_name]
matches[-1].append([])
for n_mm in range(6):
m = matchpattern(sequence, chromosome, max_mismatch = n_mm)
matches[-1][-1].append(m)
m_info = []
for header, sequence in cerepdna_items:
m_info.append([])
for chromosome_name in celegans.seqnames:
m_info[-1].append([])
for n_mm in range(3):
m_info[-1][-1].append((header, chromosome_name, n_mm))
nrows = sum([len(chr_m) for m in matches for chr_m in m])
#sum([chr_m for m in matches for chr_m in m for mm in chr_m])