Case study: Analysis of the C. elegans genomeΒΆ

Example with C. elegans:

import rpy2.robjects as robjects
import bioc.biostrings as bs
import bioc.bsgenome as bg
import rpy2.rlike.container as rlc

from rpy2.robjects.packages import importr

bsgenome_celegans = importr('BSgenome.Celegans.UCSC.ce2')

celegans = bsgenome_celegans.Celegans

Listing the sequence names:

>>> tuple(celegans.seqnames)
['chrI', 'chrII', 'chrIII', 'chrIV', 'chrV', 'chrX', 'chrM']

Extracting a chromosome sequence is working the same way as it would with R:

>>> chrI = celegans['chrI']
>>> chrI
<DNAString - Python:0x9ca44ac / R:0xbc1a8c0>
import urllib
sanger_ftp = 'ftp://ftp.sanger.ac.uk/pub/databases/'
celegans_path = 'C.elegans_sequences/REPEATS/'
#'http://www.sanger.ac.uk/Projects/C_elegans/REPEATS/elegans.lib'
#'http://www.sanger.ac.uk/Projects/C_elegans/REPEATS/briggsae.lib'
repeats_dir = urllib.urlopen(sanger_ftp + celegans_path)
import re

fasta_items = []
for row in repeats_dir.readlines():
    m = re.match('.+?([^ ]+.fasta)\r\n$', row)
    if m is not None:
        fasta_item = urllib.urlopen(sanger_ftp + celegans_path + m.groups(0)[0])

        line = fasta_item.readline()
        while not line.startswith('>'):
            line = fasta_item.readline()
        if not line.startswith('>'):
            print('Error: not fasta header in %s' %m.groups(0)[0])
            continue
        
        fasta_header = line.strip()[1:]
        fasta_sequence = ''
        for line in fasta_item.readlines():
            fasta_sequence += line.strip()
        fasta_items.append((fasta_header, fasta_sequence))

#-- subset of the repeated sequences
cerepdna_items = [x for x in fasta_items \
                      if x[0].startswith('CEREP53') \
                      or x[0].startswith('CEREP55')]

For speed, we only consider a subset of the sequences for now.

cerepdna_items = [x for x in fasta_items \
                      if x[0].startswith('CEREP53') \
                      or x[0].startswith('CEREP55')]
matchpattern = bs.biostrings.matchPattern
matches = []
for header, sequence in cerepdna_items:
    print('matching: %s' %header)
    matches.append([])
    for chromosome_name in celegans.seqnames:
        print('  chromosome %s' %chromosome_name)
        chromosome = celegans[chromosome_name]
        matches[-1].append([])
        for n_mm in range(6):
            m = matchpattern(sequence, chromosome, max_mismatch = n_mm)
            matches[-1][-1].append(m)
m_info = []
for header, sequence in cerepdna_items:
    m_info.append([])
    for chromosome_name in celegans.seqnames:        
        m_info[-1].append([])
        for n_mm in range(3):
            m_info[-1][-1].append((header, chromosome_name, n_mm))

nrows = sum([len(chr_m) for m in matches for chr_m in m])
#sum([chr_m for m in matches for chr_m in m for mm in chr_m])

Previous topic

ggbase: infrastructure for the genetics of gene expression

This Page