Source code for pytadbit.chromosome

"""
26 Nov 2012


"""

from os.path                           import exists
from pytadbit.boundary_aligner.aligner import align
from pytadbit                          import tadbit
from pytadbit.experiment               import Experiment
from string                            import ascii_lowercase as letters
from warnings                          import warn
from copy                              import deepcopy as copy
from cPickle                           import load, dump
from pytadbit.alignment                import Alignment, randomization_test
from numpy                             import log2
from random                            import random

try:
    from matplotlib import pyplot as plt
except ImportError:
    warn('matplotlib not found\n')




[docs]def load_chromosome(in_f, fast=2):
    """
    Load a Chromosome object from a file. A Chromosome object can be saved with
    the :func:`Chromosome.save_chromosome` function. 
    
    :param in_f: path to a saved Chromosome object file
    :param 2 fast: if fast=2 do not load the Hi-C data (in the case that they 
       were saved in a separate file see :func:`Chromosome.save_chromosome`).
       If fast is equal to 1, the weights will be skipped from load to save 
       memory. Finally if fast=0, both the weights and Hi-C data will be loaded
    
    :returns: a Chromosome object

    TODO: remove first try/except type error... this is loading old experiments
    """
    dico = load(open(in_f))
    name = ''
    crm = Chromosome(dico['name'])
    for name in dico['experiments']:
        xpr = Experiment(name, dico['experiments'][name]['resolution'], 
                         no_warn=True)
        xpr.tads       = dico['experiments'][name]['tads']
        xpr.norm       = dico['experiments'][name]['wght']
        xpr.hic_data   = dico['experiments'][name]['hi-c']
        xpr.conditions = dico['experiments'][name]['cond']
        xpr.size       = dico['experiments'][name]['size']
        try:
            crm.experiments.append(xpr)
        except TypeError:
            continue
    crm.size            = dico['size']
    crm.r_size          = dico['r_size']
    crm.max_tad_size    = dico['max_tad_size']
    crm.forbidden       = dico['forbidden']
    crm._centromere     = dico['_centromere']
    if type(dico['experiments'][name]['hi-c']) == str or fast!= int(2):
        try:
            dicp = load(open(in_f + '_hic'))
        except IOError:
            raise Exception('ERROR: file %s not found\n' % (
                dico['experiments'][name]['hi-c']))
        for name in dico['experiments']:
            crm.get_experiment(name).hic_data = dicp[name]['hi-c']
            if fast != 1:
                crm.get_experiment(name).norm = dicp[name]['wght']
    elif not fast:
        warn('WARNING: data not saved correctly for fast loading.\n')
    return crm


[docs]class Chromosome(object):
    """
    A Chromosome object designed to deal with Topologically Associating Domains
    predictions from different experiments, in different cell types for a given
    chromosome of DNA, and to compare them.

    :param name: name of the chromosome (might be a chromosome name for example)
    :param None resolutions: list of resolutions corresponding to a list of
       experiments passed.
    :param None experiment_hic_data: :py:func:`list` of paths to files
       containing the Hi-C count matrices corresponding to different experiments
    :param None experiment_tads: :py:func:`list` of paths to files
       containing the definition of TADs corresponding to different experiments
    :param None experiment_names: :py:func:`list` of the names of each 
        experiment
    :param 5000000 max_tad_size: maximum TAD size allowed. TADs longer than
        this value will not be considered, and size of the corresponding
        chromosome size will be reduced accordingly
    :param 0 chr_len: size of the DNA chromosome in bp. By default it will be
        inferred from the distribution of TADs
    :param None parser: a parser function that returns a tuple of lists 
       representing the data matrix and the length of a row/column. With
       the file example.tsv:

       ::
       
         chrT_001	chrT_002	chrT_003	chrT_004
         chrT_001	629	164	88	105
         chrT_002	164	612	175	110
         chrT_003	88	175	437	100
         chrT_004	105	110	100	278

       the output of parser('example.tsv') would be be:
       ``[([629, 164, 88, 105, 164, 612, 175, 110, 88, 175, 437, 100, 105,
       110, 100, 278]), 4]``

    :return: Chromosome object


    """
    def __init__(self, name, experiment_resolutions=None, experiment_tads=None,
                 experiment_hic_data=None, experiment_names=None,
                 max_tad_size=5000000, chr_len=0, parser=None,
                 centromere_search=False):
        self.name             = name
        self.max_tad_size     = max_tad_size
        self.size             = self._given_size = self.r_size = chr_len
        self.size             = ChromosomeSize(self.size)
        self.r_size           = RelativeChromosomeSize(self.size)
        self.forbidden        = {}
        self.experiments      = ExperimentList([], self)
        self._centromere      = None
        self.alignment        = AlignmentDict()
        
        self._search_centromere = centromere_search
        if experiment_tads:
            for i, handler in enumerate(experiment_tads or []):
                name = experiment_names[i] if experiment_names else None
                self.add_experiment(name, experiment_resolutions[i],
                                    tad_def=handler, parser=parser)
        if experiment_hic_data:
            for i, handler in enumerate(experiment_hic_data or []):
                name = experiment_names[i] if experiment_names else None
                try:
                    xpr = self.get_experiment(name)
                    xpr.load_hic_data(handler)
                    continue
                except:
                    pass
                if type(handler) == Experiment:
                    name = name or handler.name
                    self.experiments.append(handler)
                else:
                    self.add_experiment(name, experiment_resolutions[i],
                                        hic_data=handler, parser=parser)


    def _get_forbidden_region(self, xpr):
        """
        Find the regions for which there is no information in any of the
        experiments. This is used to infer the relative chromosome size.
        """
        if not xpr.tads:
            return
        forbidden = []
        for pos in xpr.tads:
            start = float(xpr.tads[pos]['start'])
            end   = float(xpr.tads[pos]['end'])
            diff  = end - start
            if diff * xpr.resolution > self.max_tad_size:
                forbidden += range(int(start), int(end+1))
                xpr.tads[pos]['score'] = -abs(xpr.tads[pos]['score'])
        if not self.forbidden:
            self.forbidden = dict([(f, None) for f in forbidden])
        else:
            self.forbidden = dict([(f, None) for f in 
                                   set(forbidden).intersection(self.forbidden)])
        # search for centromere:
        self._find_centromere(xpr)
        # add centromere as forbidden region:
        if self._centromere:
            for pos in xrange(int(self._centromere[0]),
                              int(self._centromere[1])):
                self.forbidden[pos] = 'Centromere'
        self.__update_size(xpr)


[docs]    def get_experiment(self, name):
        """
        This can also be done directly with Chromosome.experiments[name].
        
        :param name: name of the experiment to select
        :returns: :class:`pytadbit.Experiment`
        """
        for exp in self.experiments:
            if exp.name == name:
                return exp
        raise Exception('ERROR: experiment ' +
                        '%s not found\n' % (name))
                

[docs]    def save_chromosome(self, out_f, fast=True, divide=True, force=False):
        """
        Save a Chromosome object to a file (it uses :py:func:`pickle.load` from
        the :py:mod:`cPickle`). Once saved, the object can be loaded with
        :func:`load_chromosome`.

        :param out_f: path to the file where to store the :py:mod:`cPickle`
           object
        :param True fast: if True, skip Hi-C data and weights
        :param True divide: if True writes two pickles, one with what would
           result by using the fast option, and the second with the Hi-C and 
           weights data. The second file name will be extended by '_hic' (ie:
           with out_f='chromosome12.pik' we would obtain chromosome12.pik and
           chromosome12.pik_hic). When loaded :func:`load_chromosome` will
           automatically search for both files
        :param False force: overwrite the existing file

        """
        while exists(out_f) and not force:
            out_f += '_'
        dico = {}
        dico['experiments'] = {}
        if divide:
            dicp = {}
        for xpr in self.experiments:
            dico['experiments'][xpr.name] = {
                'size'      : xpr.size,
                'cond'      : xpr.conditions,
                'tads'      : xpr.tads,
                'resolution': xpr.resolution,
                'hi-c'      : None,
                'wght'      : None}
            if fast:
                continue
            if divide:
                dicp[xpr.name] = {
                    'wght': xpr.norm,
                    'hi-c': xpr.hic_data}
                dico['experiments'][xpr.name]['wght'] = None
                dico['experiments'][xpr.name]['hi-c'] = None
            else:
                dico['experiments'][xpr.name]['wght'] = xpr.norm
                dico['experiments'][xpr.name]['hi-c'] = xpr.hic_data
        dico['name']            = self.name
        dico['size']            = self.size
        dico['r_size']          = self.r_size
        dico['max_tad_size']    = self.max_tad_size
        dico['forbidden']       = self.forbidden
        dico['_centromere']     = self._centromere
        out = open(out_f, 'w')
        dump(dico, out)
        out.close()
        if not fast:
            out = open(out_f + '_hic', 'w')
            dump(dicp, out)
            out.close()


[docs]    def align_experiments(self, names=None, verbose=False, randomize=False,
                          rnd_method='interpolate', rnd_num=1000, **kwargs):
        """
        Align the predicted boundaries of two different experiments. The 
        resulting alignment will be stored in the self.experiment list.
        
        :param None names: list of names of the experiments to align. If None,
            align all
        :param experiment1: name of the first experiment to align
        :param experiment2: name of the second experiment to align
        :param -0.1 penalty: penalty for inserting a gap in the alignment
        :param 100000 max_dist: maximum distance between two boundaries
            allowing match (100Kb seems fair with HUMAN chromosomes)
        :param False verbose: if True, print some information about the 
            alignments
        :param False randomize: check the alignment quality by comparing
            randomized boundaries over Chromosomes of the same size. This will
            return a extra value, the p-value of accepting that the observed
            alignment is not better than a random alignment
        :param interpolate rnd_method: by default uses the interpolation of TAD
           distribution. The alternative method is 'shuffle', where TADs are
           simply shuffled
        :param 1000 rnd_num: number of randomizations to do
        :param reciprocal method: if global, Needleman-Wunsch is used to align
            (see :func:`pytadbit.boundary_aligner.globally.needleman_wunsch`);
            if reciprocal, a method based on reciprocal closest boundaries is
            used (see :func:`pytadbit.boundary_aligner.reciprocally.reciprocal`)

        :returns: the alignment and the score of the alignment (by default)
        """
        if names:
            xpers = ExperimentList([self.get_experiment(n) for n in names],
                                   self)
        else:
            xpers = self.experiments
        tads = []
        for xpr in xpers:
            if not xpr.tads:
                raise Exception('No TADs defined, use find_tad function.\n')
            tads.append([xpr.tads[x]['brk'] * xpr.resolution for x in xpr.tads])
        # new
        aligneds, score = align(tads, verbose=verbose, **kwargs)
        name = tuple(sorted([x.name for x in xpers]))
        ali = Alignment(name, aligneds, xpers, score=score)
        self.alignment[name] = ali
        if verbose:
            print self.alignment[name]
        # old
        # self.alignment[name] = {}
        # for xpr, ali in zip(xpers, aligneds):
        #     self.alignment[name][xpr.name] = ali
        # if verbose:
        #     self.print_alignment(xpers=xpers)
        if not randomize:
            # return self.get_alignment(name), score
            return ali
        p_value = randomization_test(xpers, score=score, rnd_method=rnd_method,
                                     verbose=verbose, r_size=self.r_size,
                                     num=rnd_num, **kwargs)
        return score, p_value


[docs]    def add_experiment(self, name, resolution=None, tad_def=None,
                       hic_data=None, replace=False, parser=None,
                       conditions=None, **kwargs):
        """
        Add a Hi-C experiment to Chromosome
        
        :param name: name of the experiment or of the Experiment object
        :param resolution: resolution of the experiment (needed if name is not
           an Experiment object)
        :param None hic_data: whether a file or a list of lists corresponding to
           the Hi-C data
        :param None tad_def: a file or a dict with precomputed TADs for this
           experiment
        :param False replace: overwrite the experiments loaded under the same
           name
        :param None parser: a parser function that returns a tuple of lists 
           representing the data matrix and the length of a row/column. With 
           a file example.tsv containing:

           ::
           
             chrT_001	chrT_002	chrT_003	chrT_004
             chrT_001	629	164	88	105
             chrT_002	164	612	175	110
             chrT_003	88	175	437	100
             chrT_004	105	110	100	278
           
           the output of parser('example.tsv') would be:
           ``[([629, 164, 88, 105, 164, 612, 175, 110, 88, 175, 437, 100, 105,
           110, 100, 278]), 4]``
        
        """
        if not name:
            name = ''.join([letters[int(random() * len(letters))] \
                            for _ in xrange(5)])
            warn('No name provided, random name generated: %s\n' % (name))
        if name in self.experiments:
            if 'hi-c' in self.get_experiment(name) and not replace:
                warn('''Hi-C data already loaded under the name: %s.
                This experiment will be kept under %s.\n''' % (name,
                                                               name + '_'))
                name += '_'
        if type(name) == Experiment:
            self.experiments.append(name)
        elif resolution:
            self.experiments.append(Experiment(name, resolution, hic_data,
                                               tad_def, parser=parser,
                                               conditions=conditions, **kwargs))
        else:
            raise Exception('resolution param is needed\n')


[docs]    def find_tad(self, experiments, name=None, n_cpus=1, verbose=True,
                 max_tad_size="auto", no_heuristic=False, batch_mode=False,
                 use_visibility=False):
        """
        Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
        position of Topologically Associated Domains
        
        :param experiment: A square matrix of interaction counts of Hi-C
           data or a list of such matrices for replicated experiments. The
           counts must be evenly sampled and not normalized. 'experiment'
           can be either a list of lists, a path to a file or a file handler
        :param 1 n_cpus: The number of CPUs to allocate to TADBit. If
           n_cpus='max' the total number of CPUs will be used
        :param auto max_tad_size: an integer defining the maximum size of a 
           TAD. Default (auto) defines it as the number of rows/columns
        :param False no_heuristic: whether to use or not some heuristics
        :param False batch_mode: if True, all the experiments will be 
           concatenated into one for the search of TADs. The resulting TADs 
           found are stored under the name 'batch' plus a concatenation of the
           experiment names passed (e.g.: if experiments=['exp1', 'exp2'], the
           name would be: 'batch_exp1_exp2').

        TODO: check option -> name for batch mode... some dirty changes....

        """
        if batch_mode:
            matrix = []
            if not name:
                name = 'batch'
            experiments = experiments or self.experiments
            xprs = []
            for xpr in experiments:
                if not type(xpr) == Experiment:
                    xprs.append(self.get_experiment(xpr))
                else:
                    xprs.append(xpr)
            resolution = xprs[0].resolution
            for xpr in sorted(xprs, key=lambda x: x.name):
                if xpr.resolution != resolution:
                    raise Exception('All Experiments might have the same ' +
                                    'resolution\n')
                matrix.append(xpr.hic_data[0])
                if name.startswith('batch'):
                    name += '_' + xpr.name
            result, weights = tadbit(matrix,
                                     n_cpus=n_cpus, verbose=verbose,
                                     max_tad_size=max_tad_size,
                                     no_heuristic=no_heuristic,
                                     get_weights=True,
                                     use_visibility=use_visibility)
            experiment = Experiment(name, resolution, hic_data=matrix,
                                    tad_def=result, weights=weights)
            self.add_experiment(experiment)
            return
        if type(experiments) is not list:
            experiments = [experiments]
        for experiment in experiments:
            if not type(experiment) == Experiment:
                xpr = self.get_experiment(experiment)
            result, weights = tadbit(xpr.hic_data,
                                     n_cpus=n_cpus, verbose=verbose,
                                     max_tad_size=max_tad_size,
                                     no_heuristic=no_heuristic,
                                     get_weights=True,
                                     use_visibility=use_visibility)
            xpr.load_tad_def(result, weights=weights)
            if self._search_centromere:
                self._get_forbidden_region(xpr)


    def __update_size(self, xpr):
        """
        Update the chromosome size and relative size after loading new Hi-C
        experiments, unless the Chromosome size was defined by hand.
        
        """
        if not self._given_size:
            self.size = max(xpr.tads[max(xpr.tads)]['end'] * xpr.resolution,
                            self.size)
            self.size   = ChromosomeSize(self.size)
        self.r_size = self.size - len(self.forbidden) * xpr.resolution
        self.r_size = RelativeChromosomeSize(self.size)


[docs]    def visualize(self, name, tad=None, focus=None, paint_tads=False, axe=None,
                  show=True, logarithm=True, normalized=False, relative=True,
                  decorate=True):
        """
        Visualize the matrix of Hi-C interactions.

        :param name: name of the experiment to visualize
        :param None tad: a given TAD in the form:
           ::
           
             {'start': start,
              'end'  : end,
              'brk'  : end,
              'score': score}
              
           **Alternatively** a list of the TADs can be passed (all the TADs
           between the first and last one passed will be showed. Thus, passing
           more than two TADs might be superfluous)
        :param None focus: a tuple with the start and end positions of the 
           region to visualize
        :param False paint_tads: draw a box around the TADs defined for this
           experiment
        :param None axe: an axe object from matplotlib can be passed in order
           to customize the picture
        :param True show: either to pop-up matplotlib image or not
        :param True logarithm: show the logarithm values
        :param True normalized: show the normalized data (weights might have
           been calculated previously). *Note: white rows/columns may appear in
           the matrix displayed; these rows correspond to filtered rows (see*
           :func:`pytadbit.utils.hic_filtering.hic_filtering_for_modelling` *)*
        :param True relative: color scale is relative to the whole matrix of
           data, not only to the region displayed
        :param True decorate: draws color bar, title and axes labels
        """
        xper = self.get_experiment(name)
        if logarithm:
            fun = log2
        else:
            fun = lambda x: x
        size = xper.size
        if normalized and not xper.norm:
            raise Exception('ERROR: weights not calculated for this ' +
                            'experiment. Run Experiment.normalize_hic\n')
        if tad and focus:
            raise Exception('ERROR: only one of "tad" or "focus" might be set')
        start = end = None
        if focus:
            start, end = focus
            if start == 0:
                warn('Hi-C matrix starts at 1, setting starting point to 1.\n')
                start = 1
        elif type(tad) == dict:
            start = int(tad['start'])
            end   = int(tad['end'])
        elif type(tad) == list:
            if type(tad[0]) == dict:
                start = int(sorted(tad,
                                   key=lambda x: int(x['start']))[0 ]['start'])
                end   = int(sorted(tad,
                                   key=lambda x: int(x['end'  ]))[-1]['end'  ])
        if relative:
            if normalized:
                # find minimum, if value is non-zero... for logarithm
                mini = min([i for i in xper.norm[0] if i])
                if mini == int(mini):
                    vmin = min(xper.norm[0])
                else:
                    vmin = mini
                vmin = fun(vmin or (1 if logarithm else 0))
                vmax = fun(max(xper.norm[0]))
            else:
                vmin = fun(min(xper.hic_data[0]) or (1 if logarithm else 0))
                vmax = fun(max(xper.hic_data[0]))
        if not axe:
            plt.figure(figsize=(8, 6))
            axe = plt.subplot(111)
        if tad or focus:
            if start > -1:
                if normalized:
                    matrix = [
                        [xper.norm[0][i+size*j]
                         if (not i in xper._zeros
                             and not j in xper._zeros) else vmin
                         for i in xrange(start - 1, end)]
                        for j in xrange(start - 1, end)]
                else:
                    matrix = [
                        [xper.hic_data[0][i+size*j]
                         for i in xrange(start - 1, end)]
                        for j in xrange(start - 1, end)]
            elif type(tad) is list:
                if normalized:
                    warn('List passed, not going to be normalized.')
                matrix = tad
            else:
                # TODO: something... matrix not declared...
                pass
        else:
            if normalized:
                matrix = [[xper.norm[0][i+size*j]
                           if (not i in xper._zeros
                               and not j in xper._zeros) else vmin
                           for i in xrange(size)]
                          for j in xrange(size)]
            else:
                matrix = [[xper.hic_data[0][i+size*j]\
                           for i in xrange(size)] \
                          for j in xrange(size)]
        if relative:
            img = axe.imshow(fun(matrix), origin='lower', vmin=vmin, vmax=vmax,
                             interpolation="nearest",
                             extent=(int(start or 1) - 0.5,
                                     int(start or 1) + len(matrix) - 0.5,
                                     int(start or 1) - 0.5,
                                     int(start or 1) + len(matrix) - 0.5))
        else:
            img = axe.imshow(fun(matrix), origin='lower',
                             interpolation="nearest",
                             extent=(int(start or 1) - 0.5,
                                     int(start or 1) + len(matrix) - 0.5,
                                     int(start or 1) - 0.5,
                                     int(start or 1) + len(matrix) - 0.5))
        if decorate:
            cbar = axe.figure.colorbar(img)
            cbar.ax.set_ylabel('Log2 Hi-C interactions count')
            axe.set_title(('Chromosome %s experiment %s' +
                           ' %s') % (self.name, xper.name,
                                     'focus: %s-%s' % (start, end) if tad else ''))
            axe.set_xlabel('Genomic bin (resolution: %s)' % (xper.resolution))
            axe.set_ylabel('Genomic bin (resolution: %s)' % (xper.resolution))
        if not paint_tads:            
            axe.set_ylim(int(start or 1) - 0.5,
                         int(start or 1) + len(matrix) - 0.5)
            axe.set_xlim(int(start or 1) - 0.5,
                         int(start or 1) + len(matrix) - 0.5)
            if show:
                plt.show()
            return img
        for i, tad in xper.tads.iteritems():
            if start:
                print int(tad['start']) + 1, start
                print int(tad['end']) + 1, end
                if int(tad['start']) + 1 < start:
                    continue
                if int(tad['end']) + 1 > end:
                    continue
                t_start = int(tad['start']) + 1.5
                t_end   = int(tad['end'])   + 2.5
            else:
                t_start = int(tad['start']) + .5
                t_end   = int(tad['end']) + 1.5
            axe.hlines(t_start, t_start, t_end, colors='k')
            axe.hlines(t_end, t_start, t_end, colors='k')
            axe.vlines(t_start, t_start, t_end, colors='k')
            axe.vlines(t_end, t_start, t_end, colors='k')
            if not i % (len(xper.tads) / 10):
                if i % 2:
                    axe.text(t_start + abs(t_start-t_end)/2,
                             t_end + 1, str(i), va='bottom', ha='center')
                else:
                    axe.text(t_start + abs(t_start-t_end)/2,
                             t_start - 1, str(i), va='top', ha='center')
            if tad['score'] < 0:
                for j in xrange(0, int(t_end) - int(t_start), 2):
                    axe.plot((t_start    , t_start + j),
                             (t_end   - j, t_end      ), color='k')
                    axe.plot((t_end      , t_end   - j),
                             (t_start + j, t_start    ), color='k')
        axe.set_ylim(int(start or 1) - 0.5,
                     int(start or 1) + len(matrix) - 0.5)
        axe.set_xlim(int(start or 1) - 0.5,
                     int(start or 1) + len(matrix) - 0.5)
        if show:
            plt.show()


[docs]    def get_tad_hic(self, tad, x_name, normed=True, matrix_num=0):
        """
        Retrieve the Hi-C data matrix corresponding to a given TAD.
        
        :param tad: a given TAD (:py:class:`dict`)
        :param x_name: name of the experiment
        :param True normed: if True, normalize the Hi-C data
        
        :returns: Hi-C data matrix for the given TAD
        """
        beg, end = int(tad['start']), int(tad['end'])
        xpr = self.get_experiment(x_name)
        size = xpr.size
        matrix = [[0 for _ in xrange(beg, end)]\
                  for _ in xrange(beg, end)]
        for i, tadi in enumerate(xrange(beg, end)):
            tadi = tadi * size
            for j, tadj in enumerate(xrange(beg, end)):
                if normed:
                    matrix[j][i] = xpr.hic_data[matrix_num][tadi + tadj]
                else:
                    matrix[j][i] = xpr.norm[0][tadi + tadj]
        return matrix


[docs]    def iter_tads(self, x_name, normed=True):
        """
        Iterate over the TADs corresponding to the given experiment.
        
        :param x_name: name of the experiment
        :param True normed: normalize Hi-C data returned
        
        :yields: Hi-C data corresponding to each TAD
        """
        if not self.get_experiment(x_name).hic_data:
            raise Exception('No Hi-c data for %s experiment\n' % (x_name))
        for name, ref in self.get_experiment(x_name).tads.iteritems():
            yield name, self.get_tad_hic(ref, x_name, normed=normed)


[docs]    def set_max_tad_size(self, value):
        """
        Change the maximum size allowed for TADs. It also applies to the
        computed experiments.

        :param value: an int value (default is 5000000)
        """
        self.max_tad_size = value
        for xpr in self.experiments:
            for tad in xpr.tads:
                xpr.tads[tad]['brk'] = xpr.tads[tad]['end']
                if ((xpr.tads[tad]['end'] - xpr.tads[tad]['start']) 
                    * xpr.resolution) > self.max_tad_size:
                    xpr.tads[tad]['score'] = -abs(xpr.tads[tad]['score'])
            

    def _find_centromere(self, xpr):
        """
        Search for the centromere in a chromosome, assuming that
        :class:`Chromosome` corresponds to a real chromosome.
        Add a boundary to all the experiments where the centromere is.
         * A centromere is defined as the largest area where the rows/columns
           of the Hi-C matrix are empty.
        """
        beg = end = 0
        size = xpr.size
        try:
            hic = xpr.hic_data[0]
        except TypeError:
            return
        # search for largest empty region of the chromosome
        best = (0, 0, 0)
        pos = 0
        for pos, raw in enumerate(xrange(0, size * size, size)):
            if sum(hic[raw:raw + size]) == 0 and not beg:
                beg = float(pos)
            if sum(hic[raw:raw + size]) != 0 and beg:
                end = float(pos)
                if (end - beg) > best[0]:
                    best = ((end - beg), beg, end)
                beg = end = 0
        # this is for weared cases where centromere is at the end of Hi-C data
        if beg and not end:
            end = float(pos)
            if (end - beg) > best[0]:
                best = ((end - beg), beg, end)
        beg, end = best[1:]
        if not beg or not end:
            return
        tads = xpr.tads
        # if we already have a centromere defined, check if it can be reduced
        if self._centromere:
            if beg > self._centromere[0]:
                # readjust TADs that have been split around the centromere
                for tad in tads:
                    if tads[tad]['end'] == self._centromere[0]:
                        tads[tad]['end'] = beg
                self._centromere[0] = beg
            if end < self._centromere[1]:
                # readjust TADs that have been split around the centromere
                for tad in tads:
                    if tads[tad]['start'] == self._centromere[1]:
                        tads[tad]['start'] = end
                self._centromere[1] = end
        else:
            self._centromere = [beg, end]
        # split TADs overlapping  with the centromere
        if [True for t in tads.values() \
            if t['start'] < beg < t['end'] \
            and t['start'] < end < t['end']]:
            tad  = len(tads) + 1
            plus = 0
            while tad + plus > 1:
                start = tads[tad - 1 + plus]['start']
                final = tads[tad - 1 + plus]['end']
                # centromere found?
                if start < beg < final and start < end < final:
                    tads[tad] = copy(tads[tad - 1])
                    tads[tad]['start'] = end
                    tads[tad]['score'] = abs(tads[tad]['score'])
                    if (tads[tad]['end'] - tads[tad]['start']) \
                           * xpr.resolution > self.max_tad_size:
                        xpr.tads[tad]['score'] = -abs(xpr.tads[tad]['score'])
                    tads[tad]['brk'] = tads[tad]['end']
                    tad -= 1
                    tads[tad] = copy(tads[tad])
                    tads[tad]['score'] = abs(tads[tad]['score'])
                    tads[tad]['end'] = beg
                    if (tads[tad]['end'] - tads[tad]['start']) \
                           * xpr.resolution > self.max_tad_size:
                        xpr.tads[tad]['score'] = -abs(xpr.tads[tad]['score'])
                    tads[tad]['brk'] = tads[tad]['end']
                    plus = 1
                else:
                    tads[tad] = copy(tads[tad - 1 + plus])
                tad -= 1
        # if tad includes centromere but starts in the same point
        elif [True for t in tads.values() \
              if t['start'] == beg and end < t['end']]:
            tad  = len(tads) + 1
            plus = 0
            while tad + plus > 1:
                start = tads[tad - 1 + plus]['start']
                final = tads[tad - 1 + plus]['end']
                # centromere found?
                if start == beg:
                    tads[tad] = copy(tads[tad - 1])
                    tads[tad]['start'] = end
                    tads[tad]['score'] = abs(tads[tad]['score'])
                    if (tads[tad]['end'] - tads[tad]['start']) \
                           * xpr.resolution > self.max_tad_size:
                        xpr.tads[tad]['score'] = -abs(xpr.tads[tad]['score'])
                    tads[tad]['brk'] = tads[tad]['end']
                    tad -= 1
                    tads[tad] = copy(tads[tad])
                    tads[tad]['score'] = abs(tads[tad]['score'])
                    tads[tad]['end'] = end - 1
                    if (tads[tad]['end'] - tads[tad]['start']) \
                           * xpr.resolution > self.max_tad_size:
                        xpr.tads[tad]['score'] = -abs(xpr.tads[tad]['score'])
                    tads[tad]['brk'] = tads[tad]['end']
                    plus = 1
                else:
                    tads[tad] = copy(tads[tad - 1 + plus])
                tad -= 1
        # if tad includes centromere but ends in the same point
        elif [True for t in tads.values() \
              if t['end'] == end and beg > t['start']]:
            tad  = len(tads) + 1
            plus = 0
            while tad + plus > 1:
                start = tads[tad - 1 + plus]['start']
                final = tads[tad - 1 + plus]['end']
                # centromere found?
                if final == end:
                    tads[tad] = copy(tads[tad - 1])
                    tads[tad]['start'] = beg
                    tads[tad]['score'] = abs(tads[tad]['score'])
                    if (tads[tad]['end'] - tads[tad]['start']) \
                           * xpr.resolution > self.max_tad_size:
                        xpr.tads[tad]['score'] = -abs(xpr.tads[tad]['score'])
                    tads[tad]['brk'] = tads[tad]['end']
                    tad -= 1
                    tads[tad] = copy(tads[tad])
                    tads[tad]['score'] = abs(tads[tad]['score'])
                    tads[tad]['end'] = beg
                    if (tads[tad]['end'] - tads[tad]['start']) \
                           * xpr.resolution > self.max_tad_size:
                        xpr.tads[tad]['score'] = -abs(xpr.tads[tad]['score'])
                    tads[tad]['brk'] = tads[tad]['end']
                    plus = 1
                else:
                    tads[tad] = copy(tads[tad - 1 + plus])
                tad -= 1



[docs]class ExperimentList(list):
    """
    Inherited from python built in :py:func:`list`, modified for tadbit
    :class:`pytadbit.Experiment`.
    
    Mainly, `getitem`, `setitem`, and `append` were modified in order to
    be able to search for experiments by index or by name, and to add 
    experiments simply using Chromosome.experiments.append(Experiment).

    The whole ExperimentList object is linked to a Chromosome instance
    (:class:`pytadbit.Chromosome`).

    """
    def __init__(self, thing, crm):
        super(ExperimentList, self).__init__(thing)
        self.crm = crm
        

    def __getitem__(self, i):
        try:
            return super(ExperimentList, self).__getitem__(i)
        except TypeError:
            for nam in self:
                if nam.name == i:
                    return nam
            raise KeyError('Experiment %s not found\n' % (i))


    def __setitem__(self, i, exp):
        try:
            super(ExperimentList, self).__setitem__(i, exp)
            exp.crm = self.crm
            if self.crm._search_centromere:
                self.crm._get_forbidden_region(exp)
        except TypeError:
            for j, nam in enumerate(self):
                if nam.name == i:
                    exp.crm = self.crm
                    self[j] = exp
                    if self.crm._search_centromere:
                        self.crm._get_forbidden_region(exp)
                    break
            else:
                exp.crm = self.crm
                self.append(exp)
                if self.crm._search_centromere:
                    self.crm._get_forbidden_region(exp)


    def __delitem__(self, i):
        try:
            super(ExperimentList, self).__delitem__(i)
        except TypeError:
            for j, nam in enumerate(self):
                if nam.name == i:
                    exp = self.pop(j)
                    del(exp)
                    break
            else:
                raise KeyError('Experiment %s not found\n' % (i))


    def append(self, exp):
        if exp.name in [e.name for e in self]:
            self[exp.name] = exp
            if self.crm._search_centromere:
                self.crm._get_forbidden_region(exp)
        else:
            super(ExperimentList, self).append(exp)
            if self.crm._search_centromere:
                self.crm._get_forbidden_region(exp)
            exp.crm = self.crm


class AlignmentDict(dict):
    """
    :py:func:`dict` of :class:`pytadbit.Alignment`
    
    Modified getitem, setitem, and append in order to be able to search
    alignments by index or by name.

    linked to a :class:`pytadbit.Chromosome`
    """

    def __getitem__(self, nam):
        try:
            return super(AlignmentDict, self).__getitem__(nam)
        except KeyError:
            for i, key in enumerate(self):
                if nam == i:
                    return self[key]
            raise TypeError('Alignment %s not found\n' % (i))


class ChromosomeSize(int):
    """
    This is an integer.
    
    Chromosome size in base pairs
    """
    def __init__(self, thing):
        super(ChromosomeSize, self).__init__(thing)


class RelativeChromosomeSize(int):
    """
    This is an integer.
    
    Relative Chromosome size in base pairs. Equal to Chromosome size minus
    forbidden regions (eg: the centromere)
    """
    def __init__(self, thing):
        super(RelativeChromosomeSize, self).__init__(thing)
Navigation

Source code for pytadbit.chromosome

Table Of Contents

Quick search

Navigation