
Source code for msdas.replicates

# -*- python -*-
#
#  This file is part of MS-DAS software
#
#  Copyright (c) 2014 - EBI-EMBL
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#
##############################################################################
import pandas as pd
import numpy as np
import pylab
from matplotlib import colors


from msdas import readers

__all__ = ["ReplicatesYeast", "Replicates"]



class Replicates(readers.MassSpecReader):
    r"""Investigate the replicates in the raw data sets

    The purpose of this class is to analyse a full data set with replicates,
    to remove rows that are meaningless, to compute mean/sigma on replicates
    and to identify rows that have large errors.

    Replicates are identified as follows. If the input is a CSV file, columns
    with the same name are considered replicates of each other. Internally, in
    the dataframe, an index is appended. The first replicate does not have an
    index appended though. For instance, 3 columns called **X1** will be
    renamed internally as::

        X1  X1.1  X1.2

    If you save the dataframe into a CSV file, this convention is kept and the
    names are preserved.

    ::

        from msdas import *
        r = ReplicatesYeast(get_yeast_raw_data(), verbose=True)

    Since Replicates inherits from :class:`msdas.readers.MassSpecReader`, some
    functionalities can be re-used such as plotting, selection of data,
    annotations...

    Here are some notations used in the documentation.

    * :math:`i` represents a row that corresponds to a combination of protein
      name and psites.
    * There is possibly more than 1 experiment; their number is :math:`E` and
      they are indexed with the letter :math:`e`.
    * There are :math:`R` replicates for each measurement. A replicate is
      indexed with :math:`r`.

    Using the notations above, one datum in the dataframe can be denoted as
    :math:`X_{i,e,r}`. We denote :math:`\mu` the mean and :math:`\sigma` the
    standard deviation.

    * :math:`\mu_{i,e} = E(X_{i,e,r})_r` is the mean over replicates and is an
      NxE matrix.
    * If you then want to average the resulting mean over experiments (to get
      the **grand mean**), we denote it as :math:`\mu_i = E(\mu_{i,e})_e`.
      This is equivalent to :math:`\mu_i = E(X_{i,e,r})_{e,r}`. Note, however,
      that for the standard deviation the distinction and the order of the
      indices matter.
    * :math:`\sigma_i = \sqrt{V(X_{i,e,r})_{r,e}}` is an N-length vector of
      standard deviations, one for each combination of protein and psite.
    * :math:`\sigma_{i,e} = \sqrt{V(X_{i,e,r})_r}` is an NxE matrix of
      standard deviations.

    There are quite a few methods to extract this information either as a full
    dataframe or as a set of dataframes inside a dictionary. For instance,
    :meth:`get_mus` returns the mean over replicates for each unique
    measurement as a dictionary.
    """
    def __init__(self, data, verbose=False, cleanup=True):
        """.. rubric:: constructor

        :param data: a valid input to :class:`msdas.readers.MassSpecReader`
        :param cleanup: see :class:`msdas.readers.MassSpecReader` for details.

        """
        super(Replicates, self).__init__(data, verbose=verbose,
            cleanup=cleanup, merge_peptides=False)

        # hack. should be in readers but could not find where.
        self.df.fillna(np.nan, inplace=True)

        if "count_phospho" in self.df.columns:
            del self.df['count_phospho']

        # save annotations
        self._rawdf = self.df.copy()

    def average(self):
        """Average the data over replicates and replace the dataframe inplace"""
        mu = pd.DataFrame(self.get_mus())
        self.df[mu.columns] = mu
        self._drop_replicates()

    def boxplot_replicates_by_experiment(self, tag, indices=None, fontsize=8):
        """BoxPlot of the replicates for a given experiment

        :param str tag: a valid measurement name to look for replicates.
        :param indices: list of indices to look at. If the data set is large,
            you will want to use this parameter. Note that indices is not a
            range, i.e., range(0,20) may not work because index 10 may not be
            present.
        :param fontsize: fontsize used for the x-axis

        .. plot::
            :include-source:
            :width: 80%

            from msdas import *
            r = replicates.ReplicatesYeast(get_yeast_raw_data())
            r.boxplot_replicates_by_experiment("a0_t0", indices=r.df.index[0:20])

        """
        if tag not in self.get_unique_measurement_name():
            raise ValueError("invalid tag. call get_unique_measurement_name()")
        if indices is None:
            indices = self.df.index
        if len(indices) > 200:
            self.logging.info("boxplot with more than 200 variables will be slow. You can stop the plotting pressing CTRL+C")

        df = self.get_replicates_from_one_unique_measurement(tag)
        # transpose to make protein the variable (column)
        df = df.ix[indices].transpose()
        # df.columns is equivalent to indices in principle
        # does not work; use xticks instead: df.columns = self.df.ix[df.columns].Identifier.values
        colnames = self.df.ix[df.columns].Identifier.values
        pylab.clf()
        self.dfdf = df
        df.boxplot(rot=90, return_type='axes')
        pylab.semilogy()
        pylab.xticks(range(0, len(indices)), colnames, rotation=90, fontsize=fontsize)
        pylab.tight_layout()
        return df

    def copy(self):
        """Return a copy of the Replicates instance."""
        r = Replicates(self)
        return r

    def get_unique_measurement_name(self):
        """Returns unique column names without replicates.

        If there are replicates, they have the same column name in the CSV
        file but, in the dataframe, they are encoded differently with an
        appended suffix. For instance, three replicates of the column *t0*
        are renamed *t0*, *t0.1*, *t0.2*. This method returns all unique
        names before the dot character.

        For instance, with a dataframe that contains the following columns:
        *t0*, *t0.1*, *t0.2*, *t5*, *t5.1*, *t5.2*, this method returns a list
        that contains the *t0* and *t5* strings.

        .. note:: names are sorted alphabetically

        .. seealso:: :meth:`get_replicates_from_one_unique_measurement`
        """
        return sorted(list(set([x.split(".")[0] for x in self.measurements.columns])))

    def get_replicates_from_one_unique_measurement(self, name):
        """Given a unique name, returns a dataframe with only its replicates

        .. seealso:: :meth:`get_unique_measurement_name`
        """
        if name not in self.get_unique_measurement_name():
            raise ValueError("invalid tag")
        else:
            names = [c for c in self.df.columns if c.split(".")[0] == name]
            return self.df[names]

    def get_mu_df(self):
        """Version of :meth:`get_mus` returning a dataframe"""
        mu = pd.DataFrame(self.get_mus())
        mu = mu[[x for x in self.measurements.columns if "." not in x]]
        return mu

    def get_mus(self):
        r"""Return mean of replicates as a dictionary

        Mathematically, it returns an NxE matrix computed as follows:

        .. math::

            \mu_{i,e} = E(X_{i,e,r})_r = \frac{1}{R} \sum_{r=1}^{R} X_{i,e,r}

        :return: a dictionary. Keys are the E unique measurement names.
            Values are vectors :math:`\mu_{i,e}` (one per measurement) of
            length N protein/psites.
        """
        tags = self.get_unique_measurement_name()
        mus = {}
        for tag in tags:
            mu = self.get_replicates_from_one_unique_measurement(tag).mean(axis=1)
            mus[tag] = mu.copy()
        return mus

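    # A small sketch of the returned structure (illustrative only, assuming the
    # yeast naming convention used elsewhere in this module):
    #
    #     mus = r.get_mus()
    #     sorted(mus.keys())   # e.g. ['a0_t0', 'a0_t1', ..., 'a45_t45']
    #     mus['a0_t0']         # pandas Series of length N (one mean per protein/psite)
    #
    # :meth:`get_mu_df` above simply wraps this dictionary into an N x E dataframe.
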
    def get_sigmas(self):
        r"""Return standard deviation of replicates as a dictionary

        Mathematically, it returns an NxE matrix computed as follows:

        .. math::

            \sigma_{i,e} = \sqrt{V(X_{i,e,r})_r}

        :return: a dictionary. Keys are the E unique measurement names.
            Values are vectors :math:`\sigma_{i,e}` (one per measurement) of
            length N protein/psites.
        """
        tags = self.get_unique_measurement_name()
        sigmas = {}
        for tag in tags:
            sigma = self.get_replicates_from_one_unique_measurement(tag).std(axis=1)
            sigmas[tag] = sigma.copy()
        return sigmas

    def get_mean_errors_df(self):
        """Returns dataframe with errors averaged over replicates (dataframe version)

        This takes the results of :meth:`get_errors`, computes the average for
        each key/value and builds up a dataframe.

        :return: dataframe with averaged errors.

        .. seealso:: :meth:`get_errors`
        """
        errors = self.get_errors()
        errors = dict([(tag, errors[tag].mean(axis=1)) for tag in errors.keys()])
        errors = pd.DataFrame(errors)
        return errors

    def get_errors(self):
        r"""Returns the errors on the replicates (dictionary version)

        Errors are computed as follows:

        .. math::

            \epsilon_{i,e,r} = \frac{\left| X_{i,e,r} - \mu_{i,e} \right|}{\mu_{i,e}}

        with

        .. math::

            \mu_{i,e} = \frac{1}{R} \sum_{r=1}^{R} X_{i,e,r}

        In the dictionary, keys correspond to the name of a measurement as
        returned by :meth:`get_unique_measurement_name`. Values are dataframes
        with the errors (expressed in percent) for each replicate.
        """
        errors = {}
        mus = self.get_mus()
        tags = self.get_unique_measurement_name()
        for time in tags:
            data = self.get_replicates_from_one_unique_measurement(time)
            errors[time] = np.abs(data.sub(mus[time], axis="index")).divide(mus[time], axis="index") * 100
        return errors

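    # Worked example of the formula above: for a toy row with three replicates
    # valued 8, 10 and 12 in one measurement, mu = 10 and the returned errors
    # (in percent) are |8-10|/10*100 = 20, |10-10|/10*100 = 0 and 20.
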
    def set_irrelevant_replicates_to_na(self):
        """Set measurements with at most one valid replicate to NA

        If an experiment has no replicates (0 or 1), you may want to set the
        experiment to NA. This is not relevant if there are zero replicates,
        since the values may already be NAs, but it makes sense when there is
        only one replicate, for which no error can be estimated.

        .. plot::
            :include-source:
            :width: 70%

            from msdas import *
            r = ReplicatesYeast(get_yeast_raw_data(), verbose=False)
            r.set_irrelevant_replicates_to_na()
            r.hist_na_per_experiments(color="r", alpha=0.5)
            r.reset()
            r.hist_na_per_experiments(color="g", alpha=0.5)

        """
        tags = self.get_unique_measurement_name()
        for tag in tags:
            df = self.get_replicates_from_one_unique_measurement(tag)
            indices = pd.notnull(df).sum(axis=1) <= 1
            indices = [k for k, v in indices.iteritems() if v]
            colnames = [c for c in self.df.columns if c.split(".")[0] == tag]
            self.df.ix[indices, colnames] = np.nan

    def _drop_replicates(self):
        """Remove replicates from the dataframe"""
        todrop = [x for x in self.measurements.columns if "." in x]
        self.df.drop(todrop, inplace=True, axis=1)

    def get_average_mu(self):
        r"""Return grand mean

        :return: a vector with the grand mean over protein/psites

        The grand mean is the mean over replicates and experiments, that is:

        .. math::

            \mu_i = E(X_{i,e,r})_{e,r}

        This is equivalent to taking the mean over the experiments of the
        output of :meth:`get_mu_df`.

        .. warning:: may contain NAs
        """
        mu = self.get_mu_df().mean(axis=1)
        return mu

    def get_average_sigma(self):
        r"""Average over experiments of the individual sigmas computed on replicates

        :return: a vector with the average sigma computed for each
            protein/psite. For instance, if you have 3 replicates (R=3) on 5
            psites (N=5) for E=10 experiments, you get 5 values (each being the
            average of 10 sigmas computed on 3 replicates).

        .. math::

            \bar{\sigma}_{i} = E( \sigma_{i,e})_e

        .. note:: this is not the standard deviation over experiments and
            replicates :math:`\sigma_i = \sqrt{V( X_{i,e,r})_{e,r}}` but the
            average of the standard deviations over replicates. See figure
            below.

        .. plot::
            :include-source:
            :width: 70%

            >>> import pylab
            >>> from msdas import *
            >>> r = replicates.ReplicatesYeast(get_yeast_raw_data())
            >>> x = pylab.linspace(1e8, 1e9, 100)
            >>> r.get_average_sigma().hist(bins=x, alpha=0.5, normed=True, label=r"$\bar{\sigma}_i$")
            >>> r.df.std(axis=1).hist(alpha=0.5, bins=x, normed=True, label=r"$\sigma_{i}$")
            >>> pylab.legend()

        .. warning:: may contain NAs
        """
        tags = self.get_unique_measurement_name()
        sigma = pd.concat([self.get_sigmas()[tag] for tag in tags], axis=1).mean(axis=1)
        return sigma

    def hist_na_per_experiments(self, bins=None, **kargs):
        """Histogram of the number of NAs per row

        :param kargs: optional parameters accepted by the matplotlib hist function.
        :param bins: binning of the histogram

        .. plot::
            :include-source:
            :width: 80%

            from msdas import *
            from easydev import gsf
            filename = gsf("msdas", "data", "YEAST_raw_sample.csv")
            r = Replicates(filename, verbose=False)
            r.hist_na_per_experiments()

        """
        if bins is None:
            bins = len(self.measurements.columns)
        data = self.get_na_count()
        data.hist(bins=bins, **kargs)
        pylab.xticks([x + 0.5 for x in range(0, bins)], self.measurements.columns,
                     rotation=90)

    def hist_coefficient_variation(self, merge=True, tags=None, **kargs):
        r"""Plots histogram of the coefficient of variation

        That is, the quantity:

        .. math::

            \frac{\sigma_{i,e}}{\mu_{i,e}}

        .. plot::
            :include-source:
            :width: 70%

            import pylab
            import scipy.stats
            from msdas import *
            r = replicates.ReplicatesYeast(get_yeast_raw_data())
            r.hist_coefficient_variation(normed=True)
            x = pylab.linspace(0, 1, 200)
            pylab.plot(x, scipy.stats.gamma.pdf(x, 2, scale=0.068), lw=2, color='r')
            pylab.xlim([0, 1])

        """
        kargs['bins'] = kargs.get('bins', 100)
        if tags is None:
            tags = self.get_unique_measurement_name()
        cv = self.get_coefficient_variation()
        if merge == False:
            for i, key in enumerate(tags):
                pylab.hist(cv[key].dropna(), label=key, **kargs)
            pylab.legend()
        else:
            coeffs = []
            for i, key in enumerate(tags):
                coeffs.extend(list(cv[key].dropna()))
            pylab.hist(coeffs, **kargs)
        pylab.grid()
        pylab.xlabel("Coefficient of Variation")
        pylab.ylabel("#")
        return coeffs

    def hist2d_mu_versus_cv(self, bins=100, cmap="hot_r", fontsize=10, Nlevels=4,
                            contour=True, **kargs):
        """Plots a 2D histogram of the mean across replicates versus the
        coefficient of variation

        :param int bins: binning for the 2D histogram
        :param fontsize: fontsize for the labels
        :param contour: show some contours
        :param int Nlevels: must be more than 2

        .. plot::
            :include-source:
            :width: 50%

            >>> from msdas import *
            >>> r = replicates.ReplicatesYeast(get_yeast_raw_data())
            >>> r.drop_na_count(54) # to speed up the plot creation
            >>> r.hist2d_mu_versus_cv()

        """
        cv = self.get_coefficient_variation_df().unstack()
        mu = self.get_mu_df().unstack()
        tokeep = (pd.isnull(cv) == False) & (pd.isnull(mu) == False)
        if len(cv) > 10000:
            self.logging.info("Computing 2D histogram. Please wait")
        pylab.clf()
        res = pylab.hist2d(cv[tokeep] * 100, pylab.log10(mu[tokeep]), bins=bins,
                           cmap=cmap, norm=colors.LogNorm())
        pylab.colorbar()
        X, Y = pylab.meshgrid(res[1][0:bins], res[2][0:bins])
        if contour:
            levels = [round(x) for x in pylab.logspace(0, pylab.log10(res[0].max().max()), Nlevels)]
            pylab.contour(X, Y, res[0].transpose(), levels[2:], color="g")
            #pylab.clabel(C, fontsize=fontsize, inline=1)
        pylab.grid(True)
        pylab.xlabel("Coefficient of Variation (percent)", fontsize=fontsize)
        pylab.ylabel("Average (log10)", fontsize=fontsize)
        return res

    def hist2d_errors_versus_cv(self, bins=100, cmap="hot_r", fontsize=10, Nlevels=4,
                                contour=True, **kargs):
        r"""Plots a 2D histogram of the mean of the errors across replicates and
        experiments versus the coefficient of variation

        :param int bins: binning for the 2D histogram
        :param fontsize: fontsize for the labels
        :param contour: show some contours
        :param int Nlevels: must be more than 2

        .. plot::
            :include-source:
            :width: 50%

            >>> from msdas import *
            >>> r = replicates.ReplicatesYeast(get_yeast_raw_data())
            >>> r.drop_na_count(54) # to speed up the plot creation
            >>> r.hist2d_errors_versus_cv()

        """
        cv = self.get_coefficient_variation_df().unstack()
        grand_mu_errors = self.get_mean_errors_df().unstack()
        tokeep = (pd.isnull(cv) == False) & (pd.isnull(grand_mu_errors) == False)
        if len(cv) > 10000:
            self.logging.info("Computing 2D histogram. Please wait")
        pylab.clf()
        res = pylab.hist2d(grand_mu_errors[tokeep], cv[tokeep] * 100, bins=bins,
                           cmap=cmap, norm=colors.LogNorm())
        pylab.colorbar()
        X, Y = pylab.meshgrid(res[1][0:bins], res[2][0:bins])
        if contour:
            levels = [round(x) for x in pylab.logspace(0, pylab.log10(res[0].max().max()), Nlevels)]
            pylab.contour(X, Y, res[0].transpose(), levels[2:], color="g")
            #pylab.clabel(C, fontsize=fontsize, inline=1)
        pylab.xlabel(r"mean of errors, $E\left[ \epsilon_{i,e,r} \right]_{e,r}$", fontsize=fontsize)
        pylab.ylabel(r"Coefficient of Variation, $\frac{\sigma_{i,e}}{\mu_{i,e}}$", fontsize=fontsize)
        pylab.grid(True)
        return res

    def get_coefficient_variation_df(self):
        r"""Return dataframe containing the coefficient of variation

        .. math::

            CV_{i,e} = \frac{\sigma_{i,e}}{\mu_{i,e}}

        """
        cv = pd.DataFrame(self.get_coefficient_variation())
        cv = cv[[x for x in self.measurements.columns if "." not in x]]
        return cv

    def get_coefficient_variation(self):
        r"""Return dictionary containing the coefficient of variation for each measurement

        .. math::

            CV_{i,e} = \frac{\sigma_{i,e}}{\mu_{i,e}}

        In the dictionary, keys correspond to the name of a measurement as
        returned by :meth:`get_unique_measurement_name`. Values are time
        series with the coefficients of variation.
        """
        tags = self.get_unique_measurement_name()
        mus = self.get_mus()
        sigmas = self.get_sigmas()
        coeffs = {}
        for i, time in enumerate(tags):
            coeffs[time] = sigmas[time] / mus[time]
        return coeffs

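    # Worked example: for the toy row of replicates 8, 10 and 12 used above,
    # sigma = std([8, 10, 12], ddof=1) = 2 and mu = 10, so the coefficient of
    # variation is 2/10 = 0.2.
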
    def get_standarised_residuals(self):
        r"""Returns dictionary (measurement names being the keys) with the
        standardised residual errors

        .. math::

            \epsilon_{i,e,r} = \frac{\left| X_{i,e,r} - \mu_{i,e} \right|}{\sigma_{i,e}}

        """
        tags = self.get_unique_measurement_name()
        errors = {}
        mus = self.get_mus()
        sigmas = self.get_sigmas()
        for i, tag in enumerate(tags):
            df = self.get_replicates_from_one_unique_measurement(tag)
            errors[tag] = np.abs(df.sub(mus[tag], axis="index")).divide(sigmas[tag], axis="index")
        return errors

    def _get_na_count_per_experiment(self):
        nas = {}
        for tag in self.get_unique_measurement_name():
            df = self.get_replicates_from_one_unique_measurement(tag)
            R = len(df.columns)
            na = R - pd.notnull(df).sum(axis=1)
            nas[tag] = na.copy()
        return nas

    def pcolor_na(self, sort_index=False, noxticks=True, cmap="jet"):
        """Plots the number of NAs versus experiment and protein

        :param sort_index: sort the rows by increasing number of NAs
        :param noxticks: if False, show the protein names on the x-axis; can be
            slow if you have thousands of protein names
        :return: dataframe used, with NAs per row and experiment

        .. plot::
            :include-source:
            :width: 70%

            from msdas import *
            from easydev import gsf
            filename = gsf("msdas", "data", "YEAST_raw_sample.csv")
            r = Replicates(filename, verbose=False)
            r.pcolor_na()

        """
        df = pd.DataFrame(self._get_na_count_per_experiment()).transpose()
        df.columns = self.df.Identifier
        if sort_index:
            dfna = self.get_na_count()
            dfna.sort(axis=1)
            df = df[dfna.index]
            xtickslabel = self.metadata.ix[dfna.index].Protein + "_" + self.metadata.Psite
        else:
            xtickslabel = self.metadata.Protein + "_" + self.metadata.Psite
        pylab.clf()
        # pcolormesh is at least 15 times faster than pcolor but is not in
        # pandas, so convert the data to a matrix first
        pylab.pcolormesh(df.as_matrix(), cmap=cmap)
        pylab.yticks([0.5 + x for x in range(0, len(df.index))], df.index)
        if noxticks == False:
            pylab.xticks(range(0, len(self.metadata)), xtickslabel, rotation=90)
        pylab.colorbar()
        pylab.title("Number of NA")
        pylab.ylim([0, len(df.index)])
        pylab.xlim([0, len(self.metadata)])
        return df

    def plot_na_per_experiment(self, percent=False):
        """Plots a curve showing the number of NAs per experiment

        .. plot::
            :include-source:
            :width: 70%

            from msdas import *
            from easydev import gsf
            filename = gsf("msdas", "data", "YEAST_raw_sample.csv")
            r = Replicates(filename, verbose=False)
            r.plot_na_per_experiment()

        """
        N = len(self.df)
        pylab.clf()
        if percent == False:
            count = pd.isnull(self.measurements).sum(axis=0)
            count.plot()
            pylab.ylabel("Number of NAs per experiment")
            pylab.ylim([0, N])
        else:
            count = pd.isnull(self.measurements).sum(axis=0)
            count /= float(N)
            count *= 100
            count.plot()
            pylab.ylabel("Number of NAs (percentage) per experiment")
            pylab.ylim([0, 101])
        pylab.xticks(range(0, self.N), self.measurements.columns, rotation=90)
        pylab.tight_layout()

    def plot_mu_sigma(self, tag_list=None, histograms=True, loglog=True,
                      fontsize_legend=10):
        """Plots the mean versus the coefficient of variation as a scatter plot
        with histograms

        :param list tag_list: list of measurements to look at (defaults to all)

        .. plot::
            :include-source:
            :width: 90%

            from msdas import *
            r = replicates.ReplicatesYeast(get_yeast_raw_data())
            r.plot_mu_sigma(tag_list=["a0_t0", "a0_t45"], loglog=True, histograms=True)

        """
        mus = self.get_mus()
        sigmas = self.get_coefficient_variation()
        if tag_list is None:
            tag_list = self.get_unique_measurement_name()

        f = pylab.figure()
        f.clf()
        if histograms is True:
            a1 = f.add_axes([.12, .12, .5, .5])
            a_top = f.add_axes([.12, .7, .5, .2])
            a_right = f.add_axes([.7, .12, .2, .5])
        else:
            a1 = pylab.axes()

        for time in tag_list:
            pylab.sca(a1)
            if loglog:
                a1.loglog(mus[time], sigmas[time], 'o', alpha=0.5, label=time)
            else:
                a1.plot(mus[time], sigmas[time], 'o', alpha=0.5, label=time)
            pylab.grid(True)
            if histograms:
                pylab.sca(a_top)
                if loglog:
                    pylab.log10(mus[time]).hist(alpha=0.5, bins=20)
                    a = pylab.gca()
                    a.set_xlim(pylab.log10(a1.get_xlim()))
                else:
                    mus[time].hist(alpha=0.5, bins=20)
                    a = pylab.gca()
                    a.set_xlim(a1.get_xlim())
                a.grid(True)

                pylab.sca(a_right)
                if loglog:
                    pylab.log10(sigmas[time]).hist(alpha=0.5, bins=20,
                                                   orientation="horizontal")
                    a = pylab.gca()
                    a.set_ylim(pylab.log10(a1.get_ylim()))
                else:
                    sigmas[time].hist(alpha=0.5, orientation="horizontal",
                                      bins=20, xrot=True)
                    a = pylab.gca()
                    a.set_ylim(a1.get_ylim())
                a.grid(True)

        #pylab.legend(loc="upper left", fontsize=fontsize_legend)
        #pylab.tight_layout()
        pylab.xlabel("mu")
        pylab.ylabel("sigma")
        pylab.legend(tag_list, fontsize=fontsize_legend, loc="lower right")

    def reset(self):
        """Reset the dataframe to its original contents.

        Since loading a large data set takes some time, this can be useful for
        debugging if you cut the dataframe using, for instance,
        :meth:`drop_na_count`.
        """
        self.df = self._rawdf.copy()
        #self.metadata = self._metadata.copy()

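# Usage sketch of the typical Replicates workflow described above; this block
# is illustrative only and assumes the YEAST_raw_sample.csv file shipped with
# msdas, as used in the docstring examples:
#
#     from msdas import *
#     from easydev import gsf
#     r = Replicates(gsf("msdas", "data", "YEAST_raw_sample.csv"), verbose=False)
#     r.set_irrelevant_replicates_to_na()    # measurements with <= 1 replicate -> NA
#     errors = r.get_mean_errors_df()        # mean error (percent) per row/measurement
#     cv = r.get_coefficient_variation_df()  # sigma/mu per row/measurement
#     r.average()                            # replace replicates by their means
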
class ReplicatesYeast(Replicates):
    """This class is a specialisation of :class:`Replicates` dedicated to the YEAST data

    The data set comprises 36 experiments that are combinations of alpha
    hormone and salt treatments at different times (0, 1, 5, 10, 20, 45).
    """
    #: times
    times = [0, 1, 5, 10, 20, 45]
    #: input tags to be found
    yeast_tags = ["Ma0s04", "Ma1s04", "Ma5s04", "Ma10s04", "Ma20s04", "Ma45s04"]

    def __init__(self, data, verbose=False, cleanup=True):
        """.. rubric:: constructor

        :param data: a valid input to :class:`msdas.readers.MassSpecReader`
        :param cleanup: see :class:`msdas.readers.MassSpecReader` for details.

        """
        super(ReplicatesYeast, self).__init__(data, verbose=verbose, cleanup=cleanup)
        # renaming columns
        for i, tag in enumerate(self.yeast_tags):
            self.df.columns = [c if c.startswith(tag) == False else
                "a" + str(self.times[i]) + "_t" + c.split("_")[1]
                for c in self.df.columns]
        self._rawdf = self.df.copy()

    def copy(self):
        """Return a copy of the ReplicatesYeast instance."""
        r = ReplicatesYeast(self)
        return r

    def normalise(self):
        """Normalise each measurement column by its TIC value read from the
        YEAST_TIC.csv data file."""
        from easydev import gsf
        self.tic = pd.read_csv(gsf("msdas", "data", "YEAST_TIC.csv"))
        # FIXME we assume that replicates are sorted and rename them to
        # differentiate their names and to agree with the data. They should
        # therefore be ordered in the same way.
        self.tic.columns = ['name', 'TIC', 'dummy']

        def func_rename(x, i):
            if i % 3 == 0:
                return x
            elif i % 3 == 1:
                return x + ".1"
            elif i % 3 == 2:
                return x + ".2"
        # rename to add the replicate index
        self.tic['name'] = [func_rename(x, i) for i, x in enumerate(self.tic.name)]

        # rename to turn e.g. Ma10s04_20 into the a10_t20 convention
        def func_rename2(x):
            res = x.split(".")
            name = res[0]
            if len(res) == 1:
                replicate = None
            else:
                replicate = res[1]
            hormone, salt = name.split("_")
            hormone = hormone.replace("Ma", "")
            hormone = hormone.replace("s04", "")
            if replicate:
                return "a" + hormone + "_t" + salt + "." + replicate
            else:
                return "a" + hormone + "_t" + salt
        self.tic['name'] = [func_rename2(x) for x in self.tic['name']]

        # we do not need this information anymore
        self.tic.drop(['dummy'], axis=1, inplace=True)
        # let us have the names as column names to be consistent with the data
        self.tic = self.tic.set_index("name").transpose()
        # let us reorder the columns similarly to the data
        self.tic = self.tic[self.measurements.columns]
        # FIXME there is probably a way to divide the entire matrix by the TIC
        # row but could not figure it out, so let us do it column by column
        for col in self.measurements.columns:
            self.df[col] /= self.tic[col].values[0]

    # CLAUDIA this is an attempt to build a scaling function using the scale
    # function from clustering.py as a template
    def scale(self):
        """Scale each measurement column to zero mean and unit variance."""
        # the scale function uses the unbiased variance
        for col in self.measurements.columns:
            X = self.df[col]
            self.df[col] = (X - X.mean()) / X.std(ddof=1)

    # CLAUDIA this is an attempt to build a log2 function
    def log_b2(self):
        """Apply a log2 transformation to all measurement columns."""
        X = self.df[self.measurements.columns]
        self.df[self.measurements.columns] = np.log2(X)

    def plot_timeseries_midas(self, identifier):
        """Plots the averaged time series for one identifier."""
        measurements = [u'a0_t0', u'a0_t1', u'a0_t5', u'a0_t10', u'a0_t20', u'a0_t45',
                        u'a1_t1', u'a5_t5', u'a10_t10', u'a20_t20', u'a45_t45',
                        u'a1_t0', u'a5_t0', u'a10_t0', u'a20_t0', u'a45_t0']
        l = [0, 1, 5, 10, 20, 45, 46, 50, 55, 65, 90, 91, 95, 100, 120, 145]
        data = self.get_mu_df()[measurements]
        index = self.df[self.df.Identifier == identifier].index[0]
        data = data.ix[index]
        print(data)
        pylab.plot(l, data, 'ro-')

    def get_na_per_experiment_after_averaging(self):
        """Average the data and return the number of NAs per alpha experiment"""
        mu = self.get_mu_df()
        na = pd.DataFrame()
        for exp in ['a0', 'a1', 'a5', 'a10', 'a20', 'a45']:
            sum_na = pd.isnull(mu[[x for x in mu.columns if x.startswith(exp + "_")]]).sum(axis=1)
            na[exp] = sum_na
        return na

    def get_na_per_salt_experiment_after_averaging(self):
        """Average the data and return the number of NAs per salt experiment"""
        mu = self.get_mu_df()
        na = pd.DataFrame()
        for exp in ['t0', 't1', 't5', 't10', 't20', 't45']:
            sum_na = pd.isnull(mu[[x for x in mu.columns if x.endswith(exp)]]).sum(axis=1)
            na[exp] = sum_na
        return na

    #def is_na_count_exceeds_minnonzero_in_one_experiment(self, min_non_zero=4):
    #    na = self.get_na_per_experiment_after_averaging()
    #    return na.max(axis=1) >= min_non_zero

    def is_na_count_exceeds_minnonzero_in_all_experiment(self, min_non_zero=4):
        """Returns a boolean time series: True for rows that keep at most
        *min_non_zero* non-NA averaged values over all experiments."""
        na = self.get_na_per_experiment_after_averaging()
        N = len(self.get_unique_measurement_name())
        # <= as in Stephania's code so that a row with only 4 non-na values is removed.
        ts = N - na.sum(axis=1) <= min_non_zero
        return ts

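    # Worked example of the criterion above, assuming the 36 yeast measurements
    # (6 alpha times x 6 salt times): a row whose averaged values contain 33
    # NAs keeps N - 33 = 3 non-NA values, which is <= 4, so it is flagged
    # (True) and would be removed by :meth:`drop_na_exceeds_minnonzero`.
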
    def drop_na_exceeds_minnonzero(self, min_non_zero=4):
        """Drop rows whose number of non-null values is below *min_non_zero*

        .. note:: Not used anymore. May be dropped.
        """
        ts = self.is_na_count_exceeds_minnonzero_in_all_experiment(min_non_zero=min_non_zero)
        self.logging.info("Dropping %s rows that have less than %s non-na values" % (sum(ts), min_non_zero))
        self.df.drop(ts[ts].index, inplace=True)

    def clean_na_paper(self, inplace=True, N=2):
        """To be used with the YEAST data only

        :param N: max number of NAs per experiment

        The YEAST data set is made of 6 experiments of 6 data points times 3
        replicates. We first average the data over replicates so we have 6
        experiments of 6 data points. If an experiment has fewer than 6 - N
        values, that experiment is reset: all its data (including replicates)
        are set to NAs.
        """
        na = self.get_na_per_experiment_after_averaging()
        filterna = na > N   # means that there are fewer than 6 - N measurements
        labels = list(na.columns)

        # Let us expand the na dataframe by copying each column (e.g. a0) into
        # the expected replicates (e.g. a0_t0, a0_t0.1, ..., a0_t45)
        for column in filterna.columns:
            expanded_columns = [x for x in self.measurements.columns if x.startswith(column + "_")]
            #col1 = [this+".1" for this in expanded_columns]
            #col2 = [this+".2" for this in expanded_columns]
            #expanded_columns.extend(col1)
            #expanded_columns.extend(col2)
            for x in expanded_columns:
                filterna[x] = filterna[column]

        # let us remove the labels a0, a1, a5, a10, a20, a45
        filterna.drop(labels, axis=1, inplace=True)
        for this in self.metadata.columns:
            filterna[this] = [False] * len(filterna)
        self._filterna = filterna
        if inplace:
            self.df = self.df.mask(filterna)
        else:
            df = self.df.mask(filterna)
            return df

    def clean_na_paper2(self, inplace=True, N=2, M=2):
        """To be used with the YEAST data only

        Identify rows with at least 2 (salt) experiments (M=2) that have more
        than 6 - N values, and cross out (set to NA) the 6 x replicates
        measurements of the experiments that do not pass the criterion.
        """
        na = self.get_na_per_salt_experiment_after_averaging()
        filterna = na > N   # means that there are fewer than 6 - N measurements
        labels = list(na.columns)

        # Let us expand the na dataframe by copying each column (e.g. t0) into
        # the expected replicates (e.g. a0_t0, a0_t0.1, ..., a45_t0)
        for column in filterna.columns:
            expanded_columns = [x for x in self.measurements.columns if x.split(".")[0].endswith(column)]
            for x in expanded_columns:
                filterna[x] = filterna[column]

        # let us remove the labels t0, t1, t5, t10, t20, t45
        filterna.drop(labels, axis=1, inplace=True)
        for this in self.metadata.columns:
            filterna[this] = [False] * len(filterna)
        self._filterna = filterna
        if inplace:
            self.df = self.df.mask(filterna)
        else:
            df = self.df.mask(filterna)
            return df
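

if __name__ == "__main__":
    # A usage sketch only: one possible ordering of the yeast-specific steps,
    # assuming the full raw data set is available via get_yeast_raw_data(), as
    # in the docstring examples above.
    from msdas import get_yeast_raw_data
    r = ReplicatesYeast(get_yeast_raw_data(), verbose=False)
    r.normalise()         # divide each measurement column by its TIC value
    r.log_b2()            # log2 transform of the measurements
    r.clean_na_paper()    # mask alpha experiments with too many NAs
    r.average()           # replace replicates by their mean
    print(r.df.head())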