Source code for cellnopt.core.midas

# -*- python -*-
#
#  This file is part of the cinapps.tcell package
#
#  Copyright (c) 2012-2013 - EMBL-EBI
#
#  File author(s): Thomas Cokelaer (cokelaer@ebi.ac.uk)
#
#  Distributed under the GLPv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: www.cellnopt.org
#
##############################################################################
from __future__ import print_function
from __future__ import unicode_literals
import types

import pylab
import numpy
import numpy as np
import pandas as pd

import normalisation
#from oldmidas import MIDAS

from easydev.logging_tools import Logging
import colormap
from easydev import check_param_in_list


__all__ = ["XMIDAS", "MultiMIDAS", "TypicalTimeSeries", "Experiment",
           "Experiments", "MIDASBuilder"]



class MIDAS(object):
    """What is it ?"""
    valid_codes = {
                   'ID':'identifier',
                   'TR':'stimuli/inhibitors',
                   'DA':'times',
                   'DV':'measurements'}

    ignore_codes = ['NOINHIB', 'NOCYTO', 'NOLIG', 'NO-CYTO', 'NO-INHIB', 'NO-LIG']

    def __init__(self, filename=None, verbose=False):
        """.. rubric:: Constructor

        :param str filename: the filename (including correct path)

        """
        self._header = []
        self._names = []
        self.verbose = verbose

        #: filename of the original data
        self.filename = filename

        self.cmap_scale=0.5
        self.fontsize = 16
        self.logging = Logging("INFO")
        self._colormap = colormap.Colormap()




[docs]class MultiMIDAS(object):
    """Data structure to store multiple instances of MIDAS files


    You can read a MIDAS file that contains several cell lines:
    and acces to the midas files usig their cell line name

    .. doctest::

        >>> mm = MultiMIDAS(cnodata("EGFR-ErbB_PCB2009.csv"))
        >>> mm.cellLines
        ['HepG2', 'PriHu']
        >>> mm["HepG2"].namesCues
        ['TGFa', 'MEK12', 'p38', 'PI3K', 'mTORrap', 'GSK3', 'JNK']

    where the list of cell line names is available in the :attr:`cellLines`
    attribute.

    Or you can start from an empty list and add instance later on using :meth:`addMIDAS`
    method.

    """
    def __init__(self, filename=None):
        """.. rubric:: constructor

        :param str filename: a valid MIDAS file (optional)

        """
        self._midasList = []
        self._names = []
        if filename:
            self.readMIDAS(filename)

[docs]    def addMIDAS(self, midas):
        """Add an existing MIDAS instance to the list of MIDAS instances

        .. doctest::

            >>> from cellnopt.core import *
            >>> m = MIDASReader(cnodata("MD-ToyPB.csv"))
            >>> mm = MultiMIDAS()
            >>> mm.addMIDAS(m)

        """
        if midas.celltypeName not in self._names:
            self._midasList.append(midas)
            self._names.append(midas.celltypeName)
        else:
            raise ValueError("midsa with same celltype already in the list")

[docs]    def readMIDAS(self, filename):
        """read MIDAS file and extract individual cellType/cellLine

        This function reads the MIDAS and identifies the cellLines. Then, it
        creates a MIDAS instance for each cellLines and add the MIDAS instance to the
        :attr:`_midasList`. The MIDAS file can then be retrieved using their
        cellLine name, which list is stored in :attr:`cellLines`.

        :param str filename: a valid MIDAS file containing any number of cellLines.


        """
        from oldmidas import MIDASReader
        m = MIDASReader()
        m.filename = filename
        m._names = m._readHeader()
        celltypes = m._get_celltypes()
        if len(celltypes) <= 1:
            m = MIDASReader(filename)
            self.addMIDAS(m)
        else:
            for name in celltypes.keys():
                m = MIDASReader(filename, celltype=name)
                self.addMIDAS(m)

    def _get_cellLines(self):
        names = [x.celltypeName for x in self._midasList]
        return names
    cellLines = property(_get_cellLines,
        doc="return names of all cell lines, which are the MIDAS instance identifier ")

    def __getitem__(self, name):
        index = self.cellLines.index(name)
        return self._midasList[index]

[docs]    def plot(self):
        """Call plot() method for each MIDAS instances in different figures

        More sophisticated plots to easily compare cellLines could be
        implemented.

        """
        for i,m in enumerate(self._midasList):
            from pylab import figure, clf
            figure(i+1)
            clf()
            m.plot()


[docs]class XMIDAS(MIDAS):
    """XMIDAS dat structure. X stands for extended and replaces
    :class:`MIDASReader` class.

    ::

        from cellnopt.core import XMIDAS
        m = XMIDAS(cnodata("tes.csv"))
        m.df # access to the data frame
        m.scale_max(gain=0.9)   # scale over all experiments and time finding the max
                        # and scaling (divide by max) for each species individually.
        m.corr()        # alias to m.df.corr() removing times/experiments
        columns



        tuples = [(exp, time) for exp in ["exp1", "exp2"] for time in [0,1,2,3,4]]
        index = pd.MultiIndex.from_tuples(tuples, names=["experiment", "time"])
        xx = pd.DataFrame(randn(10,2), index=index, columns=["Akt", "Erk"])


    What remains to be done ?

        * average over celltypes  To be done in MultiMIDAS


    .. warning:: if there are replicates, call average_replicates before creating a
        simulation or calling plot"mode="mse")

    .. todo:: when using MSE, an option could be to average the errors by taking into
        account the time. In other words, a weight/integral. if t = 1,2,3,4,5,10,60, the
        errors on 1,2,3,4,5 are more  important than between 5,10,60. does it make sense ?

    .. todo:: make df a property to handle sim properly, if not scaled,
        sim and exp seems to have the same scale also errors are large as expected

    .. warning:: MD-TR-33333-JITcellData.csv contains extra ,,,, at the end. should
        be removed or ignored

    .. todo:: colorbar issue with  midas.XMIDAS("share/data/MD-test_4andgates.csv")


    .. todo:: when plotting, if there is only 1 stimuli and 5-6 inhibitors, the
        width of the stimuli is the same as the one with the inhibitors. cell size
        should be identical, not stretched. See e..g., "EGFR-ErbB_PCB2009.csv"


    .. todo:: when ploting the mse, we should be able to plotonly a subset of the time
      indices (useful for bollean analysis at a given time)


    .. todo::MIDAS have two ways of coding stimuli/inhibitors a short and long version
        TR:aa / TR::aai or TR:aa:Stimuli / TR:aa:Inhibitors note that in the shotr case,
        the letter i is used to encode inhibitor, which is not robust at all.


    ..todo:: a MIDAS class to check validity just to simplfy the XMIDAs class itself.

    .. todo:: inhibitors ends in :i to avoid clashes with same name in stimuli..
    """
    def __init__(self, filename=None, cellLine=None, verbose=False):
        super(XMIDAS, self).__init__(filename)

        self._cellLine = cellLine
        self._celltype_index = 0
        self._experiment_index = 1
        self._time_index = 2
        self._levels = ["cellLine", "experiment", "time"]
        self.verbose = verbose

        self._ignore_invalid_columns = True

        self._read_filename()
        self.create_empty_simulation()
        self.errors = self.sim.copy()

    def _check_param_in_list(self, names, values):
        names = self._str_or_list_to_list(names)
        for name in names:
            # easydev function
            check_param_in_list(name, values)

[docs]    def reset(self):
        """Reset the data to the original data.



        .. warning:: experimental
        .. todo:: copy errors ?
        """
        self.df = self._rawdf.copy()
        self._experiments = self._rawexp.copy()

    def _read_filename(self, filename=None):
        if filename != None:
            self.filename = filename
        if self.filename != None:
            self._data = pd.read_csv(self.filename, skipinitialspace=True, sep=",")
        else:
            self._data = pd.DataFrame()
            self._experiments = pd.DataFrame()
            self.df = pd.DataFrame()
            return

        # figure out the cell line names
        self._preprocess_cellines()

        # remove columns that are invalid
        self._midas_validity()

        # some cleanup to remove columns that have to be ignored
        labels = ["TR:"+x for x in self.ignore_codes if "TR:"+x in self._data.columns]
        self._data = self._data.drop(labels, axis=1)

        try:
            self._init()  # read MIDAS and convert to dataframe
        except Exception, e:
            self.logging.warning("Could not interpret the MIDAS input data file")
            self.logging.warning(e.message)
        self._rawdf = self.df.copy()
        self._rawexp = self.experiments.copy()

        # populate thr simulation dataframe if possible

    def _preprocess_cellines(self):
        #CellLine are tricky to handle with the MIDAS format because they use the
        #same prefix TR: as the treatments. You must be sure that (1) there are
        # 2 : signs (2) the suffix is called CellLine and (3) the middle name is
        # provided

        # Those could be recognised but only CellLine is correct. So, let us
        # enfore the correct one instead of dealing with all kind of user
        # choices.

        cellLines = [col for col in self._data.columns if "CellLine" in col]
        if len(cellLines) == 0:
            raise ValueError("Could not find any column with the required keyword 'CellLine'")

        for name in cellLines:
            if name.count(":") != 2:
                txt = "column name related to CellLine %s must have 2 : characters"
                txt += "{} has less or more than 2".format(name)
                raise ValueError(txt)

        # rename the cellLines if there are undefined
        # let us give thme the same name ('undefined')
        # this should be done at the level of the data
        for i,name in enumerate(cellLines):
            if name.split(":")[1] == "":
                self.logging.warning("Found a column related to CellLine without a name. Renamed to undefined.")
                columns = list(self._data.columns)
                columns[i] = "TR:undefined:CellLine"
                self._data.columns = columns

        # if there is only one undefined, no need for an error.
        # otherwise they will be an ambiguity.
        if len(cellLines) != len(set(cellLines)) and len(cellLines)>1:
            raise ValueError("some cellLines have the same name.")


        for this in self._data.columns:
            if this.split(":")[0].startswith("TR") == True:
                if "cellLine" in this.lower() and "CellLine" not in this:
                    raise ValueError("Found column with invalid tag. A Cell Line must be written 'TR:<name>:CellLine' not {}. Capitalisation matters.".format(this))

        celltype_names = self._get_cellLines()

        if len(cellLines) >=2:
            if self.cellLine == None or self.cellLine not in celltype_names:
                txt = "Error:: More than 1 celline was found.\n"
                txt += "You must select one amongst: {}".format([this.split(":")[1] for this in cellLines])
                raise ValueError(txt)
            else:
                # we could remove columns and rows where cell type is not correct.
                # but we are going to do it in the _init so that a user can
                # change his mind and reset the cell line to be looked at.
                pass

        if len(celltype_names) == 1:
            self._cellLine = celltype_names[0]

        #self._valid_cellLine_names = [this for this in self._]

    def _midas_validity(self):
        # checks are made of self._data only not df that will be built later on.

        if self._ignore_invalid_columns:
            for this in self._data.columns:
                columns = [this for this in self._data.columns if this[0:2] in self.valid_codes]
                bad = [this for this in self._data.columns if this[0:2] not in self.valid_codes]
                if len(bad):
                    self.logging.warning("Found columns that are invalid (do not start with {}). There are removed.".format(self.valid_codes.keys()))
                    self._data = self._data[columns]

            #columns = [this in self._data.columns if this.startswith("")]
            pass

        # check validity of TR:
        for this in self._data.columns:
            if ":" not in this:
                txt = "Error in header of the input MIDAS file\n"
                txt += "    Column's name must contain the special character ':'\n"
                raise ValueError(txt)
            if this.split(":")[0] not in self.valid_codes.keys():
                txt = "Error in header of the input MIDAS file\n"
                txt += "    Column's name must start with one of the valid code {}\n".format(self.valid_codes)
                raise ValueError(txt)


        # check if zero time data is available otherwise need to call
        #maybe better to play with -df
        #_duplicate_time_zero_using_inhibitors_only
        df = self._data[[this for this in self._data.columns if this.startswith("DA")]]
        unique_times = list(set(df.as_matrix().flatten()))
        unique_times.sort()
        if len(unique_times) <2:
            raise ValueError("Must contains at least 2 time points including time zero")
        if 0 not in unique_times or len(unique_times)<=1:
            raise ValueError("You must have zero times in the MIDAS file, that was not found.")
        times = list(df.as_matrix().flatten())
        counter = {}
        for time in unique_times:
            counter[time] = times.count(time)

        self._missing_time_zero = False
        if len(set(counter.keys())) >= 2:
            if counter[0] != counter[unique_times[1]]:
                # call correct function
                self._missing_time_zero = True
            else:
                pass

        if len([x for x in self._data.columns if x.startswith("DV")]) == 0:
            raise ValueError("Header of MIDAS file has no columns starting with DV. expects at least one")


    def _manage_replicates(self):
        """

         tuples = [("HepG2", exp, time) for exp in ["exp1", "exp2"] for time in   [0,1,2,3,4]]
         tuples += [("Liver", exp, time) for exp in ["exp1", "exp2"] for time in [0,1,2,3,4]]
         tuples += [("HepG2", exp, time) for exp in ["exp1"] for time  in [0,1,2,3,4]]
         index = pd.MultiIndex.from_tuples(tuples, names=["CellType", "experiment", "time"])
         xx = pd.DataFrame(randn(25,2), index=index, columns=["Akt", "Erk"])

        xxx = xx.groupby(level=["CellType", "experiment", "time"]).agg([mean, std])
        xxx.to_csv("test.csv", tupleize_cols=False)
        pd.read_csv("test.csv", index_col=[0,1,2], tupleize_cols=False, header=[0,1])

        x.df = x.df[[this for this in x.df.columns if "mean" in this]]
        x.df.columns = [this[0] for this in x.df.columns]

        """
        groups = self.df.groupby(level=self._levels).groups
        if any([len(this)>1 for this in groups.values()])==False:
            self.logging.info("No replicates found")
        else:
            newdf = self.df.groupby(level=self._levels).agg([np.mean, np.std])
            return newdf

[docs]    def average_replicates(self, inplace=False):
        df = self._manage_replicates()

        if isinstance(df,pd.DataFrame):
            dfstd = df[[this for this in df.columns if "std" in this]]

            df = df[[this for this in df.columns if "mean" in this]]
            df.columns = [this[0] for this in df.columns]
            if inplace:
                self.df = df.copy()
                self.sim = self.df.copy()
                self.errors = dfstd.copy()
            else:
                return df

    #def __radd__(self, this):
    #    print("__radd__")
    #    self.df += this.df

    def __div__(self, this):
        m = self.copy()
        m.df /= this
        return m

    def __mul__(self, this):
        m = self.copy()
        m.df *= this
        return m

    def __sub__(self, this):
        # m1 + m2   or m1+=m2
        m = self.copy()
        if isinstance(this, XMIDAS):
            m.df -= this.df
        else:
            m.df -= this
        return m

    def __add__(self, this):
        # m1 + m2   or m1+=m2
        m = self.copy()
        if isinstance(this, XMIDAS):
            m.df += this.df
        else:
            m.df += this
        return m

    def __getitem__(self, item):
        """

        .. doctest::

            >>> m = XMIDAS("MD-ToyPB.csv")
            >>> m['p38']
            >>> m['Cell','experiment_0', '0']

        """
        if len(item)==3 and isinstance(item, str) != True:
            i1, i2, i3 = item
            return self.df.ix[i1].ix[i2].ix[i3]
        elif isinstance(item,str)==True:
            if item in self.df.columns:
                return self.df[item].copy()

    def _get_experiments(self):
        return self._experiments
    experiments = property(_get_experiments,
            doc="Return dataframe with experiments")

    def _get_names_species(self):
        return self.df.columns.values
    names_species = property(_get_names_species,
            doc="list of species")
    names_signals = property(_get_names_species,
            doc="same as :attr:`names_species`")
    species = property(_get_names_species,
            doc="Getter for the columns of the dataframe that represents the species/signals")
    signals = property(_get_names_species,
            doc="Getter for the columns of the dataframe that represents the species/signals")

    def _get_cues(self):
        #cues = [x for x in self.experiments.columns if x.startswith('TR:')]
        cues = [x for x in self.experiments.columns]
        return cues
    def _get_names_cues(self):
        cues = self.names_stimuli + self.names_inhibitors
        return cues
    names_cues = property(_get_names_cues,
            doc="Return list of stimuli and inhibitors together")

    def _get_cellLines(self):
        # XMIDAS created from an input file
        if len(self._data):
            names = [this for this in self._data.columns if "CellLine" in this]
            names = [this.split(":")[1] for this in names]
            return names
        # XMIDAS created from another builder e.g. MIDASBuilder
        else:
            names = self.df.index.levels[0]
            return names
    cellLines = property(_get_cellLines)

    def _get_cellLine(self):
        return self._cellLine
    def _set_cellLine(self, name):
        # TODO check valid name
        names = self.cellLines
        if name not in names:
            raise ValueError("Invalid cellLine name {}. Valid ones are {}".format(name, names))
        self._cellLine = name
        # TODO: do we need to call _init again ?
        self._init()
    cellLine = property(_get_cellLine, _set_cellLine)

    def _get_times(self):
        times = self.df.index.levels[self._time_index]
        return sorted(list(times))
    times = property(_get_times)

    def _get_names_inhibitors(self):
        cues = self._get_cues()
        cues = [x for x in cues if x.endswith(":i")]
        cues = [x[:-2:] for x in cues]
        return cues
    names_inhibitors = property(_get_names_inhibitors)

    def _get_names_stimuli(self):
        cues = self._get_cues()
        return [x for x in cues if not x.endswith(":i")]
    names_stimuli = property(_get_names_stimuli)

    def _init(self):

        # select only data that matches the cell line choice made by the user.
        cellLine = 'TR:%s:CellLine' % self.cellLine
        _data = self._data[self._data[cellLine] == 1]
        # and remove all column with the CellLine keyword
        _data = _data[ [col for col in _data.columns if "CellLine" not in col]]
        #drop ID columns if any
        _data = _data[ [col for col in _data.columns if col.startswith("ID")==False]]


        df_tr = _data[[this for this in _data.columns if this.startswith("TR")]]
        df_da = _data[[this for this in _data.columns if this.startswith("DA")]]
        df_dv = _data[[this for this in _data.columns if this.startswith("DV")]]
        # TODO sort alphabetical ignoring big caps
        df_dv = _data[[this for this in _data.columns if this.startswith("DV")]]

        value_experiments = _data[df_tr.columns]
        value_experiments.replace("NaN", 0, inplace=True)

        value_signals = _data[df_dv.columns].as_matrix()
        value_times = _data[df_da.columns]


        names = [this for this in df_tr[:] if "CellLine" not in this]
        self._experiments = _data[names].drop_duplicates()
        self._experiments.index = range(0, self._experiments.shape[0])
        self._experiments.index = ["experiment_{}".format(this) for this in  self._experiments.index]

        self._experiments.replace("NaN", 0, inplace=True)

        # build the tuples that will be used by the MultiIndex dataframe
        tuples = []
        # make sure to read each row in the original data using .shape
        for ix in _data.index:
            # 1. find name of the experiment by
            this_exp = value_experiments.ix[ix]
            # scan unique experiments and figure out which one is this_exp
            exp_name = None
            for this_unique_exp in self._experiments.iterrows():
                if all(this_unique_exp[1] == this_exp):
                    #[1:] to ignore the cell line TODO
                    # found it
                    exp_name = this_unique_exp[0]
            assert exp_name != None

            # 2. times
            time = set(value_times.ix[ix])
            assert len(time) == 1
            time = list(time)[0]
            tuples.append((self.cellLine, exp_name, time))

        # replace empty strings with 0
        # note that ,,, is interpreted as ,NaN,NaN,NaN
        # but , , , is interpreted as ," "," "," ",
        self._experiments = self._experiments.applymap(lambda x: 0 if isinstance(x, basestring) and x.isspace() else x)
        self._experiments = self._experiments.convert_objects(convert_numeric=True, copy=True)

        index =  pd.MultiIndex.from_tuples(tuples, names=self._levels)
        #keep = [this for this in self.df.columns if this not in ["experiments", "times"]]
        names_species = [x for x in _data.columns if x.startswith('DV:')]
        names_species = [x[3:] for x in names_species]

        self.df = pd.DataFrame(value_signals, index=index, columns=names_species)
        self.df = self.df.sortlevel(["experiment"])
        self.df = self.df.sort_index(axis=1) # sort the species

        if self._missing_time_zero == True:
            self._duplicate_time_zero_using_inhibitors_only()
        if self.df.shape[0] > len(self.times) * self.experiments.shape[0]:
            self.logging.warning("WARNING:: you may have duplicated experiments, pleiase average the replaicates using self.average_replicates(inplace=True)")
        if self.df.max(skipna=True).max(skipna=True) > 1:
            self.logging.warning("WARNING:: values larger than 1. You may want to normalise/scale the data")

        # Get rid of TR in experiments
        self._experiments.columns = [this.replace("TR:", "") for this in self._experiments.columns]
        #

        cues = []
        for c in self._experiments.columns:
            if c.count(":") >=2:
                raise ValueError("Invalid header. Found more than 2 : sign in a column")

            if c.endswith(":Stimuli"):
                cues.append(c.split(":")[0])
            elif c.endswith(":Inhibitors"):
                cues.append(c.split(":")[0] + ":i")
            elif c.endswith("i"):
                cues.append(c[0:-1] + ":i")
            else:
                cues.append(c)
        self._experiments.columns = cues

    def _check_consistency_data(self):
        # times consistency all times must have same length
        #
        pass

    def _duplicate_time_zero_using_inhibitors_only(self):
        """
        Sometimes the time zero data sets are not explicitly written in MIDAS
        files. One example is MD-ExtLiverHepG2-MCP2010-mod4.csv from the

        """
        self.logging.warning("WARNING:: duplicating time zeros data to fit experiment at other times")
        # first figure out the inhibitors

        # find experiment that have missing time zero data
        experiments = list(self.experiments.index)

        tobeadded = None
        for i, this_exp in enumerate(experiments):
            times = self.df.ix[self.cellLine].ix[experiments[i]].index
            if 0 in times:
                pass
            else:
                # need to find the experiment with same inhibitors
                # and time 0
                # there should be only one ? maybe not if replicates.
                these_inhibitors =  self.experiments.ix[this_exp][self.inhibitors]

                for this_exp_intern in experiments:
                    times = self.df.ix[self.cellLine].ix[this_exp_intern].index
                    if 0 in times:
                        if all(self.experiments.ix[this_exp_intern][these_inhibitors.index] ==
                            these_inhibitors):
                            #print("{}(no time zero found) is similar to {}".format(this_exp,  this_exp_intern))
                            break # so that if there are several replicates found, we pick up the first one only

                # get the times for this experimenti. it must contain the time zero
                newdata = self.df.xs((self.cellLine, this_exp_intern))
                # we only need the time 0
                newrow = newdata[newdata.index == 0]
                # let us add some index information that is now missing
                newrow['time'] = 0
                newrow['experiment'] = this_exp
                newrow['cellLine'] = self.cellLine

                # we can now merge with the full data set
                if types.NoneType == type(tobeadded):
                    tobeadded = newrow.copy()
                else:
                    tobeadded = pd.concat([tobeadded, newrow])
        # finally concatenate all the rows with the dataframe df
        df = pd.concat([self.df.reset_index(), tobeadded], ignore_index=True)
        df = df.set_index(["cellLine", "experiment", "time"])
        df = df.sortlevel(["experiment"])
        self.df = df.copy()

    # could be in easydev
    def _str_or_list_to_list(self, labels):
        # TODO: could be part of easydev
        if isinstance(labels, str):
            labels = [labels]
        elif isinstance(labels, float):
            labels = [labels]
        elif isinstance(labels, int):
            labels = [labels]
        elif isinstance(labels, list):
            pass
        else:
            raise TypeError("must provide a list of strings or a string.")
        return labels

[docs]    def remove_species(self, labels):
        """Remove a set of species

        :param labels: list of Species (list of strings) or just one
            species(single string or as a list.

        ::

            m.remove_species("p38")
            m.remove_species(["p38"])

        """
        labels = self._str_or_list_to_list(labels)
        columns = self.df.columns[:]
        for label in labels:
            if label not in columns:
                self.logging.warning("{} not in the species. skipped".format(label))
            else:
                self.df.drop(label, axis=1, inplace=True)
                self.sim.drop(label, axis=1, inplace=True)
                self.errors.drop(label, axis=1, inplace=True)

[docs]    def reset_index(self):
        """Remove all indices (cellLine, time, experiment)

        Done in the 3 dataframes :attr:`df`, :attr:`sim` and :attr:`errors`
        """
        self.df.reset_index(inplace=True)
        self.sim.reset_index(inplace=True)
        self.errors.reset_index(inplace=True)

[docs]    def set_index(self):
        """Reset all indices (cellLine, time, experiment)

        Done in the 3 dataframes :attr:`df`, :attr:`sim` and :attr:`errors`
        """
        if len(self.df):
            self.df.set_index(['cellLine', 'experiment', 'time'], inplace=True)
            self.sim.set_index(['cellLine', 'experiment', 'time'], inplace=True)
            self.errors.set_index(['cellLine', 'experiment', 'time'], inplace=True)

[docs]    def remove_cellLine(self, labels):
        """Remove a cellLine from the dataframe.


        Does not really work since there is only one cellLine in the dataframe.
        all data is contained in :attr:`data` but the current dataframe contains only
        one, which can be changed simply by setting the cellLine attribute with one of the
        valid cellLine found in the :attr:`cellLines` attribute

        """
        self._remove_labels_from_level(labels, "cellLine")

[docs]    def remove_times(self, labels):
        """Remove time values from the data

        :param list labels: one time point or a list of time points. Valid time points
            are in the :attr:`times` attribute.

        """
        self._remove_labels_from_level(labels, "time")

[docs]    def remove_experiments(self, labels):
        """Remove experiment(s) from the dataframe


        :param list labels: one experiment or a list of experiments.
            Valid experiments are in the :attr:`experiments.index` dataframe.
            Experiments are of the form "experiment_12". You can refer
            to an experiment by its number (e.g., here 12).
        """
        self._remove_labels_from_level(labels, "experiment")

    def _remove_labels_from_level(self, labels, level):
        """For a given level and a list of labels, this function
        figures out the rows to remove in the sim/errors/df data frames.

        This is a bit complicated but looks like the proper way

        """
        labels = self._str_or_list_to_list(labels)
        if level == "experiment":
            labels = [x if "experiment" in str(x) else "experiment_"+str(x)
                    for x in labels]

        self.reset_index()
        for label in labels:
            if label not in set(self.df[level]):
                self.logging.warning("{} not in times. Skipped".format(label))
            else:
                self.df.drop(self.df[self.df[level] == label].index, inplace=True)
                self.sim.drop(self.sim[self.sim[level] == label].index, inplace=True)
                self.errors.drop(self.errors[self.errors[level] == label].index, inplace=True)
                if level == "experiment":
                    self.experiments.drop(label, inplace=True)
        # and now set the level back as before
        self.set_index()

[docs]    def remove_stimuli(self, labels):
        """Remove a stimuli from the :attr:`experiment` dataframe

        :param labels: a string or list of string representing the stimuli
        """
        self._remove_column_experiment(labels)

[docs]    def remove_inhibitors(self, labels):
        """Remove inhibitor(s) from the :attr:`experiment` dataframe

        :param labels: a string or list of string representing the inhibitor(s)
        """
        self._remove_column_experiment(labels)

    def _remove_column_experiment(self, labels):
        labels = self._str_or_list_to_list(labels)
        for label in labels:
            if label not in set(self.experiments.columns):
                self.logging.warning("{} not in times. Skipped".format(label))
            else:
                self.experiments.drop(labels, axis=1, inplace=True)

[docs]    def rename_stimuli(self, names_dict):
        """Rename stimuli in the :attr:`experiment` dataframe

        :param names_dict: a dictionary with names (keys) to be replaced (values)

        ::

            from cellnopt.core import *
            m = XMIDAS(cnodata("MD-ToyPB.csv"))
            m.rename_species({"erk":"ERK", "akt":"AKT"})

        .. seealso:: :meth:`rename_stimuli`, :meth:`rename_inhibitors`

        """
        self._check_param_in_list(names_dict.keys(), list(self.experiments.columns))
        columns = list(self.experiments.columns)
        columns = [c if c not in names_dict.keys() else names_dict[c] for c in columns]
        self.experiments.columns = columns

[docs]    def rename_inhibitors(self, names_dict):
        """Rename inhibitors

        :param names_dict: a dictionary with names (keys) to be replaced (values)

        ::

            from cellnopt.core import *
            m = XMIDAS(cnodata("MD-ToyPB.csv"))
            m.rename_species({"raf:i":"RAF:i"})

        .. seealso:: :meth:`rename_stimuli`, :meth:`rename_species`

        .. warning:: inhibitor name must end with the string **:i**

        .. todo:: sanity check that the pair of key/value contain the :i characters


        """
        self._check_param_in_list(names_dict.keys(), list(self.experiments.columns))
        columns = list(self.experiments.columns)
        columns = [c if c not in names_dict.keys() else names_dict[c] for c in columns]
        self.experiments.columns = columns

[docs]    def rename_species(self, names_dict):
        """Rename species in the main :attr:`df` dataframe

        :param names_dict: a dictionary with names (keys) to be replaced (values)

        ::

            from cellnopt.core import *
            m = XMIDAS(cnodata("MD-ToyPB.csv"))
            m.rename_species({"erk":"ERK", "akt":"AKT"})


        .. seealso:: :meth:`rename_stimuli`, :meth:`rename_inhibitors`


        """
        self._check_param_in_list(names_dict.keys(), list(self.df.columns))
        columns = list(self.df.columns)
        columns = [c if c not in names_dict.keys() else names_dict[c] for c in columns]
        self.df.columns = columns
        self.sim.columns = columns

[docs]    def rename_cellLine(self, to_replace):
        """Rename cellLine indices

        :param dict to_replace: dictionary with mapping of values to be replaced.

        For example; to convert time in minutes to time in seconds, use something like::

            m.rename_cellLine({"undefined": "PriHu"})

        """
        self.reset_index()
        self.df.replace({"cellLine": to_replace}, inplace=True)
        self.set_index()

[docs]    def rename_time(self, to_replace):
        """Rename time indices

        :param dict to_replace: dictionary with mapping of values to be replaced.

        For example; to convert time in minutes to time in seconds, use something like::

            m.rename_time({0:0,1:1*60,5:5*60})

        """
        self.reset_index()
        self.df.replace({"time": to_replace}, inplace=True)
        self.set_index()

[docs]    def merge_times(self, how="mean"):
        raise NotImplementedError

[docs]    def add_experiment(self, e):
        raise NotImplementedError

[docs]    def corr(self, names=None, cmap=None):
        """plot correlation between the measured species

        :param list names: restriction to some species if provided.
        :param string cmap: a valid colormap (e.g. jet). Can also use "green" or "heat".

        .. plot::
            :include-source:
            :width: 80%

            >>> from cellnopt.core import *
            >>> m = XMIDAS(cnodata("MD-ToyPB.csv"))
            >>> m.corr(cmap="green")

        """
        cmap = self._get_cmap(cmap)

        corr = self.df.corr()
        N = corr.shape[0]
        names = self.df.columns[:]

        pylab.clf()
        pylab.pcolor(corr, edgecolors="k", cmap=cmap);
        pylab.xticks([x+0.5 for x in range(0,N)], names, rotation=90)
        pylab.yticks([x+0.5 for x in range(0,N)], names)
        pylab.tight_layout()
        pylab.colorbar()
        return corr

[docs]    def scale_max_across_experiments(self, inplace=True):
        """Divide each species column by max acrosss all experiments

        In the MIDAS plot, this is equivalent to dividig each column by
        the max over that column. So, on each column, you should get 1 max values
        set to 1 (if the max is unique). The minimum values may not be set to 0.

        """
        newdf = self.df.divide(self.df.max(), level="experiment")
        if inplace:
             self.df = newdf.copy()
        else:
             return newdf

[docs]    def scale_min_max_across_experiments(self, inplace=True):
        r"""Rescale each species column across all experiments

        .. math::

            X = \frac{X-m}{M-m}


        """
        m = self.df.min()
        M = self.df.max()
        data = (self.df - m)/(M-m.astype(np.float64))
        if inplace:
            self.df = data.copy()
        else:
            return data

[docs]    def scale_max_by_experiments(self, inplace=True):
         newdf = self.df.divide(self.df.max(level="experiment"), level="experiment")
         if inplace:
             self.df = newdf.copy()
         else:
             return newdf

[docs]    def scale_min_max_by_experiments(self, inplace=True):
        m = self.df.min(level="experiment")
        M = self.df.max(level="experiment")
        newdf = self.df.sub(m, level="experiment")
        newdf = newdf.divide(M-m, level="experiment")
        if inplace:
            self.df = newdf.copy()
        else:
            return newdf

[docs]    def scale_max(self, inplace=True):
        """Divide all data by the maximum over entire data set"""
        M = self.df.max().max()
        if inplace:
            self.df /= M
        else:
            return self.df / M

[docs]    def scale_min_max(self, inplace=True):
        r"""Divide all data by the maximum over entire data set

        .. math::

            X = \frac{X-m}{M-m}

        where :math:`m = min_{e,s,t} X` and :math:`M = max_{e,s,t} X`,
        with :math:`e` the experiment, with :math:`s` the species,
        with :math:`t` the time.

        """
        m = self.df.min().min()
        M = self.df.max().max()

        if inplace:
            self.df -= m
            self.df /= (M-m)
        else:
            newdf = self.df - m
            newdf /= (M-m)
            return newdf

[docs]    def create_empty_simulation(self):
        """Populate the simulation dataframe with zeros.

        The simulation has the same layout as the experiment.

        The dataframe is stored in :attr:`sim`.

        """
        self.sim = self.df * 0

[docs]    def create_random_simulation(self):
        """Populate the simulation dataframe with uniformly random values.

        The simulation has the same layout as the experiment.

        The dataframe is stored in :attr:`sim`.
        """
        self.sim = self.df *0 + numpy.random.uniform(size=self.df.shape)

[docs]    def get_diff(self, sim=None, norm="square", normed=True):
        """

        return difference between X and simulation. Take absolute.
        if norm == square or norm == absolute takes absolute values.
        if norm == square, also take power of 2.

        divide by number of time points.

        .. todo:: doc

        """
        # dataframe cannot be compared to None so, we need this trick:
        assert norm in ["absolute", "square"]
        if isinstance(sim, types.NoneType):
            sim = self.sim

        if norm == "square":
            diff = (sim - self.df).abs()**2
        else:
            diff = (sim - self.df).abs()

        diff = diff.sum(level="experiment")

        if normed:
            N = len(self.times)
            diff = diff/float(N)
        return diff

[docs]    def plotSim(self, markersize=3, logx=False, linestyle="--", lw=1,
            color="b", marker="x", **kargs):
        """plot experimental curves

        .. plot::
            :width: 80%
            :include-source:

            >>> from cellnopt.core import *
            >>> m = midas.MIDASReader(cnodata("MD-ToyPB.csv"));
            >>> m.plotMSEs()
            >>> m.plotExp()
            >>> m.plotSim()

        """
        times = numpy.array(self.times)
        # if simulation do not have the same number of points as data
        simtimes = numpy.array(self.sim.index.levels[2])

        if logx == False:
            # a tick at x = 0, 0.5 in each box (size of 1) + last x=1 in last box
            xt = pylab.linspace(0, self.nSignals, self.nSignals*2+1)
            M = max(max(times), max(simtimes))
            times = times/float(max(times))
            simtimes = simtimes/float(M)
            xtlabels = self._get_xtlabels()

        else:
            M = float(max(times))
            xtlin = pylab.linspace(0, self.nSignals, self.nSignals*2+1)
            xt = [int(x)+pylab.log10(1+pylab.mod(x,1)*M)/pylab.log10(1+M)
                    for i,x in enumerate(xtlin)]
            xtlabels = self._get_xtlabels()
            M = max( max(pylab.log10(1+times)), max(pylab.log10(1+simtimes)))
            times = pylab.log10(1+times)/max(pylab.log10(1+times))
            simtimes = pylab.log10(1+simtimes)/float(M)
        #for isim, sim in enumerate(self.sim):

        for i in range(0, self.nExps):
            for j in range(0, self.nSignals):
                # divide data by 1.1 to see results close to 1.
                signal = self.names_signals[j]
                exp = self.experiments.index[i]

                data = self.sim[signal][self.cellLine][exp]

                # sometimes we may want to get rid of all NA and show the lines.
                data = data.dropna()
                times = np.array(list(data.index))
                simtimes = times/float(M)

                pylab.plot(simtimes+j, data/1.05+(self.nExps-i-1), marker=marker, color=color,
                        linestyle=linestyle,
                           markersize=markersize, lw=lw)
                #    plot(times+j, sim[i,j]/1.05+(self.nExps-i-1), 'b--o',
                #           markersize=markersize)
        pylab.gca().set_xticklabels(xtlabels, fontsize=kargs.get("fontsize", 10))
        pylab.gca().set_xticks(xt)

[docs]    def plotExp(self, markersize=3, logx=False,color="black", **kargs):
        """plot experimental curves

        .. plot::
            :width: 80%
            :include-source:

            >>> from cellnopt.core import *
            >>> m = midas.MIDASReader(cnodata("MD-ToyPB.csv"));
            >>> m.plotMSEs()
            >>> m.plotExp()

        .. note:: called by :meth:`plot`
        .. seealso:: :meth:`plot`, :meth:`plotMSEs`, :meth:`plotSim`
        """
        mode = kargs.get("mode", "trend")
        normalise = kargs.get("normalise", True)
        times = np.array(self.times)
        max_time = float(max(self.times))

        if logx == False:
            # a tick at x = 0, 0.5 in each box (size of 1) + last x=1 in last box
            xt = pylab.linspace(0, self.nSignals, self.nSignals*2+1)
            times = times/max_time
            xtlabels = self._get_xtlabels(logx=True)
        else:
            M = max_time
            xtlin = pylab.linspace(0, self.nSignals, self.nSignals*2+1)
            xt = [int(x)+pylab.log10(1+pylab.mod(x,1)*M)/pylab.log10(1+M)
                    for i,x in enumerate(xtlin)]
            xtlabels = self._get_xtlabels()
            times = pylab.log10(1+times)/max(pylab.log10(1+times))

        #for isim, sim in enumerate(self.sim):

        # vMax over all data
        if normalise:
            vMax = float(self.df.max(skipna=True).max(skipna=True))
        else:
            vMax = 1.

        #norm = np.trapz([1]*len(self.times), self.times/max_time)
        #print(norm)

        if mode == "trend":
            ts = TypicalTimeSeries(self.times)

        # TODO must be using the index instead of a range ince indices may not
        # start at zero
        for i in range(0, len(self.experiments)):
            #vMax = float(self.df.max(skipna=True).max(skipna=True))
            for j in range(0, self.nSignals):
                # divide data by 1.1 to see results close to 1.

                y = self.df[self.names_species[j]][self.cellLine][self.experiments.index[i]]
                times = numpy.array(y.index) / max_time
                if mode == "trend":
                    ts._times = times
                #vMax = self.df[self.names_species[j]][self.cellLine].max()

                y = y / vMax / 1.05

                #y = numpy.array([x[i,j] for x in self.exp])/vMax/1.05
                if mode == "trend":
                    try:
                        # time must be normalised by max so alpha is <=1
                        alpha = np.trapz(y.values, times)
                    except:
                        color="white"

                    try:
                        color = ts.get_bestfit_color(y.values)
                    except Exception:
                        color="white"

                    if color == "white":
                        colorc = "k"
                    else:
                        colorc=color

                    try:
                        pylab.plot(times+j, y+self.nExps-i-1 , 'k-o', markersize=markersize, color=colorc)


                        pylab.fill_between(times+j, y+self.nExps-1-i , self.nExps-1-i, alpha=alpha,
                                           color=color)
                    except:
                        pass

                else:
                    pylab.plot(times+j, y+self.nExps-i-1 , 'k-o',
                               markersize=markersize, color="k")


                #    plot(times+j, sim[i,j]/1.05+(self.nExps-i-1), 'b--o', markersize=markersize)
        pylab.gca().set_xticklabels(xtlabels, fontsize=kargs.get("fontsize",10))
        pylab.gca().set_xticks(xt)

    def _get_cmap(self, cmap=None):
        if cmap == "heat":
            cmap = self._colormap.get_cmap_heat_r()
        elif cmap == "green":
            cmap = self._colormap.get_cmap_red_green()
        return cmap

[docs]    def plotMSEs(self, cmap="heat", N=10, norm="square",
        rotation=90,margin=0.05, colorbar=True, vmax=None, vmin=0.,
        mode="trend", **kargs):
        """plot MSE errors and layout


        .. plot::
            :width: 80%
            :include-source:

            >>> from cellnopt.core import *
            >>> m = midas.MIDASReader(cnodata("MD-ToyPB.csv"));
            >>> m.plotMSEs()

        .. todo:: error bars

        .. todo:: dynamic fontsize in the signal names ?

        .. note:: called by :meth:`plot`
        .. seealso:: :meth:`plot`, :meth:`plotMSEs`, :meth:`plotSim`

        .. todo:: need to make it more modular e.g. no cues matrices
        """
        if mode == "trend":
            #should be one with zero being white
            cmap = self._colormap.get_cmap_heat_r()
        else:
            cmap = self._get_cmap(cmap)


        diffs = self.get_diff(self.sim, norm=norm)
        diffs = diffs.ix[self.experiments.index]

        pylab.clf();

        bW = 0.1
        cH = 0.1

        if len(self.names_inhibitors)>0:
            bbW = 0.1
        else:
            bbW = 0

        aH = 1-cH-4*margin
        aW = 1-bW-5*margin - bbW

        # MAIN subplot with signals
        a = pylab.axes([margin, 2*margin, aW, aH])
        M = numpy.nanmax(diffs) # figure out the maximum individual MSE
        m = numpy.nanmin(diffs) # figure out the minimum individual MSE
        vmax_user = vmax
        vmax= max(1, M)         # if M below 1, set the max to 1 otherwise to M
        if vmax_user:
            vmax = vmax_user

        if mode == "mse":
            diffs = masked_array = np.ma.array (diffs, mask=np.isnan(diffs))
            cmap.set_bad("grey", 1.)
            pylab.pcolormesh(pylab.flipud(diffs)**self.cmap_scale, cmap=cmap, vmin=vmin, vmax=vmax, edgecolors='k');
        elif mode == "trend":
            cmap.set_bad("grey", 1.)
            pylab.pcolor(pylab.flipud(diffs*0), cmap=cmap, edgecolors='k');

        a.set_yticks([],[])
        pylab.axis([0, diffs.shape[1], 0, diffs.shape[0]])
        # Could add the names
        ax2 = a.twiny()
        ax2.set_xticks([i+.5 for i,x in enumerate(self.names_species)])
        N = len(self.names_species)
        ax2.set_xticks(pylab.linspace(0.5,N-1, N))
        ax2.set_xticklabels(self.names_species, rotation=90)

        # the stimuli
        b = pylab.axes([margin*2+aW, 2*margin, bW, aH])
        stimuli = numpy.where(numpy.isnan(self.stimuli)==False, self.stimuli, 0.5)

        pylab.pcolor(1-pylab.flipud(stimuli), edgecolors='gray', cmap='gray',vmin=0,vmax=1);
        b.set_yticks([],[])
        b.set_xticks([i+.5 for i,x in enumerate(self.names_stimuli)])
        b.set_xticklabels(self.names_stimuli, rotation=rotation)
        pylab.axis([0,self.stimuli.shape[1], 0, self.stimuli.shape[0]])

        # the inhibitors
        if len(self.names_inhibitors)>0:
            bb = pylab.axes([margin*5+aW, 2*margin, bbW, aH])
            inhibitors = numpy.where(numpy.isnan(self.inhibitors)==False, self.inhibitors, 0.5)
            pylab.pcolor(1-pylab.flipud(inhibitors), edgecolors='gray', cmap='gray',vmin=0,vmax=1);
            bb.set_yticks([],[])
            bb.set_xticks([i+.5 for i,x in enumerate(self.names_inhibitors)])
            bb.set_xticklabels(self.names_inhibitors, rotation=rotation)
            pylab.axis([0,self.inhibitors.shape[1], 0, self.inhibitors.shape[0]])


        d = pylab.axes([margin*2+aW, margin*3+aH, bW, cH])
        pylab.text(0.5,0.5, "Stimuli", color="blue", horizontalalignment="center",
            verticalalignment="center", fontsize=self.fontsize)
        #pcolor(1-numpy.zeros((1, 1)), edgecolors='b', cmap='gray', vmax=1, vmin=0);
        d.set_xticks([],[])
        d.set_yticks([],[])

        if len(self.names_inhibitors)>0:
            dd = pylab.axes([margin*5+aW, margin*3+aH, bbW, cH])
            pylab.text(0.5,0.5, "Inhibitors", color="blue", horizontalalignment="center",
                verticalalignment="center", fontsize=self.fontsize)
            #pcolor(1-numpy.zeros((1, 1)), edgecolors='b', cmap='gray', vmax=1, vmin=0);
            dd.set_xticks([],[])
            dd.set_yticks([],[])

        #colorbar
        # we build our own colorbar to place it on the RHS
        if colorbar and mode=="mse":
            e = pylab.axes([margin*3.5+aW+bW+bbW, 2*margin, margin/2, aH])
            cbar = pylab.linspace(0, 1, N)
            indices = [int(x) for x in cbar**self.cmap_scale*(N-1)]

            cbar = [cbar[i] for i in indices]

            pylab.pcolor(numpy.array([cbar, cbar]).transpose(), cmap=cmap, vmin=0, vmax=1);
            #d.set_xticks([],[])
            e.yaxis.tick_right()
            #e.yaxis.xticks([0,1][0,1])
            # todo: why is it normalised by 20?


            ticks = numpy.array(e.get_yticks())
            M = max(ticks)
            indices = [int(N*x) for x in ticks**self.cmap_scale/(M**self.cmap_scale)]
            e.set_yticks(indices)
            if vmax == 1:
                # set number of digits
                tic = numpy.array(indices)/float(N)
                tic = [int(x*100)/100. for x in tic]
                e.set_yticklabels(tic)
            else:
                e.set_yticklabels([int(x*100)/100. for x in numpy.array(indices)/float(N)*vmax])

            e.set_xticks([],[])

        pylab.sca(a)

    def _get_nExps(self):
         return len(self.experiments)
    nExps = property(_get_nExps, doc="return number of experiments")

    def _get_stimuli(self):
        return self.experiments[[this for this in self.names_stimuli]]
    stimuli = property(_get_stimuli, doc="return the stimuli dataframe")

    def _get_inhibitors(self):
        return self.experiments[[this+":i" for this in self.names_inhibitors]]
    inhibitors = property(_get_inhibitors, doc="return the inhibitors dataframe")

    def _get_nSignals(self):
        return len(self.df.columns)
    nSignals = property(_get_nSignals, doc="return the number of signals")

    def _get_xtlabels(self, logx=False):
        """build the time labels vector

        The vector is [t0,tmid,t0,tmid,...t0,tmid,tend]

        """
        t0 = self.times[0]
        t2 = self.times[-1]

        xtlabels = [int(t0),int((t2-t0)/2)] * self.nSignals + [int(t2)]
        return xtlabels

[docs]    def xplot(self, *args, **kargs):
        """Same as :meth:`plot` using the xkcd layout !"""
        with pylab.xkcd():
            self.plot(*args, **kargs)
            bbox = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
            #pylab.text(3, 12, "XMIDAS", fontsize=14,  verticalalignment='top', bbox=bbox)

            tx = pylab.title("XMIDAS", bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
            tx.set_position((0.6, 1.15))

[docs]    def plot(self, **kargs):
        """

        :param string mode: must be either "mse" or "trend" (defaults to trend)

        calls plotMSEs and plotExp

        if mode == mse, calls also plotSim

        .. plot::
            :include-source:
            :width: 80%

            from cellnopt.core import *
            m = XMIDAS(cnodata("MD-ToyPB.csv"))
            m.plot(mode="trend")

        .. todo:: a zero line

        .. no stimuli or inhibitors
        """
        mode = kargs.get("mode", "trend")
        kargs['mode'] = mode
        assert mode in ["mse", "trend"]

        if mode == "mse":
            if self.df.min().min()<0:
                self.logging.warning("values are expected to be positive")
            if self.df.max().max()>1:
                self.logging.warning("values are expected to be normalised")
        self.plotMSEs(**kargs)

        self.plotExp(**kargs)
        if mode == "mse":
            self.plotSim(**kargs)

[docs]    def save2midas(self, filename, expand_time_column=False):
        """Save XMIDAS into a MIDAS CSV file.

        :param str filename:

        """
        f = open(filename, "w")
        # TODO: cellline

        header = ["TR:%s:CellLine"%self.cellLine]
        for this in self.experiments.columns:
            if "TR:" in this:
                if this.endswith(":i"):
                    header += [this.replace(":i", "i")]
                else:
                    header += [this]
            else:
                if this.endswith(":i"):
                    header += ["TR:"+this.replace(":i", "i")]
                else:
                    header += ["TR:" + this]
        #header += ["TR:"+ this if "TR" not in this else this for this in self.experiments.columns]
        if expand_time_column == False:
            header += ["DA:ALL"]
        else:
            header += ["DA:" +  x for x in self.df.columns]

        try:
            header += ["DV:" +  x for x in self.df.columns]
        except Exception:
            print(self.df.columns)
            raise Exception

        header = ",".join(header)
        f.write(header + "\n")

        for time in self.times:
            # experiments of levels is 1
            # better to use experiments df so that order is same as in experiments
            # for exp in self.df.index.levels[1]:
            for exp in self.experiments.index:

                #FIXME: if we drop an experiment, this fails. do we want to
                # update the laels when calling remove_experiment method
                measurements = self.df.xs((self.cellLine, exp, time))
                measurements = measurements[self.df.columns] # maybe not needed
                # keep it for now to be sure that order of measurements is same as in the header
                experiment = self.experiments.ix[exp]

                #FIXME use isinstance
                if type(measurements) == pd.core.series.Series:
                    if expand_time_column == False:
                        rowdata = [1] + list(experiment.values) + [time] + list(measurements)
                    else:
                        rowdata = [1] + list(experiment.values) +\
                            [time]*len(self.df.columns) + list(measurements)
                    f.write(",".join(["{}".format(x) for x in rowdata]) + "\n")

                elif type(measurements) == pd.core.frame.DataFrame:
                    for measurement in measurements.values:
                        if expand_time_column == False:
                            rowdata = [1] + list(experiment.values) + [time] + list(measurement)
                        else:
                            rowdata = [1] + list(experiment.values) +\
                                [time]*len(self.df.columns) + list(measurement)
                        f.write(",".join(["{}".format(x) for x in rowdata]) + "\n")
                else:
                    raise TypeError()

        f.close()

    def _todelete_get_control(self, experiment_name):
        inhibitors = self.inhibitors.ix[experiment_name]
        # here is a sub selction where experiment matches the experiment_name
        mask = (self.inhibitors == inhibitors).all(axis=1)
        # indices contains all experiment that have the same inhibitors
        indices = self.inhibitors[mask].index

        # now from those experiment, which one is the control (i.e., all stimuli are off)
        stimuli = self.stimuli.ix[indices]
        mask = stimuli.sum(axis=1)==0
        control = stimuli[mask].index

        # TODO assert control is unique
        assert len(control) == 1
        return control[0]


[docs]    def normalise(self, mode, inplace=True, changeThreshold=0, **kargs):
        """Normalise the data

        :param mode: time or  controle
        :param bool inplace:  Defaults to True.

        see :mod:`normalise.XMIDASNormalise`

        .. warning:: not fully tested. the mode "time" should work. The
            control mode has been tested on 2 MIDAS file only.
        """
        assert mode in ["time", "control"], "mode must be control or time"
        kargs['changeThreshold'] = changeThreshold
        if mode == "time":
            n = normalisation.XMIDASNormalise(self, **kargs)
            normed_midas = n.time_normalisation()
        elif mode == "control":
            n = normalisation.XMIDASNormalise(self, **kargs)
            normed_midas = n.control_normalisation()



        if inplace == False:
            return normed_midas
        else:
            self.df = normed_midas.copy()

[docs]    def export2experiments(self):
        """Returns list of Experiments

        Each datum in the dataframe :attr:`df` is converted into an
        instance of :class:`~cellnopt.core.xmidas.Experiment`.

        :return: list of experiments.

        .. todo:: use Experiments

        """
        experiments = []
        for row in self.df.iterrows():
            cellLine, exp, time = row[0]
            data = row[1]
            inhibitors = self.experiments.ix[exp][self.inhibitors]
            stimuli = self.experiments.ix[exp][self.stimuli]
            for species in data.index:
                e = Experiment(species, time=time, stimuli=dict(stimuli),
                        inhibitors=dict(inhibitors), measurement=data[species],
                        cellLine=cellLine)
                experiments.append(e)
        return experiments

[docs]    def add_uniform_distributed_noise(self, inplace=False, dynamic_range=1,
            mode="bounded"):
        """add random (uniformaly distributed) noise to the dataframe

        The noise is uniformliy distributed between -0.5 and 0.5 and added to the
        values contained in the dataframe (for each combinaison of species and
        time/experiment). New values are :math:`\hat{X}=X + noise(-.5, .5)*dr`,
        where dr is the dynamical range. Note that final values may be below
        zero or above 1. If you do not want this feature, set the mode to
        "bounded" (The default is **free**). **bounded** mens

        :param bool inplace: False by default
        :param float dynamic_range: a multiplicative value set to the noise
        :param bool min_value: final values below min are set to min (default is 0)
        :param bool max_value: final values above max are set to max (default is 1)

        """
        # axis=1 means each values is modified
        # axis=0 means the same value is added to the entire column
        dr = dynamic_range
        # add a unique random number to each value irrespective of level/axis
        # N = len(self.df.index)
        # x.df = x.df.apply(lambda x: x + np.random.normal(0, 10, size=N), axis=0)
        # OR change N to be column length and loop over axis 1
        # N = len(self.df.columns)
        # x.df = x.df.apply(lambda x: x + np.random.normal(0, 10, size=N), axis=1)
        if mode=="bounded":
            # because we use min() and max(), we cannot use apply but must use
            # applymap. Otherwise, identical random are generated for each
            # species
            newdf = self.df.applymap(lambda x: x +
                    np.random.uniform(-x.min(),1-x.max())*dr)
        elif mode == "free":
            newdf = self.df + np.random.uniform(self.df) * dr
        else:
            raise ValueError("mode can be bounded or free")

        if inplace:
            self.df = newdf.copy()
        else:
            return newdf

[docs]    def add_gaussian_noise(self, sigma=0.1, inplace=False):
        """add gaussian noise to the data. Results may be negative or above 1"""
        # random.normal accepts x and takes its shape to return random values.
        # so, the addition of x and the ouptut of np.random.normal is as
        # expected: points by point.
        newdf = self.df.apply(lambda x:x+np.random.normal(x, scale=sigma))
        if inplace:
            self.df = newdf.copy()
        else:
            return newdf

    def _make_df_compatible(self):
        """if you use MakeBuilder, you can really add any combi of experiments
        However, the resulting df built is not MIDAS compatible for sure. For example,
        not all same time are available for each experiment.

        """
        raise NotImplementedError

[docs]    def get_residual_errors(self, level="time", normed=False):
        """Return vector with residuals errors

        The residual errors are interesting to look at in the context
        of a boolean analysis. Indeed, residual errors is the minimum error
        that is unavoidable with a boolean network and comes from the discrete
        nature of such a model. In a boolean analysis, one would compare 0/1
        values to continuous values between 0 and 1. Therefore, however good
        is the optimisation, the value of the goodness of fit term cannot go
        under this residual error.

        :param: level to sum over.
        :return: returns residual errors :math:`\sum (round(x)-x)^2`
            the summation is performed over species and experiment by default

        .. doctest::

            >>> from cellnopt.core import cnodata, XMIDAS
            >>> m = XMIDAS(cnodata("MD-ToyMMB_T2.csv"))
            >>> m.get_residual_errors()
            time
            0       0.000000
            10      2.768152
            100     0.954000
            dtype: float64

        """
        #FIXME use normed to divide residual errors by appropriate N
        diff = (self.df - self.df.apply(lambda x: x.round(), axis=1))
        diff_square = diff.apply(lambda x: x**2, axis=1)
        S = diff_square.sum(level="time").sum(axis=1)

        # FIXME. do we take time 0 into account ?
        if normed :
            S /= len(self.experiments) * len(self.times)
        return S

[docs]    def copy(self):
        x = XMIDAS()
        x._data = self._data.copy()
        x._missing_time_zero = self._missing_time_zero
        x.cellLine = self.cellLine
        x.df = self.df.copy()
        x._experiments = self.experiments.copy()
        x.sim = self.sim.copy()
        x.errors = self.errors.copy()
        return x

    def __str__(self):
        txt = "Your data set contains {} cellLines\n".format(len(self.cellLines))
        txt += "   Current selected cell line is {}\n".format(self.cellLine)
        txt += "\nThe data contains \n"
        txt += "{} Species:\n\t {}\n".format(len(self.names_species),
                self.names_species)
        txt += "{} inhibitors:\n\t {}\n".format(len(self.names_inhibitors),
                self.names_inhibitors)
        txt += "{} stimuli:\n\t {}\n".format(len(self.names_stimuli),
                self.names_stimuli)
        return txt

[docs]    def correlation_experiment_one_signal(self, name):
        t = self.df[name]
        pylab.pcolor(t.unstack(1).corr())

[docs]    def pca(self, signal, pca_components=2):
        """Not sure this is the proper way...

        get all experiment related to 1 signal

        .. plot::
            :include-source:
            :width: 80%

            from cellnopt.core import *
            m = midas.XMIDAS(cnodata("MD-ToyPB.csv"))
            #m.df = abs(m.df)
            #m.df/=m.df.max()
            m.pca("gsk3")


        .. todo::  pls = PLSRegression(n_components=3)
            from sklearn.pls import PLSCanonical, PLSRegression

        """
        from sklearn.decomposition import PCA
        pca = PCA(n_components=pca_components)
        #t = self.df[signal]
        #pca.fit(t.unstack(1))

        t = self.df[signal].fillna(0)
        X = t.unstack()
        X_r = pca.fit(X).transform(X)
        pylab.plot(X_r[:,0], X_r[:,1], 'o')
        print(pca.explained_variance_ratio_)

        return pca

[docs]    def boxplot(self, mode="time"):
        """

        :param str mode: time or species

        .. plot::

            from cellnopt.core import *
            m = XMIDAS(cnodata("MD-ToyPB.csv"))
            m.boxplot(mode="other")
            m.boxplot(mode="time")

        """
        if mode == "time":
             self.df.reset_index().boxplot(by="time")
             pylab.tight_layout()
        else:
            self.df.boxplot()

[docs]    def radviz(self, species=None, fontsize=10):
        """

        .. plot::
            :include-source:
            :width: 80%

            from cellnopt.core import *
            m = XMIDAS(cnodata("MD-ToyPB.csv"))
            m.radviz(["ap1", "gsk3", "p38"])

        """
        if species == None:
            species = list(self.df.columns)
        from pandas.tools.plotting import radviz
        df = self.df.reset_index()
        del df['time']
        del df['cellLine']
        pylab.figure(1)
        pylab.clf()
        radviz(df[['experiment']+species], "experiment")
        pylab.legend(fontsize=fontsize)

[docs]    def discretize(self, **kargs):
        return self.discretise(**kargs)

[docs]    def discretise(self, inplace=True,N=2):
        """

        :param int N: number of discrete values (defaults to 2). If set to
            2, values will be either 0 or 1. If set to 5, values wil lbe in
            [0,0.25,0.5,0.75,1]
        :param inplace:

        .. warning. data has to be normalised
        """
        assert N>=1
        N = N-1.
        self.logging.info("Discretization between 0-1 assuming normalised data")
        if inplace:
            self.df = self.df.apply(lambda x: (x*N).round()/N)
        else:
            df = self.df.apply(lambda x: (x*N).round()/N)
            #df = df.values.round(1)
            return df

[docs]    def round(self, inplace=True, decimals=0):
        if inplace == True:
            self.df = self.df.apply(lambda x : x.round(decimals=decimals))
        else:
            return self.df.values.apply(lambda x : x.round(decimals=decimals))

[docs]    def hcluster(self, mode="experiment"):
        """


        .. plot::
            :include-source:
            :width: 80%

            from cellnopt.core import *
            m = midas.XMIDAS(cnodata("MD-ToyPB.csv"))
            m.hcluster("species")

        """
        assert mode in ["experiment", "time", "species"]
        from scipy.spatial.distance import pdist, squareform
        from scipy.cluster.hierarchy import linkage, dendrogram
        pylab.clf()
        if mode == "experiment":
            distxy = squareform(pdist(self.df.unstack("time"), metric='euclidean'))
        elif mode == "time":
            distxy = squareform(pdist(self.df.unstack("experiment"), metric='euclidean'))
        elif mode == "species":
            distxy = squareform(pdist(self.df.transpose(), metric='euclidean'))
        R = dendrogram(linkage(distxy, method='complete'))


        if mode == "time":
            pylab.xticks(pylab.xticks()[0], self.times)
            pylab.title("Clustering by time")
        elif mode == "experiment":
            pylab.xticks(pylab.xticks()[0], self.experiments.index)
            pylab.title("Clustering by experiments")

        else:
            pylab.xticks(pylab.xticks()[0],self.species)
            pylab.title("Clustering by species")

[docs]    def heatmap(self, cmap="heat", transpose=False):
        """Hierarchical clustering on species and one of experiment/time level

        .. plot::
            :include-source:
            :width: 80%

            from cellnopt.core import *
            m = midas.XMIDAS(cnodata("MD-ToyPB.csv"))
            m.heatmap()

        """
        #FIXME 1 looks like dendograms are not shown. why?
        from biokit.viz.heatmap import Heatmap
        data = self.df.query('time>0').unstack(2).ix[self.cellLine]
        if transpose:
            h = Heatmap(data.transpose())
        else:
            h = Heatmap(data)
        h.plot(cmap=cmap)
        return h

[docs]    def shuffle(self, mode="experiment", inplace=True):

        """Shuffle data

        :param str mode: `timeseries` shuffles experiments and species; timeseries
            are unchanged. `all` shuflles through time, experiment and species.


        mode can be

        # `timeseries` that is
        # `all`
        # `signals` or `species`: sum over signals is constant


        #. by_signals (or by_species, by_columns, species, signals,
            columns) shuffles each column independently. All values are shuffled
            but the sum over a column/species remains identical.
            constqnt is df.sum()
        # shuffle over index. This means that values with same cell/exp/time are shuffled;
          This is therefore over species as well but keep a kind of time information
          constqnt is sum over experiment: m.df.sum(level="experiment").sum(axis=1)


        .. plot::
            :width: 80%

            from cellnopt.core import *
            m = midas.XMIDAS(cnodata("MD-ToyPB.csv"))
            m.plot()

        Shuffling qll timeseries keeping their structures:

        .. plot::
            :include-source:
            :width: 80%

            from cellnopt.core import *
            m = midas.XMIDAS(cnodata("MD-ToyPB.csv"))
            m.shuffle(mode="timeseries")
            m.plot()


        """
        if inplace != True:
            raise NotImplementedError

        if mode == "experiment":
            self.df.reindex(self.df.index, key=lambda x:
                list(self.experiments.index).index(x[1]))
        elif mode == "all":
            # The random.shuffle function does not work!! somehow sum of data increases or decreases
            # One must use numpy.random.shuffle instead
            #print(shuffle)
            shape = self.df.shape
            data = self.df.values.reshape(shape[0]*shape[1])
            np.random.shuffle(data)
            count = 0
            # not very efficient but works for now
            for i in range(0,shape[0]):
                for j in range(0,shape[1]):
                    self.df.values[i][j] = data[count]
                    count += 1
        elif mode in ["signals", "species",  "columns"]:
            for c in self.df.columns:
                self.df[c] = np.random.permutation(self.df[c].values)
        elif mode == "indices":
            # m.df.sum(level="experiment").sum(axis=1) is constant
            for this_index in self.df.index:
                np.random.shuffle(self.df.ix[this_index].values)
        elif mode == "timeseries":
            species = list(self.species)
            exps = list(self.experiments.index)
            pairs = []
            for s in species:
                for e in exps:
                    pairs.append((s,e))
            # permutation now
            np.random.shuffle(pairs)
            df = self.df.copy()
            # FIXME: cell not implemented yet
            count = 0
            for s in species:
                for e in exps:
                    rand_s = pairs[count][0]
                    rand_e = pairs[count][1]
                    data = list(df[rand_s].ix[self.cellLine].ix[rand_e])

                    self.df[s].ix[self.cellLine].ix[e] = data
                    count += 1
        else:
            raise NotImplementedError

    # works for simple cases where only one stimuli is on at a time
[docs]    def sort_experiments_by_stimuli(self):
        stimuli = list(self.stimuli.columns)
        list_exp = []
        for stimulus in stimuli:
            # First, we get group of experiment for a given stimuli
            # here groups[1] means get experiments where EGF is on
            groups = self.experiments.groupby(stimulus).groups
            if 1 not in groups.keys():
                continue
            experiments = groups[1]
            # Second, we sort the subset of experiments based on the inhibitors
            # (already sorted alphabetically FIXME ?)
            for inhibitor in list(self.inhibitors.columns):
                groups = self.experiments.ix[experiments].groupby(inhibitor).groups
                if 1 in groups.keys():
                    for exp in groups[1]:
                        if exp not in list_exp:
                            list_exp.append(exp)
                if 0 in groups.keys():
                    for exp in groups[0]:
                        if exp not in list_exp:
                            list_exp.append(exp)
        assert len(list_exp) == len(self.experiments)

        self._experiments = self.experiments.reindex_axis(list_exp, axis=0)
        self.df = self.df.reindex_axis(list_exp, axis=0, level=1)
        return list_exp

[docs]    def sort_experiments_by_inhibitors(self):

        inhibitors = list(self.inhibitors.columns)
        list_exp = []
        for inhibitor in inhibitors:
            # First, we get group of experiment for a given stimuli
            # here groups[1] means get experiments where EGF is on
            groups = self.experiments.groupby(inhibitor).groups
            if 1 not in groups.keys():
                continue
            experiments = groups[1]
            # Second, we sort the subset of experiments based on the inhibitors
            # (already sorted alphabetically FIXME ?)
            for stimulus in list(self.stimuli.columns):
                groups = self.experiments.ix[experiments].groupby(stimulus).groups
                if 1 in groups.keys():
                    for exp in groups[1]:
                        if exp not in list_exp:
                            list_exp.append(exp)
                if 0 in groups.keys():
                    for exp in groups[0]:
                        if exp not in list_exp:
                            list_exp.append(exp)
        for exp in self.experiments.index:
            #print("experiment were not handled. added a posterirori ")
            if exp not in list_exp:
                list_exp.append(exp)
        assert len(list_exp) == len(self.experiments)

        self._experiments = self.experiments.reindex_axis(list_exp, axis=0)
        self.df = self.df.reindex_axis(list_exp, axis=0, level=1)
        return list_exp




    def __eq__(self, other):
        if all(other.df == self.df) == False:
            return False
        if all(other.experiments == self.experiments) == False:
            return False
        return True

"""
class TrendTimeSeries(object):
    def __init__(self, data=None, times=None):
        if isinstance(data, pd.TimeSeries):
            self.data = data

        # pd.DataFrame([0,2,4,6,8], index=[0,10,20,30,40])

    def _set_times(self):
        pass
    def _get_times(self):
        pass
    times = property(_set_times, _get_times, doc="")

    def _set_data(self):
        # if a timeseries ok
        # otherwise transform to timeseries
        pass
    def _get_data(self):

        pass
    data = property(_get_data,_set_data)
"""


[docs]class TypicalTimeSeries(object):
    """Utility that figures out the trend of a time series

    Returns color similar to what is contained in DataRail.


    .. todo:: must deal with NA


    """
    def __init__(self, times=None):
        self._times = times  # ref do not change
    def _get_times(self):
        return self._times
    times = property(_get_times)

[docs]    def transient(self, x=None):
        """

        m = MIDASReader(...)
        y = transient(m.times)
        x = m.times
        plot(x,y)

        returns normqlised vector
        """
        if x == None:
            x = self.times
        M = max(x)
        v = np.array([(M-y)/(M/2.) if y>=M/2. else y/(M/2.) for y in x])
        return self._normed(v)

[docs]    def constant(self, x=0):
        v = np.array([x] * len(self.times))
        v = self._normed(v)
        return v

    def _normed(self, v):
        sumsq = np.sqrt(sum([this**2 for this in v]))
        return v/sumsq

[docs]    def earlier(self, x=None, n=3., N=4.):
        if x == None:
            x = self.times
        M = max(x)
        v = np.array([(M-y)/(n*M/N) if y>=M/N else y/(M/N) for y in x])
        return self._normed(v)

[docs]    def sustained(self, x=None, L=0.5):
        if x == None:
            x = self.times
        M = max(x)
        m = L * M
        v = np.array([y if y<m else m for y in x])
        return self._normed(v)

[docs]    def inverse_sustained(self, x=None, L=0.5):
        if x == None:
            x = self.times
        M = max(x)
        m = L * M
        v = np.array([(M-y) if y < m else M-m for y in x])
        return self._normed(v)


[docs]    def later(self, x=None, L=0.5):
        if x == None:
            x = self.times
        M = max(x)
        m = L * M
        v = np.array([0 if y<m else y-m for y in x])
        return self._normed(v)

    def _correlate(self, a, b):
        a = self._normed(a)
        b = self._normed(b)
        return np.correlate(a,b)

    def _get_correlation(self, a):
        correlation = {}
        correlation['later'] = self._correlate(a, self.later())
        correlation['earlier'] = self._correlate(a, self.earlier())
        correlation['earlier2'] = self._correlate(a, self.earlier(n=1,N=10))
        correlation['transient'] = self._correlate(a, self.transient())
        correlation['constant_half'] = self._correlate(a, self.constant(0.5))
        correlation['constant_unity'] = self._correlate(a, self.constant(1))
        correlation['sustained'] = self._correlate(a, self.sustained(L=.5))
        correlation['inverse_sustained'] = self._correlate(a, self.inverse_sustained(L=.5))

        return correlation

[docs]    def plot(self, data):
        corrs = self._get_correlation(data)
        clf()
        pylab.plot(self.times, self._normed(data), label="data", lw=2, ls="--")
        # transient
        pylab.plot(self.times, self.transient(), 'o-', label="transient " + str(corrs['transient']))
        # earlier
        pylab.plot(self.times, self.earlier(), 'o-', label="earlier " + str(corrs['earlier']))
        pylab.plot(self.times, self.earlier(n=1, N=10), 'o-', label="earlier2 " + str(corrs['earlier2']))
        # later
        pylab.plot(self.times, self.later(), 'o-', label="later " + str(corrs['later']))
        # constant
        pylab.plot(self.times, self.constant(.5), 'o-', label="constant " + str(corrs['constant_half']))
        # sustained
        pylab.plot(self.times, self.sustained(L=.5), 'o-', label="sustained" + str(corrs['sustained']))
        pylab.plot(self.times, self.inverse_sustained(L=.5), 'o-', label="inv sustained" + str(corrs['inverse_sustained']))
        pylab.legend()

[docs]    def get_bestfit(self, data):
        corrs = self._get_correlation(data)
        keys,values = (corrs.keys(), corrs.values())
        M  = max(values)
        return keys[np.argmax(values)]

[docs]    def get_bestfit_color(self, data):
        corrs = self._get_correlation(data)
        keys,values = (corrs.keys(), corrs.values())
        M  = max(values)
        res = keys[np.argmax(values)]

        if "constant" in res:
            return "black"
        elif "later" in res:
            return "red"
        elif "transient" in res:
            return "yellow"
        elif "earlier" in res:
            return "purple"
        elif "sustained" in res:
            return "green"
        else:
            return "white"


[docs]class Experiment(object):
    """Data structure to store a measurement.



    """
    def __init__(self, protein_name, time, stimuli, inhibitors, measurement,
            cellLine="undefined", units="second"):
        """

        :param str protein:
        :param float time:
        :param dict stimuli: a dictionary
        :param dict inhibitors: a dictionary
        :param float measurement: the value
        :param str cellLine: Defaults to "undefined"
        :param str units: Defaults to "second" (not yet used)

        """
        self._time = time

        self._stimuli = None
        self.stimuli = stimuli

        self._measurement = measurement
        self._protein_name = protein_name

        self._inhibitors = None
        self.inhibitors = inhibitors

        self._cellLine = cellLine
        self._units = units

    def _get_units(self):
        return self._units
    def _set_units(self, units):
        assert units in ["second", "hour", "minute", "day"]
        self._units = units
    units = property(_get_units, _set_units ,doc="units (second, hour, minute, day")

    def _get_cellLine(self):
        return self._cellLine
    def _set_cellLine(self, cellLine):
        assert isinstance(cellLine, str)
        self._cellLine = cellLine
    cellLine = property(_get_cellLine, _set_cellLine)

    def _get_time(self):
        return self._time
    def _set_time(self, time):
        assert isinstance(time, (int, float))
        assert time>=0
    time = property(_get_time, _set_time)

    def _get_protein_name(self):
        return self._protein_name
    def _set_protein_name(self, name):
        assert isinstance(name, str)
        self._protein_name = name
    protein_name = property(_get_protein_name, _set_protein_name)

    def _get_stimuli(self):
        return self._stimuli
    def _set_stimuli(self, stimuli):
        isinstance(stimuli, dict)
        for k,v in stimuli.iteritems():
            if v >1 or v<0:
                raise ValueError("Value of the stimulus {} must be inside the range [0,1]".format(k))
        self._stimuli = stimuli.copy()
    stimuli = property(_get_stimuli, _set_stimuli)

    def _get_inhibitors(self):
        return self._inhibitors
    def _set_inhibitors(self, inhibitors):
        isinstance(inhibitors, dict)
        for k,v in inhibitors.iteritems():
            if v >1 or v<0:
                raise ValueError("Value of the inhibitor {} must be inside the range [0,1]".format(k))
        self._inhibitors = inhibitors.copy()
    inhibitors = property(_get_inhibitors, _set_inhibitors)

    def _get_data(self):
        return self._measurement
    def _set_data(self, data):
        assert isinstance(data, (int, float))
        self._measurement = data
    data = property(_get_data, _set_data)

[docs]    def cues_as_dict(self):
        data = self.stimuli.copy()
        data.update(self.inhibitors)
        return data

[docs]    def get_cues(self):
        cues = sorted(self.stimuli.keys()) + sorted(self.inhibitors.keys())
        return cues

    def __str__(self):
        txt = "Protein {} (cellLine {}) measured at time {} has value: {}".format(
                self.protein_name, self.cellLine, self.time, self.data)
        txt += " for the following experiment: \n\tStimuli: {}\n\tInhibitors: {}.".format(
                self.stimuli, self.inhibitors)
        return txt


[docs]class Experiments(object):
    """

        >>> es = Experiments()
        >>> e1 = Experiment("AKT", 0, {"EGFR":1}, {"AKT":0}, 0.1)
        >>> e2 = Experiment("AKT", 5, {"EGFR":1}, {"AKT":0}, 0.5)
        >>> es.add_single_experiments([e1,e2])

    """
    def __init__(self):
        #raise NotImplementedError
        self.experiments = []

[docs]    def add_single_experiments(self, experiments):
        for exp in experiments:
            # sanity check:: stimuli and inhibitors must be provided
            import copy
            self.experiments.append(copy.deepcopy(exp))

    def _get_species(self):
        species = sorted(list(set([e.protein_name for e in self.experiments])))
        return species
    species = property(_get_species)


    def __len__(self):
        return len(self.experiments)



[docs]class MIDASBuilder(object):
    """STarts a MIDAS file from scratch and export 2 CSV MIDAS file.

    .. warning:: to be used with care. Right now it seems to work but still in
        development.

        >>> m = MIDASBuilder()
        >>> e1 = Experiment("AKT", 0, {"EGFR":1}, {"AKT":0}, 0.1)
        >>> e2 = Experiment("AKT", 5, {"EGFR":1}, {"AKT":0}, 0.5)
        >>> e3 = Experiment("AKT", 10, {"EGFR":1}, {"AKT":0}, 0.9)
        >>> e4 = Experiment("AKT", 0, {"EGFR":0}, {"AKT":0}, 0.1)
        >>> e5 = Experiment("AKT", 5, {"EGFR":0}, {"AKT":0}, 0.1)
        >>> e6 = Experiment("AKT", 10, {"EGFR":0}, {"AKT":0}, 0.1)
        >>> for e in [e1,e2,e3,e4,e5,e6]:
        ...     m.add_experiment(e)
        >>> m.export2midas("test.csv")

    This class allows one to add experiments to obtain a dataframe compatible with
    XMIDAS class, which can then be saved using XMIDAS.export2midas.

    More sophisticated builders can be added.


    """
    def __init__(self):
        self.experiments = []


[docs]    def test_example(self):
        e1 = Experiment("DIG1", 0, {"EGF":1, "Akt":1}, {}, 1)
        e2 = Experiment("DIG1", 0, {"EGF":1, "Akt":1}, {}, 1.2)
        e3 = Experiment("DIG1", 0, {"EGF":1, "Akt":1}, {}, 1.3)
        e4 = Experiment("DIG2", 0, {"EGF":1, "Akt":1}, {}, 3)
        e5 = Experiment("DIG2", 10, {"EGF":1, "Akt":1}, {}, 2)
        e6 = Experiment("DIG1", 10, {"EGF":1, "Akt":1}, {}, 1.5)
        e7 = Experiment("DIG3", 20, {"EGF":1, "Akt":0}, {}, 3.5)
        self.experiments = []
        self.add_list_experiments([e1,e2,e3,e4,e5,e6, e7])


[docs]    def add_experiment(self, experiment):
        if len(self.experiments) == 0:
            self.experiments.append(experiment)
        else:
            # check that
            assert sorted(experiment.stimuli) == sorted(self.experiments[0].stimuli)
            assert sorted(experiment.inhibitors) == sorted( self.experiments[0].inhibitors)
            self.experiments.append(experiment)

[docs]    def add_list_experiments(self, experiments):
        for e in experiments:
            self.add_experiment(e)

[docs]    def set_random_experiments(self, stimuli, inhibitors, species, times):
        """

        :param stimuli:
        :param inhibitors:
        :param species:
        :param times:

        """
        raise NotImplementedError

[docs]    def get_colnames(self):
        return self.experiments[0].get_cues()


    def _get_experiment_name(self, e):
        data = e.stimuli.copy()
        data.update(e.inhibitors)

        # let us build a dataframe corresponding to the experiment. This is a 1-row DF
        mydf = pd.DataFrame(data, index=[0], columns=self._dfexp.columns)

        # let us compare it with the full list of unique experiments to figure out
        # its name
        candidate = self._dfexp[(self._dfexp == mydf.ix[0]).all(axis=1)]
        candidate = candidate.index # just need the indices, not the content
        if len(candidate) != 1:
            print(candidate)
            raise ValueError("Found 0 or more than 1 candidate experiment. ")
        return candidate[0]



        # slow
        #q1 = " and ".join(["{}=={}".format(k[self._shift:],v) for  k,v in data.iteritems()])
        #
        #try:
        #    return self._df.query(q1 , engine="python").index[0]
        #except:
        #    return self._get_experiments().query(q1 , engine="python").index[0]

    def _get_xmidas_df(self):
        # this is a df used to stored and extract information from the
        # experiments
        #exp_names = self._get_experiments()
        self._dfexp = self._get_experiments().copy()
        if "TR:" in self._dfexp.columns[0]:
            self._shift=3
            self._dfexp.columns = [col[3:] for col in self._dfexp.columns] # get rid of TR: for query
        else:
            self._shift = 0
        print("building")
        df = pd.DataFrame({
            'time': [this.time for this in self.experiments],
            #'experiment':[self._get_experiment_name(exp_names, this) for this in exps],
            'experiment':[self._get_experiment_name(this) for this in self.experiments],
            'value':[this.data for this in self.experiments],
            'species':[this.protein_name for this in self.experiments],
            'cellLine':[this.cellLine for this in self.experiments]})

        # create multi index data frame
        df.set_index(["cellLine", "experiment", "time"], inplace=True)
        print(2)

        # now we need to move species names as columns and values column as a
        # matrix. There may be NA values.

        # I thought from here, a simple pivot_table call would make the trick of
        # setting species as the column replicates are averaged, which we do not
        # want.

        # amatrix of values. The tricky part is that measuremets/replicaes may
        # be done for a species and not others so there are possibly NA. The
        # matrix that holds the data are a dimension NxM computed here below

        # values will be populated little by little by appending experiments
        # what we know for sure right now are the list of speices and the first
        # experiment index
        Nspecies = len(df.species.unique())
        tuples = []
        df_values = []
        species = sorted(df.species.unique())

        # FIXME: the usage of the query is slow. need to be speed up
        self._df = df
        for index in df.index.unique():
            data = df.xs(index, level=["cellLine", "experiment", "time"])
            # for each combi of cellline/exp/time, what is the max number of
            # replicates over all species
            M = data.groupby("species").species.count().max()
            for x in range(0,M):
                tuples.append(index)

            values = pd.DataFrame([[np.nan] * Nspecies]*M, columns=species)
            for this in species:
                data_species = data.query("species=='{}'".format(this), engine="python")['value']
                if len(data_species):
                    values[this] =  list(data_species) + [np.nan] * (M-len(data_species))
                else:
                    pass
            df_values.append(values)

            # here we built the dt for a single index
        print(3)
        values = pd.concat(df_values, ignore_index=True)
        index = pd.MultiIndex.from_tuples(tuples, names=["cellLine", "experiment", "time"])
        newdf = pd.DataFrame(values.as_matrix(), index=index, columns=species)
        print(4)
        return newdf

    def _get_xmidas(self):
        """pbl: replicates are ignored !!

        .. todo:: get rid of TR: in the experiments df
        """
        df_experiments = self._get_experiments()

        species = sorted(list(set([e.protein_name for e in self.experiments])))
        times = sorted(list(set([e.time for e in self.experiments])))
        experiment_names = list(df_experiments.index)

        df = self._get_xmidas_df()
        df = df.sort_index(axis=1)
        df = df.sortlevel(["experiment"])


        x = XMIDAS()


        # proper column compatible with MIDAS:
        stimuli = sorted(list(set([y for e in self.experiments for y in e.stimuli.keys()])))
        inhibitors = sorted(list(set([y for e in self.experiments for y in e.inhibitors.keys()])))

        columns = []
        for name in df_experiments.columns:
            if name in stimuli:
                if name.startswith("TR:")==False:
                    newname = "TR:" + name
                    columns.append(newname)
                else:
                    columns.append(name)
            elif name in inhibitors:
                if name.startswith("TR:")==False:
                    newname = "TR:" + name + ":i"
                    columns.append(newname)
                else:
                    columns.append(name)
            else:
                raise ValueError()
        df_experiments.columns = columns

        x._experiments = df_experiments.copy()
        x.df = df.copy()
        x.create_empty_simulation()
        x._cellLine = x.cellLines[0]
        x._rawdf = df.copy()
        x._rawexp = x._experiments.copy()
        return x
    xmidas = property(_get_xmidas)

    def _get_experiments(self):
        df_experiments = pd.DataFrame([e.cues_as_dict() for e in
            self.experiments], columns=self.get_colnames())
        df_experiments = df_experiments.drop_duplicates()
        df_experiments.index = range(0, df_experiments.shape[0])
        df_experiments.index = ["experiment_{}".format(this) for this in  df_experiments.index]
        return df_experiments

[docs]    def export2midas(self, filename):
        xmidas = self.xmidas
        xmidas.save2midas(filename)
CellNOpt homepage|cellnopt.core 1.0.0 documentation

Source code for cellnopt.core.midas

Search