MS-DAS 0.9.0 documentation

Source code for msdas.yeast

from __future__  import division

import clustering
import numpy as np
import pylab
import pandas as pd
from msdas import replicates
from readers import MassSpecReader
from easydev import get_share_file as gsf
from cno import CNOGraph

__all__ = ["YEAST", "get_yeast_filenames", "YEAST2MIDAS",
           "get_yeast_raw_data", "get_yeast_small_data"]

"""

    sim = plotLBodeFitness(cnolist, pknmodel,res, plotParams=list(F=.3, cmap_scale=2, cex=0.5))

    Ntimes = length(sim)
    N = nrow(sim[[1]]) * ncol(sim[[1]]) * Ntimes
    signals = cnolist@signal
    sim2 = list()
    for(i in seq_along(sim)){
         sim2[[i]] = matrix(colMeans((signals[[i]]-sim[[i]])^2, na.rm=T), nrow=1)
    }
    sim2 = matrix(unlist(sim2), nrow=Ntimes, byrow=T)
    sim2 = colMeans(sim2, na.rm=T)
    d = data.frame(sim2, row.names=colnames(cnolist@signals[[1]]))
    write.table(d, col.names=F, quote=F, sep=",", file="test.csv")


"""


[docs]class YEAST(object): times = [0,1,5,10,20,45] def __init__(self): pass
[docs]def get_yeast_small_data(): """Return filename of data related to YEAST This is the CSV file resulting from the alignment of 6 small data files provided in the package. Here is the procedure followed to create that file. :: from msdas import * m = MassSpecAlignmentYeast(get_yeast_small_files()) # data is now aligned, we can save it # We also wanted the annotations: a = AnnotationsYeast(m) # creates the identifiers a.get_uniprot_entries() # Creates the entry a.set_annotations() # Creates Entry names a.check_entries_versus_sequence() # columns are rename with prefix "a" a.to_csv("YEAST_small_all_test.csv") """ return gsf("msdas", "data", "YEAST_small_all.csv")
[docs]def get_yeast_raw_data(): """Return filename to the YEAST raw data See :func:`get_yeast_small_data` for procedure on how to generate this data file. The input filenames used are :meth:`get_yeast_filenames` """ return gsf("msdas", "data", "Yeast_all_raw.csv")
[docs]def get_yeast_filenames(mode="subset"): """Returns the list of filenames related to the YEAST data :param str mode: valid values are *subset* or *all*. If subset, filenames looked for are files called alphaX.csv . If mode is set to *all*, filenames looked for all files called MaX_annotated_CLEAN_COMPACT_Normedian_FINAL.csv :return: list of 6 full path names related to the YEAST data set (6 files) The corresponding files are sorted by times (ie, 0, 1, 5, 10, 20, 45). The 6 filenames must be provided as input to the :class:`~msdas.alignment.MassSpecAlignmentYeast`. .. seealso:: :class:`~msdas.alignment.MassSpecAlignmentYeast`. """ times = [0, 1, 5, 10, 20, 45] if mode == "subset": filenames = [gsf("msdas", "data","alpha%s.csv" % str(x)) for x in times] return filenames elif mode == "all": prefix = "Ma" suffix = "s04_annotated_CLEAN_COMPACT_NORMmedian_FINAL.csv" filenames = [gsf("msdas", "data","{}{}{}".format(prefix,str(x),suffix)) for x in times] return filenames else: raise ValueError("mode must be in [subset, all]. Defaults to *subset*")
[docs]class YEAST2MIDAS(MassSpecReader, YEAST): """Class pipeline to read/cluster/write yeast data into MIDAS/PKN. The constructor of this class performs the following tasks: #. Reads small data set (with the interpolated data) #. Reads the raw data set (with replicates) #. Cleanup and merging similar to what was done in the small dat set (see below for details) #. Creates an instance of :class:`msdas.clustering.MSClustering` Tools are provided to #. Reads a PKN and build a MIDAS from the data. A new PKN is generated with new names containing the psites. #. plot time series #. perform clustering YEAST2MIDAS is also a :class:`msdas.readers.MassSpecReader` so you can get lots of plotting tools in addition to those provided within this class. Here is an example:: >>> from msdas import yeast >>> y = yeast.YEAST2MIDAS(get_yeast_small_data(), get_yeast_raw_data()) >>> # get a new data frame containing only exemplars >>> df = y.get_df_exemplars(preference=-30) >>> c = y.get_expanded_cnograph("PKN.sif", df) >>> # create the new MIDAS and a CNOGraph for the new PKN >>> c, m, e = y.export_pkn_and_midas("../share/data/PKN-yeast.sif") :Details: The dataframe :attr:`df` contains all measurements and extra information such as the protein/psites/sequence/uniprot_entry. The measurements itself is made of a subset of the 36 measurements, which are combination of 6 times and 6 alpha experiments. You can imagine the data as a matrix 6 by 6. The measurements that are kept for MIDAS are made of the first row, first column and main diagonal. The raw data is preprocessed using the :class:`msdas.replicates.ReplicatesYeast` and follows those steps: #. drop rows where peptide contains an oxidation :meth:`msdas.readers.Cleaner.drop_oxidation` #. Merge peptide with identical peptide sequence (but different psites) #. clean rows with too much NAs using :meth:`~msdas.replicates.ReplicatesYeast.clean_na_paper` and :meth:`~msdas.replicates.ReplicatesYeast.clean_na_paper` #. Drop rows with only NAs #. normalise the data using the TIC values #. Call :meth:`~msdas.readers.Cleaner.fix_duplicated_identifier` (rename duplicated identifiers) """ def __init__(self, data=None, rawdata=None, verbose=False) : """.. rubric:: Constructor :param data: optional data set (default to :func:`get_yeast_small_data`) :param rawdata: optional data set (default to :func:`get_yeast_raw_data`) """ if data is None: data = get_yeast_small_data() if rawdata is None: rawdata = get_yeast_raw_data() super(YEAST2MIDAS, self).__init__(data, cleanup=True, merge_peptides=True, verbose=verbose) self.logging.info("Reading raw data set") self.replicates = replicates.ReplicatesYeast(rawdata) # because of the next drop_na_exceeds function. # one way to drop NA: 4 values required over the 36 cells #self.replicates.drop_na_exceeds_minnonzero() # another way, which looks like what was done in the paper is # to set to drop rows where one experiment does not have values self.replicates.drop_oxidation() # to match small data set self.replicates.merge_identical_peptide_sequence() #self.replicates._rebuild_identifier() self.replicates.clean_na_paper() # cross (replaces with NA) alpha experiment # if not enough values (at least 4 per set of 6 experiment in alpha direction) self.replicates.clean_na_paper2() # cross (replaces with NA) alpha experiment # if not enough values (at least 4 per set of 6 experiment in NaCl direction self.replicates.drop_na_count(108) # to get closer to small data set self.replicates.normalise() # can be before or after #self.replicates.merge_peptides() #self.replicates.drop_oxidation() # to match small data set self.replicates.sort_psites_ors_only() # ??? self.replicates.fix_duplicated_identifier() # called once for all self._cv_buf = pd.DataFrame(self.replicates.get_coefficient_variation()) # here, we set na to zero, just for the clustering self.cluster = clustering.MSClustering(self, fillna=True, verbose=False) # FIXME: yeast.df is a reference so if changed, it affects the user parameter # select only data with the 3 conditions we are interested in self._measures = list(self.measurements.columns) self._measures_salt = ["a"+str(a)+"_t"+str(salt) for salt in self.times for a in self.times] # probably not needed since done in the instanciation above but good check #self._drop_psites_with_bad_quality() #self._drop_visual_clustering() #self.run_clustering() #print("Scaled dataframe as in the clustering is available in self.df_scaled") #self.df_scaled = self.cluster.scale(self.cluster.get_group()).transpose() self.logging.info("data is in `df` attribute and full raw data with replicates in `replicates` attribute") self._cleanup = False self.mapping = self.mapping_small_to_raw_june() def _get_groups(self): return self.cluster.groups groups = property(_get_groups, doc="get the dataframe indices grouped by proteins (alias to cluster.groups)")
[docs] def cleanup_june(self): """Cleanup the raw and small data set. The small data set contains labels that are incorrect and do not represent what pre-processing was performed. Besides, interpolation (and extrapolation) were applied. In order to replace the interpolated values by NA, we need to know the correct label. See :meth:`mapping_small_to_raw_june`. This method replaces the interpolated values by NAs in the :attr:`df` dataframe, relabel corerctly the identifiers and removes rows in the big dataframe :attr:`replicates` that are now useless FUS3_S177^T180+S177^Y182+T180^Y182', 'HOG1_T174^Y176' time zero are missing for now, we keep the data with interpolated values. #. FAR1_S114 has no t0 values. Downstream of PKN. Removed for now #. 3 RCK2 have no t0 values. Downstream of PKN. Removed for now """ if self._cleanup == True: print("already cleanup. nothing to do") mapping = self.mapping_small_to_raw_june() values = [x for x in set(mapping.values()) if x] # should replace HOG1 t0 a 0.0001 because there is 1 missing value, which # is the only missing value. #index = self.replicates['HOG1', "T174^Y176"].index[0] #self.replicates.df.ix[index, 'a0_t0'] = 0.0001 # Selection of the data in the replicates that we wawnt to look at selection = self.replicates.df.Identifier.apply(lambda x: x in values) print(sum(selection)) self.replicates.df = self.replicates.df[selection] # replace the data with the average of the replicates so that now we have the NAs # first, let us get the average, keeping track of the identifier as the index of the df mu = self.replicates.get_mu_df() mu.index = self.replicates.df.Identifier # now let us drop identifier that are not in the mapping (keys) keys = [k for k,v in mapping.iteritems() if v!=None] # the tokeep should be be touched: self.tokeep = self.df.copy() self.df = self.df[self.df.Identifier.apply(lambda x : x in keys)] # replace identifiers by the correct ones self.df.Identifier = [mapping[x] for x in self.df.Identifier] self.df.Psite = [x.split("_")[1] for x in self.df.Identifier] # not need to redo protein column, which should already be correct. #replace the data for identifier in mu.index: index = self.df[self.df.Identifier == identifier].index[0] self.df.ix[index, mu.columns] = mu.ix[identifier].values index = self.tokeep.query("Identifier == 'HOG1_T174^Y176'").index[0] self.df.ix[index] = self.tokeep.ix[index] index = self.tokeep.query("Identifier == 'FUS3_S177+T180^Y182'").index[0] self.df.ix[index] = self.tokeep.ix[index] self.df.ix[index, 'Identifier'] = 'FUS3_S177^T180+S177^Y182+T180^Y182' self.df.ix[index, 'Psite'] = 'S177^T180+S177^Y182+T180^Y182' self._cleanup == True self.cluster = clustering.MSClustering(self, fillna=False, cleanup=False) # ???? obosloet ????
def __cleanup_june2(self): # 'GPD1' 'S23+S24+S27', 'S23+S24+S25+S27', 'S24^S27' #can be clustered. We keep S14^S27 print("WARNING: should be fixed to not use indices but names") self.df.drop([15, 16], axis=0, inplace=True) # keep GPD1_S24^S27 only (paper) self.df.drop([8, 9], axis=0, inplace=True) #DIG2 clustered into S225 (paper) # DIG1 has 4 clusters: 1 we pick up randomly one of them when there is # more than 1. no NAs exccept 2 in S272^S275+T277^S279 #S126+S127 #S142 and S330 and S395+S397. kee the latest with ;ean CV 13% against 33 and 16% self.df.drop([5,1], axis=0, inplace=True) #S272 AND S272^S275. first has CV 10% and 0 NA; second has 15%.later has 1 NA # drop S272^S275 self.df.drop(3, axis=0, inplace=True) #S272^S275+T277^S279 # NO a0_t0 point so removed self.df.drop(self['FAR1_S114'].index[0], inplace=True) # 4 NAs self.df.drop(self['SSK1_S110'].index[0], inplace=True) self.df.drop(self['SSK1_S673'].index[0], inplace=True) # STE20 clustering is difficult despite 9 peptides #We remove T413+S418 because it contains lots of 16 NAs anyway ot of 36 #and 6 out of 16 (MIDAS) index = self.df[self.df.Identifier == "STE20_T413+S418"].index[0] self.df.drop(index, inplace=True) # 35% errors index = self.df[self.df.Identifier == "STE20_T546+S547"].index[0] self.df.drop(index, inplace=True) # we can drop RCK2. It is going to be removed in the export of MIDAS # isnce not in the PKN, but we can do it here as well # true for FPS1, SIC, TEC nodes = ['FPS1', 'RCK2', 'SIC1', 'TEC1'] self.df = self.df[self.df.Protein.apply(lambda x: x not in nodes)]
[docs] def corr2d(self, name1, name2, show=True): d1 = self.get_data_matrix(name1).fillna(0).as_matrix() d2 = self.get_data_matrix(name2).fillna(0).as_matrix() d1 /= pylab.sqrt(pylab.sum(d1**2)) d2 /= pylab.sqrt(pylab.sum(d2**2)) from scipy.signal import correlate2d im = correlate2d(d1, d2) if show: pylab.clf(); pylab.subplot(2,2,1) pylab.imshow(d1, interpolation="None") pylab.subplot(2,2,2) pylab.imshow(d2, interpolation="None") pylab.subplot(2,2,3) pylab.imshow(im, interpolation="None") pylab.colorbar() return im
[docs] def corr2d_all(self, mode="max"): ident = self.df.Identifier N = len(ident) df = pd.DataFrame(np.zeros((N,N)), columns=ident.values) df.index = df.columns for i1 in self.df.Identifier.index: print(i1) for i2 in self.df.Identifier.index: corr = self.corr2d(ident.ix[i1], ident.ix[i2], show=False) if mode=="max": M = corr.max() elif mode=="integral": M = corr.sum()/36. elif mode=="center": M = corr[5,5] else: raise ValueError("mode must be max, integral or center") df.ix[ident.ix[i1], ident.ix[i2]] = M return df
[docs] def plot_group(self, protein): df = self.measurements.ix[self.groups[protein]].transpose() df.columns = self.df.Identifier.ix[self.groups[protein]] df.plot()
[docs] def remove_protein(self, identifier): self.df.drop(self[identifier].index[0], inplace=True)
[docs] def remove_species_not_in_pkn(self, filename): c = CNOGraph(filename) # let us get rid of the data that will not be available nodes = [y for y in set(self.df.Protein) if y not in sorted(set([x.split("_")[0] for x in c.nodes()]))] # ['FPS1', 'RCK2', 'SIC1', 'TEC1'] self.df = self.df[self.df.Protein.apply(lambda x: x not in nodes)] return nodes
[docs] def get_group_psite_transposed(self, name): """Get a dataframe containing a group of protein :param str name: a valid protein name. See :attr:`df.Protein` to get a list :return: transposed dataframe (indices are psites, columns are measurements for ech combination of alpha/NaCl .. seealso:: :meth:`get_group_psite` """ return self.cluster.get_group(name).transpose()
[docs] def get_group_psite(self, name): """Get a dataframe containing a groupd of psites for one protein :param str name: a valid protein name. See :attr:`df.Protein` to get a list :return: dataframe (indices are psites, columns are measurements for ech combination of alpha/time (see class documentation) .. plot:: :include-source: :width: 80% >>> from msdas import yeast >>> y = yeast.YEAST2MIDAS() >>> y.get_group_psite("DIG1").plot() .. seealso:: :meth:`get_group_psite_transposed` """ return self.cluster.get_group(name)
[docs] def groups_cluster(self): """Returns dataframe grouped by protein and clustered. You must run :meth:`run_affinity_propagation_clustering` before hand to populate the **cluster** column. :: >>> y.cluster = y['DIG1'] >>> y.cluster.run_affinity_propagation_clustering(preference=-30) >>> y.groups_cluster() """ if "cluster" not in self.cluster.df.columns: raise ValueError("cluster column not found in the dataframe. See documentation example.") return self.cluster.df.groupby(["Protein", "cluster"]).groups
[docs] def get_psites_exemplars(self, preference=-30): """Returns list of psites corresponding to the exemplars found in the clustering :param float preference: the parameter of the Affinity Propagation algorithm. See :class:`msdas.cluster.Affinity` Affinity propagation algorithm is run on each protein. For each of them, clusters and their exemplars are found. This method returns the list of exemplars >>> psites = y.get_psites_exemplars(preference=-30) .. seealso:: :meth:`get_psites_mapping`. """ psites_tokeep = [] proteins = list(set(self.df.Protein)) self.af_results = {} for protein in proteins: self.logging.debug(protein), if len(self.groups[protein]) == 1: self.logging.debug(" : no clustering required.") psite = list(self.df.ix[self.groups[protein]]['Identifier']) psites_tokeep.append(psite[0]) else: af = clustering.Affinity(self.get_group_psite(protein), method="euclidean", transpose=True, preference=preference, verbose=False) # Get the indices in the entire dataframe of the exemplars: indices = np.array(self.groups[protein])[af.cluster_centers_indices] self.logging.debug("Found %s clusters " % len(indices)) psites_tokeep.extend(self.df.Identifier[indices]) self.af_results[protein] = af return psites_tokeep
[docs] def get_psites_mapping(self, preference=-30): """Returns exemplars and the list of proteins in the same cluster Affinity propagation algorithm is run on each protein. For each of them, clusters and their exemplars are found. This method returns a dictionary where each key is an exemplar and its value contains the list of protein/psites that belongs to this cluster. :param float preference: the parameter of the Affinity Propagation algorithm. See :class:`msdas.clustering.Affinity` :return: dictionary as explained in the documentation here above. .. seealso:: :meth:`get_psites_exemplars`. """ # this call populates the af_results that can be used here below self.logging.debug("Entering get_psites_mapping----------------------------") psites = self.get_psites_exemplars(preference=preference) all_proteins = list(set(self.df.Protein)) res = {} for protein in all_proteins: if protein in self.af_results.keys(): self.logging.debug("in clustered protein") af = self.af_results[protein] names = af.df.columns d = dict([(names[af.cluster_centers_indices[i]] , list(names[af.labels==i])) for i in set(af.labels)]) res.update(d) else: name = [x for x in psites if x.split("_")[0] == protein] res.update({protein:name}) for name in sorted(res.keys()): self.logging.debug("%20s" % name + "\t" + ", ".join(res[name])) return res
[docs] def get_df_exemplars(self, preference=-30, normalise=None): """Returns a normalised dataframe that corresponds to the exemplars The psites are obtained from :meth:`get_psites_exemplars`. :param float preference: the parameter of the Affinity Propagation algorithm. See :class:`msdas.cluster.Affinity` :param normalise: minmax / maxonly / timezero. Set to None to ignore normalisation """ psites = self.get_psites_exemplars(preference=preference) newdf = self.cluster.get_group()[psites].transpose() if normalise=="minmax": for index in newdf.index: M = newdf.ix[index].max() m = newdf.ix[index].min() newdf.ix[index] = (newdf.ix[index] - m)/(M-m) elif normalise=="maxonly": for index in newdf.index: M = newdf.ix[index].max() newdf.ix[index] = newdf.ix[index] / M elif normalise == "timezero": for index in newdf.index: m = newdf.ix[index][0] newdf.ix[index] = newdf.ix[index] - m M = newdf.ix[index].max() newdf.ix[index] /= M else: pass return newdf
[docs] def to_midas(self, filename, preference=-1): """Given a dataframe, export measurements into MIDAS format :param str filename: :param int preference: a negative parameter used in the clustering. See :class:`msdas.clustering.Affinity` :return: the MIDAS object. Internally, a new dataframe is created by selecting the examplars from the clustering (Default preference of -1 is in principle equivalen of not having any clustering). Then, the MIDAS file is created and saved into a file. :: y = yeast.YEAST2MIDAS(get_yeast_small_data(), get_yeast_raw_data() ) m = y.to_midas(df, "MD-test.csv") from cno import XMIDAS m = XMIDAS("MD-test.csv") """ # create a midas builder df = self.get_df_exemplars(preference=preference) mb = self._get_midas_builder_from_df(df) # and save to m = mb.xmidas m.to_midas(filename) return m
def _get_midas_builder_from_df(self, df): # exports alpha=0, as one condition, NaCl=0 as 1 conditions and # alpha=NaCl= on (where t_alpha = t_NaCl) as a thrid condition from cno import MIDASBuilder m = MIDASBuilder() from cno.io.measurements import Measurement as Experiment _measures = ['a0_t0', 'a0_t1', 'a0_t5', 'a0_t10', 'a0_t20', 'a0_t45', 'a1_t1', 'a5_t5', 'a10_t10', 'a20_t20', 'a45_t45', 'a1_t0', 'a5_t0', 'a10_t0', 'a20_t0', 'a45_t0'] df = df[_measures] inhibitors = {} for psite in df.index: for col in df.columns: value = df.ix[psite][col] alpha, nacl = col.split("_") if col == "a0_t0": # special case that need to be added 3 times i.e. a0_t0 time = 0 stimuli = {"a":1, "NaCl":1} e = Experiment(psite, time, stimuli, inhibitors, value) m.add_measurements([e]) stimuli = {"a":1, "NaCl":0} e = Experiment(psite, time, stimuli, inhibitors, value) m.add_measurements([e]) stimuli = {"a":0, "NaCl":1} e = Experiment(psite, time, stimuli, inhibitors, value) m.add_measurements([e]) elif col.startswith("a0"): # case alpha=0, NaCl=1 e.g., a0_t5 time = int(nacl[1:]) stimuli = {"a":0, "NaCl":1} e = Experiment(psite, time, stimuli, inhibitors, value) m.add_measurements([e]) elif col.endswith("t0"): # case alpha=1, NaCl=0 e.g., a5_t0 time = int(alpha[1:]) stimuli = {"a":1, "NaCl":0} e = Experiment(psite, time, stimuli, inhibitors, value) m.add_measurements([e]) else: # case alpha=1, NaCl=1 e.g. a5_t5 time = int(alpha[1:]) # could be time = int(nacl[1:]) stimuli = {"a":1, "NaCl":1} e = Experiment(psite, time, stimuli, inhibitors, value) m.add_measurements([e]) return m
[docs] def get_expanded_cnograph(self, filename_pkn_no_sites, df): """Expands node in a SIF into their psites :param str filename_pkn_no_sites: a valid filename to a PKN in SIF format. :param dataframe df: a dataframe with protein/psites names as indices like the output of :meth:`get_df_exemplars`. :return: a CNOGraph instance :: df = y.get_df_exemplars(-1) y.get_expanded_cnograph("../../share/data/PKN-yeastScaffold.sif", df) """ from cno import CNOGraph c = CNOGraph(filename_pkn_no_sites) proteins = [this.split("_")[0] for this in df.index] proteins = list(set(proteins)) c._signals = [x.replace("^", "~") for x in proteins[:]] for protein in proteins: psites = [this for this in df.index if this.split("_")[0] == protein] psites = [psite.replace("^", "~") for psite in psites] psites = [psite.replace("+", "-") for psite in psites] self.logging.debug(protein ,psites) if psites is not None and len(psite) == 1: # just rename the node in the PKN if found if protein in c.nodes(): #print("Warning. renaming %s into %s" % (protein, psites[0])) c = c.relabel_nodes({protein: psites[0]}) else: self.debug("Warning. %s not found in PKN" % protein) else: if protein in c.nodes(): self.debug("split {} in {} nodes".format(protein, psites)) c.split_node(protein, psites) else: self.debug("Warnig. %s not found in PKN" % protein) c._stimuli = ["a", "NaCl"] #c._signals = [df.index return c
[docs] def export_pkn_and_midas_june(self, pkn, tag="undefined"): df = self.measurements df.index = self.df.Identifier c = self.get_expanded_cnograph(pkn, df) # midas m = self.get_midas() m.df.columns = [x.replace("^", "~") for x in m.df.columns] m.df.columns = [x.replace("+", "-") for x in m.df.columns] m.sim.columns = m.df.columns[:] m.errors = m.sim.copy() #FIXME: bug in cellnopt.core.xmidas need to set cellline manually if created from mifdasbuilder m._cellLine = "undefined" # FIXME: make sure the order is correct #m.experiments.columns = ["NaCl", "a"] df = self.get_cv() errors = self._get_midas_builder_from_df(df).xmidas errors.df.columns = [x.replace("^", "~") for x in errors.df.columns] errors.df.columns = [x.replace("+", "-") for x in errors.df.columns] # FIXME: should use the mapping here errors.df = errors.df[m.df.columns] errors.create_empty_simulation() # FIXME required to update sim and have corret plotting # rename return c,m, errors
[docs] def export_pkn_and_midas(self, pkn_filename, preference=-30, tag="undefined"): """Creates the expanded PKN and MIDAS file given PKN and normalise method Saves new PKN and MIDAS into PKN-Yeast_psites.sif and MD-Yeast_psites.cv :param str pkn_filename: :param float preference: the parameter of the Affinity Propagation algorithm. See :class:`msdas.cluster.Affinity` :return: a tuple with cnograph and midas instances. """ output_pkn_filename = "PKN-Yeast_psites_%s.sif" % tag output_midas_filename = "MD-Yeast_%s.csv" % tag normalise = "None" print("Generating new dataframe given preference of {} for the clustering".format(preference)) newdf = self.get_df_exemplars(normalise=normalise, preference=preference) #newdf = newdf[self.df.columns] #print newdf.columns print("Expanding the original PKN and saving into {}".format(output_pkn_filename)) c = self.get_expanded_cnograph(pkn_filename, newdf) print("Creating the MIDAS file into {} ".format(output_midas_filename)) # raw data contains all measurements m = self._get_midas_builder_from_df(newdf) m = m.xmidas m.df.columns = [x.replace("^", "~") for x in m.df.columns] m.df.columns = [x.replace("+", "-") for x in m.df.columns] m.sim.columns = m.df.columns[:] m.errors = m.sim.copy() # need to remove species in MIDAS that are not in the PKN #pkn = [x for x in c.nodes() if x not in c._find_and_nodes()] not_found = [] #print m.names_species for name in m.names_species: if name not in [x for x in c.nodes() if x not in c._find_and_nodes()]: print("{} not found in PKN. Removing from MIDAS file".format(name)) # FUS3_S177+T180~Y182 not found in PKN not_found.append(name) print("to be removed") print(not_found) #print m.df.index #print m.df.columns print(m.df.columns) m.remove_species(not_found) #FIXME: bug in cellnopt.core.xmidas need to set cellline manually if created from mifdasbuilder m._cellLine = "undefined" # FIXME: make sure the order is correct #m.save(output_midas_filename) # get the errors, filter with respect to the species in m e = self.xmidas_errors() for this in e.df.columns: if this not in m.df.columns: e.df.drop(this,axis=1, inplace=True) # need to reset the simulation. This is usefule for the layout e.create_empty_simulation() e._cellLine = "undefined" # FIXME: make sure the order is correct e.experiments.columns = ["NaCl", "a"] #m.save(output_midas_filename) return (c, m, e)
[docs] def get_errors(self, default_cv=0.5): """Return errors (coefficient variation) for each identifier To get the actual errors scaled, you need to multiply by the mean, that is the data itself. .. todo:: could be move to replicates module """ errors = [] self.logging.info("filling NA with {}".format(default_cv)) for this in self.df.Identifier: error = self.get_coefficient_variation(this, default_cv=default_cv) errors.append(error) df = pd.DataFrame(errors) df.index = self.df.Identifier # set psites as indices df = df[self._measures] # rearrange columns return df
[docs] def xmidas_errors(self, default_cv=0.5): """Return coefficient of variation into a XMIDAS object""" df = self.get_errors(default_cv=default_cv) errors = self._get_midas_builder_from_df(df).xmidas errors.df.columns = [x.replace("^","~").replace("+", "-") for x in errors.df.columns] try: errors.cellLine = "undefined" except: pass return errors
[docs] def mapping_small_to_raw_june(self): mapping = {} for k in self.df.Identifier: mapping[k] = k mapping['DIG1_S272^T277^S279'] = 'DIG1_S272^S275+T277^S279' mapping['GPD1_S24+S25+S27'] = 'GPD1_S23+S24+S25+S27' mapping['GPD1_S23+S24'] = 'GPD1_S23+S24+S27' mapping["RCK2_S33+T35+T44+S46"] = "RCK2_S32+S33+T35+T44+S46" mapping["RCK2_S45"] = "RCK2_S45+S46" mapping['RCK2_S32+S33'] = "RCK2_S32+S33+T35" mapping['DIG1_S395'] = 'DIG1_S395+S397' mapping['GPA1_S199'] = 'GPA1_T189+S199+S200' mapping['PTP2_S258'] = 'PTP2_Y257+S258' mapping['PBS2_S68'] = 'PBS2_S68+S71+S83' mapping['FUS3_T180'] = 'FUS3_T173+S177+T180+Y182' mapping["FUS3_S177+T180^Y182"] = 'FUS3_S177^T180+S177^Y182+T180^Y182' mapping["SKO1_S94^S108^T113"] = "SKO1_S94+S96^S108^T113" mapping['SSK1_S351'] = "SSK1_S350+S351" mapping['STE11_S326'] = "STE11_S323+S326+S326" # TODO fix this psite name mapping['STE20_S192'] = "STE20_S192+S195" mapping['STE20_S195'] = "STE20_S192+S195+S196" mapping['STE20_S196^T197'] = "STE20_S192^S195+S196^T197" mapping['STE20_S418'] = "STE20_T413+S418" mapping["STE20_T170^T172"] = "STE20_S169+T170^T172" mapping["STE20_T203^T207"] = "STE20_T203^T207+T217+T218" mapping["STE20_T573"] = "STE20_T573+T575" #mapping["STE20_T511"] = "STE20_T511_1" mapping['DIG2_T83'] = 'DIG2_T83+S84' mapping['DIG2_S84'] = 'DIG2_S84+T83' mapping['SIC1_S201'] = 'SIC1_S198+S201' mapping['SIC1_T173'] = 'SIC1_T173+S175' mapping['SIC1_S191'] = 'SIC1_S191_1' mapping['DIG2_S84'] = 'DIG2_T83+S84_2' mapping['DIG2_T83'] = 'DIG2_T83+S84_1' """# for now, let us ignore DIG1, DIG2, and STE12, downstream of FUS3 for k,v in mapping.iteritems(): for this in ['DIG1', 'DIG2', 'STE12']: if k.startswith(this): mapping[k] = None """ return mapping
[docs] def get_midas(self): """Return subset of the small data as a MIDAS object""" data = self.measurements data.index = self.df.Identifier m = self._get_midas_builder_from_df(data).xmidas m.df.columns = [x.replace("^", "~") for x in m.df.columns] m.df.columns = [x.replace("+", "-") for x in m.df.columns] return m
[docs] def get_cv(self): """Return coefficient of variation with proper indices. .. todo:: could be in replicates """ cv = pd.DataFrame(self.replicates.get_coefficient_variation()) cv.index = self.replicates.df.Identifier return cv
[docs] def get_coefficient_variation(self, identifier, default_cv=0.5): """Return CV for a particular identifier .. todo:: could be in replicates """ # no more duplicatesd rows to average #mapping = self.mapping_small_to_raw_june() cv = self._cv_buf indices = list(self.replicates.df[self.replicates.df.Identifier == identifier].index) if len(indices)>1: self.logging.warning("get_coefficient_variation on several rows...") errors = cv.ix[indices] errors.index = self.replicates.metadata.ix[indices]['Identifier'] errors = errors.mean() # takes the mean of the errors. errors.fillna(default_cv, inplace=True) return errors
[docs] def pcolor_errors(self, vmax=None, cmap="hot_r", default_cv=np.nan, fontsize=8, vmin=0, salt=False): """plot coefficient of variation for the small daa set using replicates found in the raw data set. :param salt: if set to True, re-arrange the columns based on salt rather than alpha :return: errors in a dataframe .. plot:: :include-source: :width: 80% from msdas import * y = YEAST2MIDAS(get_yeast_small_data(), get_yeast_raw_data()) y.cleanup_june() # replaces data with regenerated data including NAs errors = y.pcolor_errors(vmax=1,vmin=.2,fontsize=7) """ errors = self.get_errors(default_cv=default_cv) errors = errors.ix[sorted(errors.index)] if salt: errors = errors[self._measures_salt] mask = np.ma.array(errors.as_matrix(), mask=np.isnan(errors.as_matrix())) pylab.clf(); if vmax == None: vmax = errors.max().max() cmap = pylab.get_cmap(cmap) cmap.set_bad("grey", 1) pylab.pcolormesh(mask, vmin=vmin, vmax=vmax, cmap=cmap) pylab.colorbar() N, M = errors.shape pylab.ylim([0, N]) pylab.xlim(0, M) pylab.xticks([0.5 +x for x in range(0, M)], errors.columns, rotation=90, fontsize=fontsize) pylab.yticks([0.5+x for x in range(0, N)], errors.index, fontsize=fontsize) # adding the errors to be conservative pylab.tight_layout() return errors
[docs] def pcolor_na(self, raw=False, fontsize=8): """Plot number of NA for protein that are in the small data set Final number of rows is therefore larger thatn in the small data set. overlap between small and raw data is 32 rows. the remaining 25 are combination from the raw data set. .. plot:: :include-source: :width: 80% from msdas import * import pylab y = YEAST2MIDAS(get_yeast_small_data(), get_yeast_raw_data()) y.cleanup_june() errors = y.pcolor_na() """ proteins = list(set(self.df.Protein)) r = self.replicates psites = r.metadata.ix[r.metadata.query("Protein in proteins", engine="python").index].Identifier tags = self._measures NAs = {} for tag in tags: df = r.get_replicates_from_one_unique_measurement(tag) nas = 3 - pd.notnull(df.ix[psites.index]).sum(axis=1) NAs[tag] = nas.values NAs = pd.DataFrame(NAs, index=psites) NAs = NAs[self._measures] NAs = NAs.ix[sorted(NAs.index)] pylab.clf() pylab.pcolor(NAs) N, M = NAs.shape pylab.ylim([0,N]) pylab.xlim(0, M) pylab.xticks([0.5 +x for x in range(0,M)], NAs.columns, rotation=90, fontsize=fontsize) pylab.yticks([0.5+x for x in range(0,N)], NAs.index, fontsize=fontsize) pylab.colorbar() pylab.tight_layout() return NAs
[docs] def plot_psites_mapping(self, preference=-30): """For each protein, plot all data with the exemplar highlighted. The clustering is made with :mod:`msdas.clustering` and its affinity propagation algorithm. In the yeast case, clustering is performed on euclidean distance. :: from msdas import * y = yeast.YEAST2MIDAS(get_yeast_small_data(), get_yeast_raw_data(), drop_non_midas=False y.plot_psites_mapping() """ mapping = self.get_psites_mapping(preference=preference) for k,v in mapping.iteritems(): if len(v)>1: protein = k.split("_")[0] df = self.df.query("Protein==protein", engine="python") df = df.set_index("Identifier").drop(self.metadata.drop("Identifier", axis=1), axis=1) df = df.transpose() v.remove(k) ax = df.apply(lambda x:x/np.sqrt((x.abs()**2).sum()))[v].plot(marker='o', lw=1) df.apply(lambda x:x/np.sqrt((x.abs()**2).sum()))[k].plot(lw=2, ax=ax, marker='o')
[docs] def plot_timeseries_midas(self, psite="DIG1_S126+S127", sigma=2, hold=False): """Plot data related to a given identifier restricted to the MIDAS-compatible set Data that can be used in the ODE package must be time-series. This correspond to salt=0, or NaCl=0 or salt==NaCl; IN the 6x6 matrix, this correspond to first row, first colum ad diagonal. .. plot:: :include-source: :width: 80% from msdas import * y = YEAST2MIDAS(get_yeast_small_data(), get_yeast_raw_data()) y.plot_timeseries("DIG1_S126+S127") Errors (2 sigmas) are also shown """ l = [0,1,5,10,20,45,46,50,55,65,90,91,95,100,120,145] _measures = ['a0_t0', 'a0_t1', 'a0_t5', 'a0_t10', 'a0_t20', 'a0_t45', 'a1_t1', 'a5_t5', 'a10_t10', 'a20_t20', 'a45_t45', 'a1_t0', 'a5_t0', 'a10_t0', 'a20_t0', 'a45_t0'] df = self.df[_measures].transpose() df.columns = self.df.Identifier df['time'] = l if hold==False: pylab.clf() df.plot(x="time", y=psite, marker="o") pylab.legend([psite]) errors = self.get_errors() normerr = df[psite] pylab.errorbar(df['time'],df[psite], yerr=errors.ix[psite].ix[_measures]*normerr*sigma) pylab.axvline(45.5, alpha=0.5) pylab.axvline(90.5, alpha=0.5) Y0 = df[psite]['a0_t0'] pylab.axhline(Y0, alpha=0.5) pylab.text(20,Y0, "a0_tX") pylab.text(70,Y0, "aX_tX") pylab.text(120,Y0, "aX_t0") pylab.xlim([-.5,146])
[docs] def boxplot_errors(self): """ Show errors (coefficient variation) as a boxplot for each identifier""" pylab.clf() self.get_errors(default_cv=np.nan).transpose().boxplot(vert=False) pylab.tight_layout()
def _find_identifiers_with_missing_time0(self): """Return identifiers for which a0_t0 is missing""" identifiers = self.df[pd.isnull(self.df['a0_t0'])].Identifier return identifiers.values
[docs] def interpolate_time0(self, identifier, cmap="hot_r", method="cubic"): """ :param method: 'linear', 'nearest', 'cubic' """ # the data may contains NA so interp2d does not work. we use griddata from scipy.interpolate import griddata grid_x = np.array([[0]*6,[1]*6,[5]*6,[10]*6,[20]*6,[45]*6]); grid_y = grid_x.T data = self.get_data_matrix(identifier).as_matrix() points = [] values = [] for i in range(0,6): for j in range(0,6): if np.isnan(data[i,j]) == False: points.append([self.times[i], self.times[j]]) values.append(data[i,j]) grid_z0 = griddata(np.array(points), np.array(values), (grid_x, grid_y), method=method) pylab.figure(1) pylab.clf() pylab.subplot(1,2,1) mask = np.ma.array(data, mask=np.isnan(data)) cmap = pylab.get_cmap(cmap) cmap.set_bad("grey", 1) #pylab.pcolormesh(pylab.flipud(mask), vmin=vmin, vmax=vmax, cmap=cmap) pylab.imshow(data, interpolation="None", cmap=cmap) pylab.subplot(1,2,2) pylab.imshow(pylab.flipud(grid_z0), origin="lower", cmap=cmap, interpolation='None') pylab.suptitle(identifier) return data, grid_z0
[docs] def plot_figure6(self, identifier,vmin=0,vmax=2): from easydev import colors c = colors.ColorMapTools() d = {'blue': [0,1,1], 'green':[0,1,0], 'red': [1,1,0]} cmap = c.get_cmap(d, reverse=False) pylab.clf(); m = self.get_data_matrix(identifier) im1 = (1./(m.transpose().divide(m[0], axis=1))).transpose() pylab.subplot(1,2,1) pylab.imshow(pylab.flipud(im1), origin="lower", interpolation="None", cmap=cmap,vmin=vmin,vmax=vmax) pylab.colorbar(); pylab.xlim([.5,5.5]); pylab.yticks(range(0,6), ['45','20','10', '5','1', 'Phe0']) for i,label in enumerate([1,2,3,4,5]): for j,x in enumerate(im1[label].values): pylab.text(1+i, 6-j-1, int(100*x)/100.) pylab.subplot(1,2,2) im2 = (m.ix[0]/m).transpose() pylab.imshow(im2, origin="lower", interpolation="None", cmap=cmap, vmin=vmin,vmax=vmax); pylab.colorbar(); pylab.xlim([.5,5.5]); pylab.yticks(range(0,6), ['45','20','10', '5','1', 'NaCl0']) for i,label in enumerate(im2.columns[1:]): for j,x in im2[label].iterkv(): pylab.text(1+i, 6-j-1, int(100*x)/100.) pylab.suptitle("%s" % identifier) return im1, im2
def load_sim_data(filename, times=[0,1,5,10,20,45]): """ signals = colnames(cnolist@signals$`0`) sim = t(sapply(sim, unlist)) colnames(sim) = signals write.csv(sim, "sim.csv") sim = yeast.load_sim_data("sim.csv") m = midas.XMIDAS("MD-Yeast_test_alpha0oneexp_maxonly.csv") m.sim = sim.copy() m.plot(mode="trend") m.plot(mode="mse") c = CNOGraph("PKN-", "MD-") for name in c.nodes(): if name in diffs.columns: c.node[name]['mse'] = diffs[name][0] else: c.node[name]['mse'] = None """ sim = pd.read_csv(filename, index_col=0) N = len(sim.columns) sim['time'] = times sim['experiment'] = ['experiment_0'] * 6 sim['cellLine'] = ['undefined'] * 6 sim = sim.set_index(['cellLine', 'experiment', 'time']) return sim # function for the linear fit to automate the process def plotfit(x, y, yerr=None, order=1): w = None if (yerr is None or np.sum(yerr)==0) else 1/yerr p, cov = np.polyfit(x, y, order, w=w, cov=True) # coefficients and covariance matrix yfit = np.polyval(p, x) # evaluate the polynomial at x perr = np.sqrt(np.diag(cov)) # standard-deviation estimates for each coefficient R2 = np.corrcoef(x, y)[0, 1]**2 # coefficient of determination between x and y resid = y - yfit chi2red = np.sum((resid/yerr)**2)/(y.size - 2) if w is not None else np.nan #return yfit, p, R2, chi2red, perr, resid pylab.errorbar(x, y, yerr=yerr, fmt = 'bo', ecolor='b', capsize=0, elinewidth=2) pylab.plot(x,y) pylab.xlim([-.5,46]) pylab.plot(x, yfit, 'r', linewidth=3, color=[1, 0, 0, .5])