MS-DAS 0.9.0 documentation

Source code for msdas.alignment

# -*- python -*-
#
#  This file is part of MS-DAS software
#
#  Copyright (c) 2014 - EBI-EMBL
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>, Marti Bernardo Faura
#  bernardo@ebi.ac.uk
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#
##############################################################################
"""Module to align several files together.

The contents of the files, in terms of protein names and peptide sequences, may
or may not overlap. Each file is treated as a different experiment; therefore,
column names that are not standard (Protein, Sequence, Entry, ...) are renamed
with a prefix (the filenames, or the prefixes provided by the user).


"""
import os

import pylab
from easydev import Logging
from msdas.readers import MassSpecReader
from msdas.readers import PSites
from msdas.tools import SequenceTools


__all__ = ["MassSpecAlignmentYeast", "MassSpecAlignmentBase"]



# ALIGNMENT, not merging
class MassSpecAlignmentBase(Logging):
    """Base class related to reading/writing mass spectrometry data sets.

    This class gathers common functionalities for more specialised classes
    such as :class:`MassSpecAlignmentYeast`.

    Inputs can be a filename or an instance of MassSpecReader::

        from msdas import *
        m = MassSpecAlignmentBase()
        r1 = MassSpecReader(get_yeast_filenames()[0])
        r2 = MassSpecReader(get_yeast_filenames()[1])
        df = m.merge([r1.df, r2.df])

    The contents of r1 and r2 are similar (same column names), so columns
    must be renamed (except for the Protein, Sequence and so on, i.e. the
    metadata).

    .. seealso:: :class:`MassSpecAlignment`, :class:`MassSpecAlignmentYeast`,
        :class:`MassSpecAlignmentTCell`.

    """
    AND = "^"

    def __init__(self, filenames=None, verbose=True):
        """.. rubric:: Constructor

        :param filenames: a filename (str) or a list of filenames. Not
            required here but used by sub classes. A list is copied; a single
            string is wrapped into a one-element list; anything else
            (including None) is stored unchanged.
        :param bool verbose: verbosity set on by default

        """
        super(MassSpecAlignmentBase, self).__init__(level=verbose)

        # create this reader to help us reading single CSV files later on
        self.ms_reader = MassSpecReader(verbose=verbose)

        if isinstance(filenames, list):
            # defensive copy: later changes to the caller's list must not
            # affect this instance
            self.filenames = filenames[:]
        elif isinstance(filenames, str):
            self.filenames = [filenames]
        else:
            self.filenames = filenames

        #: the dataframe, set to None by default
        self.df = None

        self._error_messages = {
            'df': "df attribute not found. Please call " +
                  "read(filename) method or use proper MassSpecAlignment class",
        }

        #: list of column names to perform the merge on:
        #: 'Protein', 'Sequence', 'Psite', 'Sequence_Phospho'
        self.merge_on = ['Protein', "Sequence", 'Psite', 'Sequence_Phospho']

    def _get_mode(self):
        return self.ms_reader.mode

    def _set_mode(self, mode):
        self.ms_reader.mode = mode

    mode = property(_get_mode, _set_mode,
                    doc="get/set mode of the MassSpecReader")

    def check_format(self):
        """Check that every entry of the ``Psite`` column is valid.

        Validity is delegated to :meth:`PSites.isvalid`.

        :raises AttributeError: if :attr:`df` has not been populated yet.
        :raises ValueError: on the first invalid psite found.

        """
        if self.df is None:
            # Same exception type as the former "NoneType has no attribute
            # Psite" failure, but with an actionable message.
            raise AttributeError(self._error_messages['df'])

        ptools = PSites(verbose=self.level)
        for psite in self.df.Psite:
            # NOTE(review): the explicit comparison is kept on purpose; the
            # return contract of PSites.isvalid is not visible from here, so
            # only a result equal to False is treated as invalid.
            if ptools.isvalid(psite) == False:  # noqa: E712
                raise ValueError("found invalid psite %s" % psite)

    def merge(self, dfs, on=None):
        """Merge/align several dataframes

        :param list dfs: list of dataframes to align
        :param list on: list of columns to perform the merge on. If not
            provided, use the :attr:`merge_on` attribute (Protein, Sequence,
            Psite, Sequence_Phospho).
        :return: a new dataframe. With a single input, a copy of that input
            is returned as is; otherwise the *on* columns come first,
            followed by all the other columns.
        :raises ValueError: if *dfs* is empty.

        .. note:: This is a merge using the merge function from pandas
            library. The merge is performed using the **how** parameter set
            to **outer**.

        """
        # "is None" rather than "== None": the latter is evaluated
        # element-wise (and then fails in the "if") for array-like values.
        if on is None:
            on = self.merge_on[:]
        else:
            # a plain list is needed for the concatenation further below
            on = list(on)

        if len(dfs) == 0:
            raise ValueError("No data provided. Please provide a list of dataframe")

        if len(dfs) == 1:
            # nothing to do
            return dfs[0].copy()

        merged = dfs[0].copy()
        # lengths may differ from one dataframe to another, hence the outer
        # join
        for other in dfs[1:]:
            merged = merged.merge(other, how="outer", on=on)

        # reorder columns so that all measurements are at the end
        remaining = [c for c in merged.columns if c not in on]
        return merged[on + remaining]

    def __str__(self):
        txt = "This is MassSpecAlignment instance\n"
        if self.df is None:
            # printing an instance must not fail before any data is loaded
            txt += "It contains no data yet (the attribute df is None)"
            return txt
        txt += "It contains %s combination of protein names and psites\n" % len(self.df.Protein)
        txt += "Data is contained in the attribute df (dataframe)"
        return txt


# not used anymore
class _MassSpecAlignmentTCell(MassSpecAlignmentBase, SequenceTools):
    """Align several Mass Spectrometry data files.

    See :class:`MassSpecAlignmentBase` for the list of columns to be found to
    perform the alignment/merging and :class:`MassSpecAlignmentYeast` for more
    details.

    MS-DAS provides a set of 3 files to play with::

        from msdas import *
        from msdas.alignment import _MassSpecAlignmentTCell
        m = _MassSpecAlignmentTCell(get_tcell_filenames())

    .. todo:: check specific cases such as Q8IYB3-2 isoform 2: why is the
        isoform 1 picked up?

    The original files contain extra information such as
    Retention_time__min_, Charge, measured mass that are kept in the rawdf
    attribute.

    .. warning:: not fully tested

    """
    def __init__(self, filenames=None, verbose=True, prefixes=None):
        """.. rubric:: constructor

        :param filenames: a filename or list of filenames. See documentation
            of the class for details about the format
        :param bool verbose: verbosity set on by default
        :param list prefixes: one prefix per filename, used to rename the
            measurement columns. Defaults to the file names without
            directory and extension.
        :raises ValueError: if the number of prefixes differs from the number
            of filenames.

        """
        # The class name carries a leading underscore; the former call used
        # the undefined name "MassSpecAlignmentTCell" and raised NameError.
        super(_MassSpecAlignmentTCell, self).__init__(filenames, verbose=verbose)

        self.prefixes = prefixes
        if self.filenames:
            if prefixes is None:
                self.logging.warning("No prefixes provided. We will use the filename")
                prefixes = [os.path.splitext(os.path.split(f)[-1])[0]
                            for f in self.filenames]
                self.prefixes = prefixes
            if len(prefixes) != len(self.filenames):
                raise ValueError("Number of prefixes must match number of filenames")
            self._init_tcell(prefixes)
        self.logging.error("MassSpecAlignmentTCell has not been tested thoroughly. Use with care")

    def _update_df_tcell(self):
        # NOTE(review): rawdf is not set anywhere in this class; it must be
        # provided before calling this method.
        # we select only some columns
        metadata = ["Max_fold_change", "Psite", "Protein", "ProteinID",
                    "Unstimulated", "Sequence"]
        selected_columns = [x for x in self.rawdf.columns
                            if x.startswith(("Nor", "Raw")) or x in metadata]
        # .loc is the label-based replacement of .ix, which was removed from
        # pandas
        self.df = self.rawdf.loc[:, selected_columns].copy()

    def _read_tcell_csv(self, filename, tag):
        # Use an external reader that will gather all kind of conventions
        self.ms_reader.read_csv(filename, sep=",")
        # if you merge a single file, it may not work. To be tested.
        self.ms_reader.rename_psites()
        self.ms_reader.set_zero_to_na()
        # renaming columns
        self.ms_reader._rename_measurements(tag)
        return self.ms_reader.df

    def _init_tcell(self, prefixes):
        # Read every file with its own prefix; the first file needs no
        # special treatment, and with a single file the merge below simply
        # returns a copy of its dataframe.
        # (The former "self.filenames > 1" guard compared a list with an int,
        # which raises TypeError on Python 3.)
        self._dfs = []
        for i, filename in enumerate(self.filenames):
            # if we set names in read_csv, floats are converted to strings,
            # so the column names are set afterwards (by the reader).
            thisdf = self._read_tcell_csv(filename, tag=prefixes[i])
            self._dfs.append(thisdf)
        self.df = self.merge(self._dfs).copy()
class MassSpecAlignmentYeast(MassSpecAlignmentBase):
    """Align several Mass Spectrometry data files.

    MS-DAS provides a set of 6 files to play with. Their paths can be
    obtained using function :func:`get_yeast_filenames`::

        from msdas import *
        filenames = alignment.get_yeast_filenames()
        yeast = alignment.MassSpecAlignmentYeast(filenames)

    See :class:`MassSpecAlignmentBase` for the list of columns to be found to
    perform the alignment/merging.

    Since measurements may have the same names from one file to another, they
    need to be renamed. Internally, a prefix is added. It is populated with
    the prefixes provided by the user; otherwise with the filenames
    themselves.

    ::

        from msdas import *
        m = MassSpecAlignmentYeast(get_yeast_filenames(),
            prefixes=["a0", "a1", "a5", "a10", "a20", "a45"])

    You can then create a MassSpecReader instance from the dataframe just
    created and save it to a file::

        r = MassSpecReader(m)
        r.to_csv("test_align.csv")

    """
    def __init__(self, filenames=None, verbose=True, prefixes=None):
        """.. rubric:: constructor

        :param list filenames: list of filenames (readable by MassSpecReader).
        :param bool verbose: verbosity set on by default
        :param list prefixes: one prefix per filename, used to rename the
            measurement columns. Defaults to the file names without
            directory and extension.
        :raises ValueError: if the number of prefixes differs from the number
            of filenames.

        """
        super(MassSpecAlignmentYeast, self).__init__(filenames, verbose=verbose)
        # forwarded to the underlying MassSpecReader through the mode property
        self.mode = "YEAST"

        self.prefixes = prefixes
        if self.filenames:
            if prefixes is None:
                self.logging.warning("No prefixes provided. We will use the filename")
                prefixes = [os.path.splitext(os.path.split(f)[-1])[0]
                            for f in self.filenames]
                # keep the attribute in sync with the prefixes actually used
                self.prefixes = prefixes
            if len(prefixes) != len(self.filenames):
                raise ValueError("Number of prefixes must match number of filenames")
            self._init_yeast(prefixes)

    def _read_yeast_csv(self, filename, tag):
        # Use an external reader that will gather all kind of conventions
        self.ms_reader.read_csv(filename, sep=",")
        # if you merge a single file, it may not work. To be tested.
        self.ms_reader.rename_psites()
        self.ms_reader.set_zero_to_na()
        # renaming columns
        self.ms_reader._rename_measurements(tag)
        return self.ms_reader.df

    def _init_yeast(self, prefixes):
        # Read every file with its own prefix; the first file needs no
        # special treatment, and with a single file the merge below simply
        # returns a copy of its dataframe.
        # (The former "self.filenames > 1" guard compared a list with an int,
        # which raises TypeError on Python 3.)
        self._dfs = []
        for i, filename in enumerate(self.filenames):
            # if we set names in read_csv, floats are converted to strings,
            # so the column names are set afterwards (by the reader).
            thisdf = self._read_yeast_csv(filename, tag=prefixes[i])
            self._dfs.append(thisdf)
        self.df = self.merge(self._dfs).copy()