MS-DAS 0.9.0 documentation

Source code for msdas.alignment

# -*- python -*-
#  This file is part of MS-DAS software
#  Copyright (c) 2014 - EBI-EMBL
#  File author(s): Thomas Cokelaer <>, Marti Bernardo Faura
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
"""Module to align several files together.

The contents of the files in terms of protein name and peptide sequence can
overlap or not. Each file is taken as a different experiment and therefore column
names that are not standard (Protein, Sequence, Entry, ...) are renamed with a
prefix (the filenames or prefixes provided by the user).
"""

import os

import pylab
from easydev import Logging
from msdas.readers import MassSpecReader
from msdas.readers import PSites
from import SequenceTools

# Names exported by ``from msdas.alignment import *``.
__all__ = ["MassSpecAlignmentYeast", "MassSpecAlignmentBase"]

# ALIGNMENT not merging
class MassSpecAlignmentBase(Logging):
    """Base class related to reading/writing Mass Spectrometry data sets

    This class gathers common functionalities for more specialised
    classes such as :class:`MassSpecAlignmentYeast`.

    Inputs can be a filename or an instance of MassSpecReader::

        from msdas import *
        m = MassSpecAlignmentBase()
        r1 = MassSpecReader(get_yeast_filenames()[0])
        r2 = MassSpecReader(get_yeast_filenames()[1])
        df = m.merge([r1.df,r2.df])

    The contents of r1 and r2 are similar (same column names). So columns must
    be renamed (except for the Protein, Sequence and so on i.e. metadata).

    .. seealso:: :class:`MassSpecAlignment`, :class:`MassSpecAlignmentYeast`,
        :class:`MassSpecAlignmentTCell`.

    """
    AND = "^"

    def __init__(self, filenames=None, verbose=True):
        """.. rubric:: Constructor

        :param filenames: a single filename (str) or a list of filenames.
            Not required here but used by sub classes. A string is wrapped
            into a one-element list and a list is copied; any other value
            (e.g. None) is stored unchanged.
        :param bool verbose: verbosity set on by default

        """
        super(MassSpecAlignmentBase, self).__init__(level=verbose)

        # create this class to help us reading single CSV later on
        self.ms_reader = MassSpecReader(verbose=verbose)

        # The branches are mutually exclusive: with two independent ``if``
        # statements the copy made for a list was immediately overwritten
        # by the ``else`` branch of the string test.
        if isinstance(filenames, list):
            self.filenames = filenames[:]
        elif isinstance(filenames, str):
            self.filenames = [filenames]
        else:
            self.filenames = filenames

        #: the dataframe, set to None by default
        self.df = None

        self._error_messages = {
            'df': "df attribute not found. Please call " +
                  "read(filename) method or use proper MassSpecAlignment class",
        }

        #: list of column names to performed the merge on 'Protein', "Sequence", 'Psite', 'Sequence_Phospho'
        self.merge_on = ['Protein', "Sequence", 'Psite', 'Sequence_Phospho']

    def _get_mode(self):
        return self.ms_reader.mode

    def _set_mode(self, mode):
        self.ms_reader.mode = mode

    mode = property(_get_mode, _set_mode,
                    doc="get/set mode of the MassSpecReader")

    def check_format(self):
        """Check that every entry of the Psite column of :attr:`df` is valid

        :raises ValueError: on the first psite rejected by ``PSites.isvalid``

        """
        ptools = PSites(verbose=self.level)
        for psite in self.df.Psite:
            # Explicit comparison kept on purpose: the return type of
            # isvalid() is not visible here, so truthiness is not assumed.
            if ptools.isvalid(psite) == False:  # noqa: E712
                raise ValueError("found invalid psite %s" % psite)

    def merge(self, dfs, on=None):
        """Merge/align several dataframes

        :param list dfs: list of dataframes to align
        :param list on: list of column to perform the merge on. If not provided,
            use the :attr:`merge_on` attribute (Protein, Sequence, Psite,
            Sequence_Phospho).
        :return: a new dataframe; the input dataframes are not modified.
        :raises ValueError: if *dfs* is empty

        .. note:: This is a merge using the merge function from pandas library.
            the merge is performed using the **how** parameter set to **outer**.

        """
        if on is None:
            on = self.merge_on[:]

        if len(dfs) == 0:
            raise ValueError("No data provided. Please provide a list of dataframe")

        if len(dfs) == 1:
            # nothing to do
            return dfs[0].copy()

        df = dfs[0].copy()
        # length may be different
        for thisdf in dfs[1:]:
            df = df.merge(thisdf, how="outer", on=on)

        # reorder columns so that the merge keys come first and all
        # measurement columns are at the end
        df = df[on + [c for c in df.columns if c not in on]]
        return df

    def __str__(self):
        txt = "This is MassSpecAlignment instance\n"
        if self.df is None:
            # df is None until a sub class (or the user) populates it;
            # printing the object must not raise in that state.
            txt += "No data loaded yet (attribute df is None)"
            return txt
        txt += "It contains %s combination of protein names and psites\n" % len(self.df.Protein)
        txt += "Data is contained in the attribute df (dataframe)"
        return txt


# not used anymore
class _MassSpecAlignmentTCell(MassSpecAlignmentBase, SequenceTools):
    """Align several Mass Spectrometry data files.

    See :class:`MassSpecAlignmentBase` for the list of columns to be found to
    perform the alignment/merging and :class:`MassSpecAlignmentYeast` for more
    details

    MS-DAS provides a set of 3 files to play with::

        from msdas import *
        m = MassSpecAlignmentTCell(get_tcell_filenames())

    .. todo:: check specific cases such as Q8IYB3-2 isoform 2 why the isoform 1
        picked up

    The original files contains extra information such as Retention_time__min_,
    Charge, measured mass that are kept in rawdf attribute.

    .. warning:: not fully tested

    """
    def __init__(self, filenames=None, verbose=True, prefixes=None):
        """.. rubric:: constructor

        :param filenames: a filename or list of filenames. See documentation
            of the class for details about the format
        :param bool verbose: verbosity set on by default
        :param list prefixes: one prefix per filename, used to rename the
            measurement columns of each file. If not provided, the file
            names (without directory and extension) are used.
        :raises ValueError: if the number of prefixes differs from the
            number of filenames

        """
        # The class name carries a leading underscore; super() must use the
        # actual name otherwise instantiation fails with a NameError.
        super(_MassSpecAlignmentTCell, self).__init__(filenames, verbose=verbose)
        self.prefixes = prefixes

        if self.filenames:
            if prefixes is None:
                self.logging.warning("No prefixes provided. We will use the filename")
                prefixes = [os.path.splitext(os.path.split(f)[-1])[0] for f in self.filenames]
                self.prefixes = prefixes
            if len(prefixes) != len(self.filenames):
                raise ValueError("Number of prefixes must match number of filenames")
            self._init_tcell(prefixes)
        self.logging.error("MassSpecAlignmentTCell has not been tested thoroughly. Use with care")

    def _update_df_tcell(self):
        # we select only some columns
        # NOTE(review): rawdf is not set anywhere in this class; presumably
        # provided by the caller or a parent class -- confirm before use.
        kept = ["Max_fold_change", "Psite", "Protein", "ProteinID",
                "Unstimulated", "Sequence"]
        selected_columns = [x for x in self.rawdf.columns
                            if x.startswith(("Nor", "Raw")) or x in kept]
        # .loc replaces the .ix indexer, which no longer exists in pandas
        self.df = self.rawdf.loc[:, selected_columns].copy()

    def _read_tcell_csv(self, filename, tag):
        # Use an external reader that will gather all kind of conventions
        self.ms_reader.read_csv(filename, sep=",")
        # if you merge the single file, it may not work. to be tested
        self.ms_reader.rename_psites()
        self.ms_reader.set_zero_to_na()
        # renaming columns
        self.ms_reader._rename_measurements(tag)
        return self.ms_reader.df

    def _init_tcell(self, prefixes):
        # Read every file with its own prefix, then merge them all.
        # The former ``self.filenames > 1`` guard compared a list with an
        # int (TypeError on Python 3) and was not needed: iterating over a
        # single file simply yields one dataframe.
        self._dfs = []
        for filename, prefix in zip(self.filenames, prefixes):
            # if we set names in read_csv, floats are converted to strings.
            # so we set the column names afterwards.
            self._dfs.append(self._read_tcell_csv(filename, tag=prefix))
        self.df = self.merge(self._dfs).copy()
class MassSpecAlignmentYeast(MassSpecAlignmentBase):
    """Align several Mass Spectrometry data files.

    MS-DAS provides a set of 6 files to play with. Their paths can be obtained
    using function :func:`get_yeast_filenames`::

        from msdas import *
        filenames = alignment.get_yeast_filenames()
        yeast = alignment.MassSpecAlignmentYeast(filenames)

    See :class:`MassSpecAlignmentBase` for the list of columns to be found to
    perform the alignment/merging.

    Since measurements may have the same names from one file to another, they
    need to be renamed. Internally, a prefix is added. It is populated with
    prefixes provided by the user; otherwise with the filenames themselves.

    ::

        from msdas import *
        m = MassSpecAlignmentYeast(get_yeast_filenames(),
            prefixes = ["a0", "a1", "a5", "a10", "a20", "a45"])

    You can then create a MassSpecReader instance from the dataframe just
    created and save it to a file::

        r = MassSpecReader(m)
        r.to_csv("test_align.csv")

    """
    def __init__(self, filenames=None, verbose=True, prefixes=None):
        """.. rubric:: constructor

        :param list filenames: list of filenames (readable by MassSpecReader).
        :param bool verbose: verbosity set on by default
        :param list prefixes: one prefix per filename, used to rename the
            measurement columns of each file. If not provided, the file
            names (without directory and extension) are used.
        :raises ValueError: if the number of prefixes differs from the
            number of filenames

        """
        super(MassSpecAlignmentYeast, self).__init__(filenames, verbose=verbose)
        self.mode = "YEAST"
        self.prefixes = prefixes

        if self.filenames:
            if prefixes is None:
                self.logging.warning("No prefixes provided. We will use the filename")
                prefixes = [os.path.splitext(os.path.split(f)[-1])[0] for f in self.filenames]
                # keep the attribute in sync with the prefixes actually used
                self.prefixes = prefixes
            if len(prefixes) != len(self.filenames):
                raise ValueError("Number of prefixes must match number of filenames")
            self._init_yeast(prefixes)

    def _read_yeast_csv(self, filename, tag):
        # Use an external reader that will gather all kind of conventions
        self.ms_reader.read_csv(filename, sep=",")
        # if you merge the single file, it may not work. to be tested
        self.ms_reader.rename_psites()
        self.ms_reader.set_zero_to_na()
        # renaming columns
        self.ms_reader._rename_measurements(tag)
        return self.ms_reader.df

    def _init_yeast(self, prefixes):
        # Read every file with its own prefix, then merge them all.
        # The former ``self.filenames > 1`` guard compared a list with an
        # int (TypeError on Python 3) and was not needed: iterating over a
        # single file simply yields one dataframe.
        self._dfs = []
        for filename, prefix in zip(self.filenames, prefixes):
            # if we set names in read_csv, floats are converted to strings.
            # so we set the column names afterwards.
            self._dfs.append(self._read_yeast_csv(filename, tag=prefix))
        self.df = self.merge(self._dfs).copy()