# -*- python -*-
#
# This file is part of MS-DAS software
#
# Copyright (c) 2014 - EBI-EMBL
#
# File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>, Marti Bernardo Faura
# bernardo@ebi.ac.uk
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# http://www.gnu.org/licenses/gpl-3.0.html
#
#
##############################################################################
"""Mass Spectrometry Readers (raw data)"""
import os
import pandas as pd
from easydev import Logging
import numpy as np
from msdas.psites import PSites
from msdas.tools import SequenceTools, Modification
import pylab
__all__ = ["MassSpecReader", "Cleaner"]
[docs]class Cleaner(object):
"""Tools inherited by :class:`msdas.readers.MassSpecReader` to perform some
tasks on grouping the peptides. Allows also some manipulation on the NAs.
.. note:: In principle you do not need to use this class, which is
available via :class:`msdas.readers.MassSpecReader`. However, it can be
useful as a standalone class.
Requires the dataframe stored in :attr:`df` to be populated with a dataframe
similar to those used in :class:`msdas.readers.MassSpecReader`.
.. rubric:: handling NA
Here, we create a simple dataframe with two columns. The last row contains
1 NA and 1 zero.::
from msdas import Cleaner
import pandas as pd
c = Cleaner()
c.df = pd.DataFrame({'A':[1,2,3,np.nan], 'B':[4,5,6,0]})
In Mass Spectrometry, zeros have no meaning. There is always some
measurements of abundance being made. Zeros are sometimes used to represent a
NA. The :class:`Cleaner` class has a method :meth:`set_zero_to_na` to replace all
zeros by NAs::
>>> c.set_zero_to_na()
>>> c.df
A B
0 1 4
1 2 5
2 3 6
3 NaN NaN
Rows that contains all NAs are useless and one may want to remove them::
c.drop_na_count(2)
This statement removes all rows that contain 2 NAs. You can get the count
of all NAs through the entire dataframe with the method :meth:`get_na_count`.
"""
def __init__(self):
self.df = None
[docs] def drop_oxidation(self):
"""Drop rows where the Sequence_Phospho contains the Oxidation string
That the the :meth:`merge_identical_peptide_sequence` renames
some sequences and add the "MERGED" prefix. Those rows are not to be removed
since they contains measurements that correspond to a peptide with an oxidation
but also possible another peptide without oxidation. There is no way currently
to know that. So the MERGED peptide are ignored. Ideally, :meth:`drop_oxidation`
should be called before any merging.
"""
indices = self.df.Sequence_Phospho.apply(lambda x: "Oxidation" in x and "MERGED" not in x)
try:
self.logging.warning("Removing {} rows that contain Oxidation string".format(sum(indices)))
except:
print("Removing {} rows that contain Oxidation string".format(sum(indices)))
indices = self.df.Sequence_Phospho.apply(lambda x: "Oxidation" not in x and "MERGED" not in x)
self.df = self.df[indices]
[docs] def set_zero_to_na(self):
"""Replace all zeros with NA
If you are manipulating raw mass spec data, a zero most probably means
NA. Indeed, each measurement is the ion current measured, that is greater
than zero. So, if you want to replace zeros by NA, just call this method.
"""
self.df = self.df.where(self.df!=0, None)
[docs] def get_na_count(self):
"""Return vector with number of NA per row (psite/protein)"""
return len(self.df.columns) - pd.notnull(self.df).sum(axis=1)
[docs] def drop_na_count(self, N=100000, inplace=True):
"""removes rows with at least N NA values
A row may have replicates. In the YEAST data case, we have 3 replicates
and 36 experiments; setting N=54 means removes
all rows for which half of the measurements are NAs.
Here below, we use :class:`msdas.replicates.ReplicatesYeast`, which
inherit from :class:`MassSpecReader` and therefore from :class:`Cleaner`.
Consequently, we can simplify the data by calling :meth:`drop_oxidation`
and :meth:`merge_identical_peptides_sequence`. Then, we drop
rows that have 108 NAs and plot the number of remaining rows. Then, we drop
rows that have 2 NAs and so on until there is no more data. The plot shows
the size of the dataframe as a function of the maximal number of NAs that
is acceptable.
.. plot::
:include-source:
:width: 80%
>>> import pylab
>>> from msdas import ReplicatesYeast, get_yeast_raw_data
>>> r = ReplicatesYeast(get_yeast_raw_data())
>>> r.drop_oxidation()
>>> r.merge_identical_peptide_sequence()
>>> size = []
>>> for n in range(108,-1,-1):
... r.drop_na_count(n)
... size.append(r.df.shape[0])
>>> pylab.plot(size, 'o-')
>>> pylab.grid()
>>> pylab.xlabel("Max number of NA per row")
>>> pylab.ylabel("Number of rows")
"""
data = self.get_na_count()
try:
self.logging.info("Removing {} rows where number of NA is larger than {}".format(len(data[data>=N]), N))
except:
print("Removing {} rows where number of NA is larger than {}".format(len(data[data>=N]), N))
if inplace:
self.df.drop(data[data>=N].index, inplace=inplace)
else:
return self.df.drop(data[data>N].index, inplace=False)
# TODO rename into merge_same_psites
[docs] def merge_peptides_with_same_psites(self):
"""Merge rows that are different peptides but same psites
in other words same psites are presumably peptides including in each other.
Rows may have same protein and psite but different peptides. This
may happen when an enzyme cuts the peptide at different places. Consider
for instance this case::
KVASMNSASLQDEAEPYDS(Phospho)DEAISK,S49,TOP1,
VASMNSASLQDEAEPYDS(Phospho)DEAISK,S49,TOP1
In such cases, we add the data measurements and replace
the 2 rows by a single one. The Peptide stored is the first one (arbitrary choice).
Actually, you may also have the Oxidation present, which may be 2 different rows but
there are added as expected::
DS(Phospho)SLLFSPAAVAM(Oxidation)R S515
DS(Phospho)SLLFSPAAVAMR S515
In this case, the 2 rows are added.
Identifier are unchanged
.. note:: if two peptides are identical, there are also taken into account here and being added.
.. seealso:: :class:`msdas.psites.PSites`
"""
try:
self.logging.info("Merging ambiguous peptides (different peptides but same psites")
except:
print("Merging ambiguous peptides (different peptides but same psites")
tobeadded = []
g = self.df.groupby(["Protein", "Psite"])
groups = g.groups
count = 0
count_group = 0
S = g.aggregate(sum)
try:
self.logging.info("Merging {} rows into {} groups.".format(len(self.df), len(groups)))
except:
pass
todrop = []
for k,v in groups.iteritems():
if len(v)>1:
# add the measurements in the group and replace the entire
# group by this new data
group = self.df.ix[groups[k]]
# this does not work : newrow = group.sum() because some
# columns contains strings. Consequently, the entire row is
# casted into strings. So, we need to use apply
newrow = S.ix[k]
metadata = group.ix[v[0]].copy()
newrow['Psite'] = metadata['Psite']
newrow['Sequence'] = metadata['Sequence']
newrow['Protein'] = metadata['Protein']
newrow['Sequence_Phospho'] = metadata['Sequence_Phospho']
newrow['Identifier'] = metadata['Identifier']
newrow['Entry'] = metadata['Entry']
newrow['Entry_name'] = metadata['Entry_name']
tobeadded.append(newrow)
todrop.extend(v)
count += len(v)
count_group += 1
self.df.drop(todrop, inplace=True)
if len(tobeadded):
self.df = self.df.append(tobeadded, ignore_index=True)
try:
self.logging.info("Merged {} groups (with more than 1 peptide) into {} new rows.".format(count, len(tobeadded)))
self.logging.info("New data frame has {} rows".format(len(self.df)))
except:
print("Merged {} groups (with more than 1 peptide) into {} new rows.".format(count, len(tobeadded)))
print("New data frame has {} rows".format(len(self.df)))
self._rebuild_identifier()
[docs] def merge_identical_peptide_sequence(self):
"""Merge (by adding) rows that have same protein and peptide sequence
They may have different psite locations but their number must agree.
A new name for the psite has to be created to reflect what addition
was performed. Instead of concatenating the psites, we factorise if
possible. For instance, consider the simple case with no common location
but a single location::
DVSQIT(Phospho)SSPK T44
DVSQITSS(Phospho)PK S46
This is a simple OR case and the new name is **T44+S46**. Let us now consider a
case with 2 psites, one being common::
TRS(Phospho)S(Phospho)GTNNKDVSQITSSPK S32^S33
TRSS(Phospho)GT(Phospho)NNKDVSQITSSPK S33^T35
S33 is common to two sequence, so the new psite name is ::
S32+T35^S33
and so on.
.. seealso:: :class:`msdas.psites.PSites`
"""
try:
self.logging.info("Merging identical peptides (but with psites at different locations")
except:
print("Merging identical peptides (but with psites at different locations")
tobeadded = []
# parentheses in (Phospho) are required to not clash with prefix added to the
# sequence_original when there ambiguities
self.df['count_phospho'] = self.df.Sequence_Phospho.apply(lambda x: x.count("(Phospho)"))
# ignore sequence without peptide (if already result of a merge)
df = self.df[self.df['count_phospho']>0]
g = df.groupby(["Protein", "Sequence", "count_phospho"])
groups = g.groups
try:
self.logging.info("grouping in progress")
except:
print("grouping in progress")
count_group = 0
count = 0
pfactory = PSites(verbose=self.level)
todrop = []
S = g.aggregate(sum) # compute some before the loop. This is much 2,3 times faster than a apply function
self.logging.info("Merging {} rows into {} groups.".format(len(self.df), len(g.groups)))
#print("##############")
#print(len(groups))
NN = len(groups)
count = 0
for k, indices in groups.iteritems():
#print(k, count)
count += 1
# if there is only one row for a given protein/sequence/number of phosphos,
# then there is nothing to do otherwise, there are several cases as explained
# below
if len(indices)<=1:
continue
# S142^S150
# S142^S151
#
# in this case, S142 is not ambiguous and final psite could be named S142^150+S151
# note that there is no paraenthese. Yet, it means S142^S150 OR S142^S151
#
# k[2] conains the counting of phosphos
#
# TODO those 3 lines takes 3 seconds ...
group = self.df.ix[groups[k]]
Npsite = pfactory.remove_duplicated("^".join(list(self.df.ix[indices].Psite)))
Npsite = len(Npsite.split("^"))
if k[2] == 1:
psite_name = "+".join(self.df.ix[indices].Psite)
#psite_name = pfactory.sorted(psite_name.split("^"))
# the apply/sum function is slow...
newrow = S.ix[k].copy()
#newrow = group.apply(lambda x: x.sum())
metadata = group.ix[indices[0]]
newrow['Psite'] = psite_name
newrow['Sequence'] = metadata['Sequence']
newrow['Protein'] = metadata['Protein']
try:
newrow['Identifier'] = metadata['Identifier']
newrow['Entry'] = metadata['Entry']
newrow['Entry_name'] = metadata['Entry_name']
except Exception:
raise Exception
newrow['Sequence_Phospho'] = 'MERGED_{}Phosphos_{}locations_'.format(k[2], Npsite) + newrow['Sequence']
newrow['count_phospho'] = k[2]
todrop.extend(indices)
tobeadded.append(newrow)
count += len(indices)
count_group += 1
else:
psites = list(self.df.ix[indices].Psite)
common_psites = pfactory.get_common_psites(psites)
if len(common_psites) == k[2]-1:
psite_name = pfactory.get_factorised_psites(psites)
else:
# concatenate with +
psite_name = "+".join(sorted(psites))
newrow = S.ix[k].copy()
#newrow = group.apply(lambda x: x.sum())
newrow['Psite'] = psite_name
metadata = group.ix[indices[0]]
newrow['Sequence'] = metadata['Sequence']
newrow['Protein'] = metadata['Protein']
try:
newrow['Identifier'] = metadata['Identifier']
newrow['Entry'] = metadata['Entry']
newrow['Entry_name'] = metadata['Entry_name']
except Exception:
raise Exception
newrow['Sequence_Phospho'] = 'MERGED_{}Phosphos_{}locations_'.format(k[2], Npsite) + newrow['Sequence']
newrow['count_phospho'] = k[2]
todrop.extend(indices)
#self.df.drop(indices, inplace=True)
tobeadded.append(newrow)
count += len(indices)
count_group += 1
self.df.drop(todrop, inplace=True)
if len(tobeadded):
self.df = self.df.append(tobeadded, ignore_index=True)
try:
self.logging.info("Merged {} groups (with more than 1 peptide) into {} new rows.".format(count, len(tobeadded)))
except:
print("Merged {} groups (with more than 1 peptide) into {} new rows.".format(count, len(tobeadded)))
del self.df["count_phospho"]
self._rebuild_identifier()
[docs] def fix_duplicated_identifier(self):
"""Make sure identifierse are unique
Some identifiers may be duplicates (same protein, same psites
but different peptides) e.g., ::
Protein Sequence Psite Identifier
DIG2 LSQKEEDHSGKPPTITTSPAEK T83+S84 DIG2_T83+S84
DIG2 EEDHSGKPPTITTSPAEK T83+S84 DIG2_T83+S84
In this case, we append an additional id, which is an integer
starting from 1. First row found has the id 1.::
Protein Sequence Psite Identifier
DIG2 LSQKEEDHSGKPPTITTSPAEK T83+S84 DIG2_T83+S84_1
DIG2 EEDHSGKPPTITTSPAEK T83+S84 DIG2_T83+S84_2
"""
groups = self.df.groupby("Identifier").groups
for k, indices in groups.iteritems():
if len(indices) > 1:
identifiers = list(self.df.ix[indices,'Identifier'])
identifiers = [identifier + "_" + str(i+1) for i,identifier in enumerate(identifiers)]
self.df.ix[indices,'Identifier'] = identifiers
[docs]class MassSpecReader(Logging, SequenceTools, Cleaner):
"""Read a simple Mass Spec data CSV file
First, the MassSpecReader reads a CSV file and stores it in the
:attr:`df` attribute as a dataframe. At this stage, there is no
manipulation except from the fact that columns found in the header are
cleanup (e.g., spaces are removed).
Second, the header is interpreted. The User Guide provides a draft
convention of how the header should look like. It should contains the
following columns:
#. **Protein**: a valid uniprot protein name (e.g., DIG1)
#. **Sequence**: a peptide sequence without annotations. If not provided,
Sequence_Phospho must be provided
#. **Sequence_Phospho**: a sequence with the phsphorylation position tagged
as "(Phospho)" e.g., AVS(Phospho)KK means there was a Phophorylation
on S at position 3 and possibly oxidation as (Oxidation)
#. **Psite**: The absolute position of the phosphorylations. For instance S132^S140
and possibly:
#. **Entry_name**: a valid uniprot entry name (e.g., DIG1_YEAST)
#. **Entry**: a valid uniprot entry (e.g., P23321)
#. **Identifier**: concatenation of Protein and Psite. Maybe overwritten if
:meth:`_rebuild_identifier` is called
If you have another name (e.g., PeptideSequence instead of Sequence_Phospho),
you can populate the :attr:`header_mapping` attribute.
If "Accession" and "Modifications" are found in the header, the accession
column will be expected to contain these kind of strings (FASTA header)::
sp|Q8IYB3-2|SRRM1_HUMAN
from which the Protein column can be extract. Besides, the Modifications
will contain information that could be used to
extract the Psite positions. Modifications entries look like::
[5] S+7777| [1] T+5555
Once the data is intrepreted. Protein, Sequence and Psite should be available.
Finally, a third step performed by this class is to performs some cleanup.
See :meth:`cleanup` for details.
.. warning:: Psites are separated by the ^ character (AND). If there is
an ambiguity, the + sign can be used as a logical OR.
The creation of a MassSpecReader is quite versatile. You can read a CSV
file as described above, or provide an instance of MassSpecReader, or an object
that contains an attribute :attr:`df`, which is compatible.
::
# Let us create a dummy dataframe
import pandas as pd
df = pd.DataFrame({
'Psite': ['S1+S2', 'T3'],
'Sequence_Phospho': ["S(Phospho)S(Phospho)TTT", "SST(Phospho)TT"],
'Protein': ['DIG1', 'DIG2'],
'X1':[1,2],
'X2':[5,2],
'X3':[4,5],
'dummy'[1,2]
}
from msdas import *
r = readers.MassSpecReader(df)
There are some sanity checks performed. In the example above, the case Psite, S1+S2
is acutally ambiguous: it means that there a psite on S1 OR S2 (i.e., there is one psite).
Yet the sequence contains 2 phophoosites. So, there is a typo. If you check the content of
**r.df** you would see only 1 row. Let us fix it
::
>>> df['Psite'] = ['S1^S2', 'T3']
>>> r = readers.MassSpecReader(df)
>>> len(r.df)
2
.. seealso:: the User Guide and notebooks provided in the MSDAS documentation
"""
#: symbol used to encode psites (AND case)
AND = "^"
#: symbol used to encode psites (OR case)
OR = "+"
header_mapping = {
'Protein': ['Standard name', 'ProteinName'],
'Psite': ['Phosphosites'],
'Sequence_Phospho': ['Peptide sequence', 'PeptideSequence']
}
valid_header = ["Identifier", "Protein", "Sequence", "Sequence_Phospho",
"Psite", "Entry", "Entry_name"]
def __init__(self, data=None, mode="default", verbose=True, quotechar="'", merge_peptides=False,
index_col=None, sep=",",cleanup=True):
""".. rubric:: constructor
:param str data: Can be (i) None, (ii) a valid filename with the appropiate format (CSV), (iii)
an instance of MassSpecReader itself, (iv) a dataframe with the correct column names (v)
any object that contains an attribute **df** that is a dataframe with the correct column names.
:param bool verbose: Set to True/False or valid string (DEBUG,INFO,WARNING,ERROR)
:param quotechar: Ignore the quotechar argument if found in the CSV file.
Defaults to ' character. For Tcell data, set to " since
the CSV file contains " characters.
:param bool merge_peptides: calls :meth:`merge_peptides` is True (default to False)
:param int index_col: may be required to read some CSV files
:param str sep: the seprator to be used when reading the CSV file
:param cleanup: calls :meth:`cleanup` method (default to True)
"""
super(MassSpecReader, self).__init__(level=verbose)
self._quotechar = quotechar
self._index_col = index_col
self._sep = sep
self._info = {}
self._merge_peptides = merge_peptides
self.removed = {}
# must be set to zero now
self._mode = None
self._df = None
if isinstance(data, MassSpecReader):
self.df = data.df.copy()
self.mode = data.mode
elif isinstance(data, pd.DataFrame):
self.df = data.copy()
data = True # to force the cleaup and avoid co,paring dataframe with None
self.mode = mode
elif hasattr(data,"df"):
self.df = data.df.copy()
self.mode = mode
# comparison with None should be after dataframe case
elif data == None:
self.mode = mode
else:
# mode must be set now, before calling read_csv
self.mode = mode
if data != None and os.path.exists(data) == False:
raise IOError("The file {} does not exist".format(data))
if data!= None:
self.read_csv(data, quotechar=self._quotechar,
index_col=self._index_col, sep=self._sep)
# if this is failing, we want to raise an error. no try/except
if cleanup == True and data!=None:
self.cleanup()
self._metadata_names = self.valid_header[:]
def _set_df(self, df):
self._df = df
self._interpret_header()
def _get_df(self):
return self._df
df = property(_get_df, _set_df, doc="getter/setter for the main dataframe")
def _get_N(self):
return len(self.measurements.columns)
N = property(_get_N, doc="Return number of measurements.")
def _get_metadata(self):
return self.df[[x for x in self._metadata_names if x in self.df.columns]]
metadata = property(_get_metadata,
doc="""read-only attribute. Returns subset of the :attr:`df` attribute, which contains
metadata (not measurements). The subset is selected based on the :attr:`_metadata_names`
attribute, which by default contains Sequence, Protein, Identifier, Entry, Entry_name,...
look at `_metadata_names` for up-to-date list of column names""")
def _get_measurements(self):
col = [x for x in self.df.columns if x not in self._metadata_names]
return self.df[col]
measurements = property(_get_measurements,
doc="getter to subset of the dataframe :attr:`df` that is not in :attr:`metadata`")
def _get_mode(self):
if self._mode == None:
# try to figure out
self.logging.warning("mode not provided. Trying to figure out")
if "Entry_name" in self.df.columns:
mode = self.df.Entry_name.ix[0].split("_")[1]
self.mode = mode
else:
self.logging.warning("Could not figure out. please provide YEAST")
return self._mode
def _set_mode(self, mode):
if mode == None:
raise ValueError("mode cannot be None")
assert mode.upper() in ["DEFAULT", "YEAST"], "mode can be yeast "
if mode.upper() == "DEFAULT":
mode = "YEAST"
self._mode = mode.upper()
if self._mode == "YEAST":
self._quotechar = "\'"
mode = property(_get_mode, _set_mode,
doc="mode of the Merger (e.g., yeast)")
def _set_sequence(self):
self.debug("set_sequence starting")
if "Sequence_Phospho" not in self.df.columns:
return
tags = ["(Phospho)", "(Oxidation)"]
if "Sequence" in self.df.columns:
self.df.loc[:,'Sequence'] = list(self.df['Sequence_Phospho'].values)
else:
self.df['Sequence'] = list(self.df['Sequence_Phospho'].values)
for index in self.df.index:
seq = self.df.ix[index, 'Sequence']
for tag in tags:
seq = seq.replace(tag, "")
if "(" in seq:
raise ValueError("Found invalid pattern in the sequence: %s" % seq)
self.df.ix[index, 'Sequence'] = seq
self.debug("set_sequence done")
def _get_sequences(self):
return self.df.Sequence
sequences = property(_get_sequences, doc="returns list of Sequences found in the DataFrame")
def _get_psites(self):
return self.df.Psite
psites = property(_get_psites, doc="returns list of Psites found in the DataFrame")
def _rebuild_identifier(self):
self.logging.warning("Rebuilding identifier in the dataframe. MERGED prefixes will be lost")
self.merged = self.df.Sequence_Phospho.apply(lambda x: "MERGED" in x)
if "Identifier" in self.df.columns:
del self.df['Identifier']
if "Identifier" not in self.df.columns:
self.df['Identifier'] = self.df.Protein + "_" + self.df.Psite
#for k,v in self.merged.iterkv():
# if v == True:
# self.df.loc[k,'Identifier'] = "MERGED_" + self.df.loc[k,'Identifier']
if len(set(self.df.Identifier)) != len(self.df.Identifier):
self.logging.warning("Identifiers are not unique. Have you called merge_peptides() ?")
def _check_sequence_name_header(self):
if "Sequence_Phospho" not in self.df.columns and "Sequence" not in self.df.columns:
raise ValueError("Expecting either Sequence of Sequence_Phospho in the header")
# maybe sequence columns is not named properly:
if "Sequence_Phospho" not in self.df.columns:
if sum(self.df.Sequence.apply(lambda x:"Phospho" in x)) == 0:
self._build_sequence_phospho()
else:
self.logging.warning("Some Phospho strings found in Sequence column. No Sequence_Phospho column found.Renaming Sequence into Sequence_Phospho")
self.df.columns = [x if x!="Sequence" else "Sequence_Phospho" for x in self.df.columns]
# if sequence_phospho privded, all sequence must have at least one phospho
if "Sequence_Phospho" in self.df.columns:
if sum(self.df.Sequence_Phospho.apply(lambda x:"Phospho" in x)) != len(self.df):
self.logging.warning("Some Sequence in Sequence_Phospho column do not have nay phosphos.")
def _build_sequence_phospho(self):
"""
.. warning:: not implemented, for now just a copy of Sequence into Sequence_Phospho,
but should be rebuilt from the protein Sequence and psites location.
"""
self.df['Sequence_Phospho'] = self.df.Sequence
def _interpret_header(self):
"""Interpret the header of the CSV file.
#. Check that the Sequence_Phospho column and/or Sequence column are available
#. Check that the expected column from :attr:`header_mapping` are present
#. Set Sequence_Phospho and Sequence properly if this is not the case (see :meth:`_set_sequence`)
#. Finally check that the header is valid
"""
self._check_sequence_name_header()
columns = self.df.columns
# rename columns if needed
for valid, invalids in self.header_mapping.iteritems():
for invalid in invalids:
if valid not in self.df.columns and invalid in columns:
txt = "Column *{}* not found. Please use the correct header. Trying to replace possible match called **{}**"
self.logging.warning(txt.format(valid, invalid))
columns = [valid if c==invalid else c for c in columns]
self.df.columns = columns
self._set_sequence()
self._check_header()
def _check_header(self):
assert "Protein" in self.df.columns, "Protein must be found in header"
assert "Psite" in self.df.columns, "Psite must be found in header"
assert "Sequence_Phospho" in self.df.columns or "Sequence" in self.df.columns, \
"either Sequence or Sequence_Phospho must be provided"
[docs] def boxplot(self, index=None, logy=True):
"""A simple boxplot of the data using logy
:param index: you can rearrange the ordering of the axis column or select a subset
by providing a list of indices.
:param logy: set y-axis with log scale
.. plot::
from msdas import *
m = MassSpecReader(get_yeast_small_data())
m.boxplot()
"""
if index == None:
self.df.boxplot(rot=90, return_type='axes')
else:
self.df[index].boxplot(rot=90, return_type='axes')
if logy:
pylab.semilogy()
[docs] def cleanup(self):
"""Call functions that clean up the dataframe
Here are the steps followed:
#. set zeros to NA :meth:`msdas.readers.Cleaner.set_zero_to_na`
#. Rename psites using MSDAS convention (see :meth:`rename_psites`).
Here just get rid of spaces
#. calls :meth:`remove_ambiguities`
#. calls :meth:`merge_peptides` if :attr:`_merge_peptides` is True
#. rebuild the identifier based on Protein and Psite columns.
"""
#self.debugLevel = "ERROR"
if self.df is None:
self.logging.warning("No data loaded yet. Please populate the attribute *df*, or call read_csv")
# psites seprated by spaces are renamed so that there are now seprated
# by the ^ (and ) character
# oxidation psite are also interpreted.
self.logging.info("Renaming psites with ^ character")
self.rename_psites()
# get rid of zeros in the data
# should be used for raw data, not normalised so should be called by user
self.logging.info("Replacing zeros with NAs")
self.set_zero_to_na()
# see documentatoion of the method itself
self.remove_ambiguities()
if self._merge_peptides:
self.merge_peptides()
self._rebuild_identifier()
[docs] def merge_peptides(self):
"""Merge data using :class:`Cleaner` class
#. Merge peptide with same psites. See :meth:`~msdas.readers.Cleaner.merge_peptide_with_same_psites`
#. Merge identical peptide sequences. See :meth:`~msdas.readers.Cleaner.merge_identical_peptide_sequence`
Note that in the yeast case, we use only the merge_identical_peptide_sequence
"""
self.merge_peptides_with_same_psites()
self.merge_identical_peptide_sequence()
[docs] def read_csv(self, filename=None, quotechar=None, index_col=None, sep=None):
"""Read a CSV file file Mass Spec data
:param str filename: if not provided, uses the class attribute
:attr:`filename`
This function is used by the constructor of MassSpecReader if a filename
is provided. Yet, you can either create an empty instance and read the file
later on. If you read the file later, the difference with the constructor is
that by default the :meth:`cleanup` function is called. So these two codes
are equivalent only if you call :meth:`cleanup`::
from msdas import *
m1 = MassSpecReader(get_yeast_small_data())
::
from msdas import *
m2 = MassSpecReader()
m2.read_csv(get_yeast_small_data())
m2.cleanup()
You can then check that the 2 files are equivalent::
>>> m1 == m2
True
"""
self.level = self.level
if self.mode is None:
raise ValueError("mode is not set. mode must be provided as YEAST")
self.logging.info("Reading %s" % filename)
# Read the data. Must work
if quotechar == None:
quotechar = self._quotechar
#print index_col, sep, quotechar, self.mode
#sep=None
df = pd.read_csv(filename, quotechar=quotechar, index_col=index_col, sep=sep, engine='python')
df.dropna(how="all", inplace=True) # ignore empty lines
# replaces None by NA
import numpy as np
df.fillna(np.NaN, inplace=True)
# spaces need to be removed in the header
df.columns = [col.strip() for col in df.columns]
self.df = df
self.rawdf = self.df.copy() # for bookkeeping
# and columns that are made of strings should also be stripped
def clean_func(x):
try:
return x.strip()
except:
return x
for col in df.columns:
self.df[col] = self.df[col].map(clean_func)
self._interpret_header()
[docs] def remove_ambiguities(self):
"""
#. Remove rows where psites are ambiguous or could not be interpreted.
See :meth:`remove_ambiguous_psites`
#. Remove rows where number of proteins is > 2 :meth:`remove_ambiguous_proteins`
"""
# remove psites where number of psites does not match phospho+oxidation
# rename_psites (append_oxidation) must be called before otherwise all
# sequences with oxidation are removed. converns 203 rows in yeast
self.remove_ambiguous_psites()
# not redundant with drop above. remove rows where protein names are > 1
self.remove_ambiguous_proteins()
[docs] def rename_psites(self):
"""Rename Psites using AND/OR convention.
To store the Psite column in the header, we use the following conventions:
* individual psite are separated by AND using the **^** character
* ambiguous psite when position is not certain are separated by OR that is **+** character
See :class:`msdas.psites.PSites` class documentation for details.
Here, the CSV file may contain Psites stored with spaces, which are considered
as ANDs. So, we first remove spaces, replace them by ANDs. There are also
extra ; characters that are ignored. If there are duplicated psites, which occurs
in the raw data set (yeast case), we remove them. Finally, in the yeast case,
some sequence contain the (Oxidation) string but the corresponding Psite is not
provided. This function extract this information and stores it in the Psite string.
Psites are also sorted by position e.g., S5^S3 is renamed as S3^S5.
"""
ptools = PSites(verbose=self.level) # important to provide level
# removes spaces
psites = [ptools.remove_spaces(p) for p in self.df.Psite]
# add AND character
psites = [self.AND.join(psite.split()) for psite in psites]
# what to do with ; character? assume an AND for now
psites = [x.replace(";", self.AND) for x in psites]
psites = [ptools.remove_duplicated(x) for x in psites]
self.df['Psite'] = psites
[docs] def remove_ambiguous_proteins(self):
"""Remove rows where proteins are not uniquely defined
Some rows may have the protein field incorrectly defined with several
protein names, in which case the row is discarded by this function
"""
# Removes all rows where we cannot understand what is going on.
# e.g. protein or sites names are not unique
toremove = self.df.index[self.df.Protein.apply(lambda x: " " in x)]
self.logging.info("-- Removing {} rows with ambigous protein names:".format(len(toremove)))
self._removed_ambiguous_proteins = toremove[:]
#FIXME: here we use self.df.ix whereas in remove_ambigous_psites, we use self.df directly why?
self.removed['ambiguous_proteins'] = self.df.ix[self._removed_ambiguous_proteins].copy()
self._ambiguous_proteins_df = self.df.ix[toremove==True][['Protein', 'Sequence', 'Psite', 'Sequence_Phospho']].copy()
for index in toremove:
self.logging.debug(" {}:{}".format(index+1,self.df.Protein[index]))
self.logging.info("--------------------------------------------------")
self._info["N_unknown"] = len(toremove)
self.df.drop(toremove, inplace=True)
[docs] def remove_ambiguous_psites(self):
"""Removes Psites where number of Psite does not match content of Sequence
Some sequences are different number of phospho as compared to the Psites
columns. In such case, we remove the rows. The field used for the sequence
is the Sequence_Phospho where psites are encoded with the (Phsospho) string
::
ALY1, S118^S216, TPLPSSS(Phospho)R
.. note:: Oxidation are ignored
"""
if "Sequence_Phospho" not in self.df.columns:
return
# count number of phospho
# IF MERGED, should be skipped
l3 = self.df.Sequence_Phospho.apply(lambda x: "MERGED" not in x)
count_phosphos = self.df.Sequence_Phospho.apply(lambda x: x.count("Phospho"))
#count_oxis = self.df.Sequence_original.apply(lambda x: x.count("Oxidation"))
count_psites = self.df.Psite.apply(lambda x: len(x.split(self.AND)))
#toremove = count_phosphos+count_oxis != count_psites
toremove = (count_phosphos != count_psites) & l3
self._removed_ambiguous_psites = toremove[:]
self.removed['ambiguous_psites'] = self.df.ix[self._removed_ambiguous_psites].copy()
if sum(toremove) > 0:
self._ambiguous_psites_df = self.df.ix[toremove==True][['Protein', 'Sequence', 'Psite', 'Sequence_Phospho']].copy()
self.logging.info("-- %s rows have ambiguous psites and are removed" % sum(toremove))
self.logging.info("save data in attribute _ambiguous_psites_df")
self.df.drop(toremove.index[toremove==True], inplace=True)
self.logging.info("--------------------------------------------------")
#self._info["N_ambiguous_psites"] = sum(toremove==True)
def _rename_measurements(self, tag):
"""Append prefix to all measurement columns
Measurements columns are those that starts with the letter "t" for now.
"""
oldcols = self.df.columns
columns = [tag+"_" + c if c.startswith("t") else c for c in oldcols]
self.df.columns = columns
[docs] def to_csv(self, filename):
"""Save dataframe into a CSV file
"""
#FIXME: why a copy ??
df = self.df.copy()
#df['Sequence'] = df['Sequence_Phospho']
#del df['Sequence_original']
df.to_csv(filename, index=False, sep=",")
[docs] def sort_psites_ors_only(self):
"""Sort the psites location in psites string that contains OR, only
e.g., S3+S1 is renamed as S1+S3 but S3+S1^S7 is unchanged.
"""
func = PSites(verbose=False).sort_psites_ors_only
self.df['Psite'] = self.df.Psite.apply(lambda x: func(x))
self._rebuild_identifier()
[docs] def plot_phospho_stats(self, **kargs):
"""Plots descriptive statistics about the Psites
The data used comes from the DataFrame.
There are 3 plots created:
#. pie charts of number of sites of each category (e.g., S, T, ..)
#. Histogram of number of phospho sites per protein
#. Histogram of number of phospho sites per peptides
.. plot::
:include-source:
:width: 45%
from msdas import *
filenames = get_yeast_filenames()
p1 = readers.MassSpecReader(filenames[0], mode="YEAST")
p1.plot_phospho_stats()
"""
self._global_count = {}
self._number_sites_per_peptide = []
self._number_sites_per_protein = {}
for sites in self.df.Psite:
# then split on "-" in case there are ambiguities and select the
# first letter in any case.
sites = [this[0] for this in sites.split(self.AND)]
for s in sites:
if s not in self._global_count.keys():
self._global_count[s] = 0
else:
self._global_count[s] += 1
# how many phospho sites per peptide ?
self._number_sites_per_peptide.append(len(sites))
pylab.figure(1)
pylab.clf()
pylab.pie(self._global_count.values(), labels=[k+" ("+str(v)+")" for k,v in
self._global_count.iteritems()])
pylab.title("Type of of phospho sites (over all peptides)")
pylab.figure(2)
pylab.clf()
pylab.hist(self._number_sites_per_peptide, [x+0.5 for x in
range(0,1+max(self._number_sites_per_peptide))])
pylab.title("Number of phospho sites per peptide")
pylab.grid()
g = self.df[['Protein', 'Psite']].groupby("Protein")
groups = g.Psite.groups
pfact = PSites()
for group in groups:
sites = [x for x in self.df.Psite[g.Psite.groups[group]]]
#sites = list(pylab.flatten([[y[0] for y in x.split("_")[1:]] for x in sites]))
# FIXME: if merge or not, the results will be different !
sites = pfact.get_unique_psites(sites)
self._number_sites_per_protein[group] = len(sites)
pylab.figure(3)
pylab.clf()
bins = [x+0.5 for x in
range(0,1+max(self._number_sites_per_protein.values()))]
pylab.hist(self._number_sites_per_protein.values(), bins)
#pylab.xticks(bins, )
pylab.title("Number of phospho sites per protein")
pylab.grid()
[docs] def hist_number_peptides(self):
"""Create a bar plot with number of peptides per protein
"""
groups = [(len(v),k) for k,v in self.df.groupby("Protein").groups.iteritems()]
l = sorted(groups)
l.reverse()
y = [this[0] for this in l]
xlabel = [this[1] for this in l]
pylab.clf()
pylab.bar(range(0,len(y)), y)
pylab.xticks([this+.5 for this in range(0,len(y))], xlabel, rotation=90)
for i in range(0,len(y)):
pylab.text(i+0.5, y[i]+.5, y[i])
pylab.ylabel("Number of peptide per protein")
pylab.ylim([0,max(y)+1])
[docs] def hist_peptide_sequence_length(self, drop_duplicates=True, **kargs):
"""Plot histogram of the peptide sequences length.
:param bool drop_duplicates: duplicated peptide sequence are dropped by default
Indeed, you may have two identical peptides with different psites. Since we are
interested in the peptide sequence, we drop duplicated ones.
:param kargs: optional keywords are passed to the dataframe.hist method.
.. plot::
>>> from msdas import *
>>> m = MassSpecReader(get_yeast_filenames()[0], mode="YEAST")
>>> m.hist_peptide_sequence_length()
"""
if drop_duplicates:
data = self.df.Sequence.drop_duplicates().apply(lambda x: len(x))
else:
data = self.df.Sequence.apply(lambda x: len(x))
data.hist(**kargs)
pylab.title("")
pylab.xlabel("Sequence length")
pylab.ylabel("#")
[docs] def pcolor(self, protein=None, tag="t", fillna=None, vmax=None, vmin=None,
cmap="hot_r", fontsize_yticks=8):
"""
:param str protein: a protein name. If not provided, plot all the data.
:param str tag: only column that starts with this tag are used
:param fillna: if na are found, there are plot as grey pixels. You can set
all NAs to a scalar if required.
:param float vmin: vmin parameters used by the matplotlib.pcolor function
:param float vmax: vmax parameters used by the matplotlib.pcolor function
:param cmap: a valid color map from matplotlib
.. plot::
from msdas import *
m = MassSpecReader(get_yeast_small_data(), verbose=False)
m.pcolor("DIG1", tag="a") # pick up measurements with name starting with "a" letter
"""
measures = [c for c in self.df.columns if c.startswith(tag)]
if len(measures)==0:
print("no column matches the provided tag (%s)"%tag)
return
pylab.clf()
if protein:
data = self[protein][measures]
psites = list(self.df.Psite[self[protein].index])
else:
data = self.df[measures]
psites = list(self.df.Psite)
proteins = list(self.df.Protein)
psites = [x+"_"+y for x,y in zip(proteins, psites)]
if vmax == None:
vmax = data.max().max()
if vmin == None:
vmin = data.min().min()
mask = np.ma.array(data.as_matrix(), mask=data.isnull())
cmap = pylab.get_cmap(cmap)
cmap.set_bad("grey", 1)
pylab.pcolormesh(mask, vmin=vmin, vmax=vmax, cmap=cmap)
pylab.colorbar()
pylab.xlabel("measurements")
pylab.ylabel("peptides")
pylab.xlim([0, len(measures)])
pylab.ylim([0, data.shape[0]])
pylab.yticks([x+0.5 for x in range(0,data.shape[0])], psites,
fontsize=fontsize_yticks)
if protein:
pylab.title(protein)
pylab.tight_layout()
return data
[docs] def read_annotations(self, filename):
"""Read annotations saved in a dataframe.
:param str filename: filename where annotations are saved (pickle file)
See :class:`msdas.annotations.Annotations`
This function is required to populate the annotatoins attribute used for example
by :meth:`show_sequence`
"""
self.annotations = pd.read_pickle(filename)
[docs] def status(self):
"""Some basic sanity checks on the dataframe that contains the data"""
# check header
for this in self.valid_header:
if this not in self.df.columns:
print("Could not find %s in the dataframe columns" % this)
# check Psites
import psites
pfactory = psites.PSites()
if "Psite" in self.df.columns:
for k,v in self.df.Psite.iteritems():
if pfactory.isvalid(v) == False:
print("Invalid psite (%s) at index %s" % (k,v))
print("There are maybe others. break for now the psite checking")
break
return True
[docs] def plot_timeseries(self, identifier, replicate=None, **kargs):
"""This is for the yeast case with replicates
:param str identifier:
:param replicates: None means use the first replicate (the one without prefix).
otherwise, provide the suffix corresponding to the replicate (see example below)
:param kargs: any paraemter accepted by the matplotlib.plot function
.. plot::
:include-source:
:width: 80%
from msdas import *
m = readers.MassSpecReader(get_yeast_small_data(), verbose=False)
m.plot_timeseries("DIG1_S142")
If you have replicates, only the first replicate is taken into account and
you therefore need to call plot_timeseries several time providing the
tag of the replicate. The first replicate has no tag. The second replicate
has a suffix (**.1**), the third replicates has the tag **.2** and so on
.. plot::
:include-source:
:width: 80%
from msdas import *
M = readers.MassSpecReader(get_yeast_raw_data(), verbose=False)
M.plot_timeseries("DIG1_S142", replicate=None, color="r")
M.plot_timeseries("DIG1_S142", replicate=1, color="g")
M.plot_timeseries("DIG1_S142", replicate=2, color="b")
"""
if replicate == None:
measures = [x for x in self.measurements if "." not in x]
else:
measures = [x for x in self.measurements if x.endswith("."+str(replicate))]
if len(measures)==0:
raise ValueError("No measurement data found that correspond to the replicate provided" )
try:
# for the replicates; get_mu_df is required
data = self.get_mu_df()[measures]
except:
data= self.df[measures]
indices = self.df[self.df.Identifier == identifier].index
data = data.ix[indices]
kargs['color'] = kargs.get("color", "r")
kargs['marker'] = kargs.get("marker", "o")
kargs['markersize'] = kargs.get("markersize", 10)
kargs['lw'] = kargs.get("lw", "2")
if len(indices)>1:
self.logging.warning("More than 1 row found. Consider calling merging_ambiguous_peptides method")
for index in indices:
pylab.plot(data.ix[index], '--', **kargs)
N = len(measures)
pylab.title(identifier)
pylab.xticks(range(0,N), measures, rotation=90)
pylab.grid()
return data
[docs] def get_data_matrix(self, identifier):
"""YEAST case with replicates
.. todo:: to be moved to yeast module
"""
alpha = ['a0', 'a1', 'a5', 'a10', 'a20', 'a45']
salt = ['t0', 't1', 't5', 't10', 't20', 't45']
try:
data = self.get_mu_df()
data = data[self.df.Identifier == identifier]
if len(data)>1:
self.logging.error("Found more than 1 row matching the psite")
print('a')
except:
data = self.df[self.df.Identifier == identifier]
exps = np.zeros((6,6))
for i,a in enumerate(alpha):
for j,s in enumerate(salt):
exps[i,j] = data[a+"_"+s]
exps = pd.DataFrame(exps)
#TODO : is this the correct order
exps.index = alpha
exps.column = salt
return exps
[docs] def plot_experiments(self, identifier, cmap="hot_r", vmin=None,vmax=None):
"""
should be for yeast only. Put inside a YEAST class ?
if not merged, you could end up with several rows.
may fail in such case...
"""
# fig = plt.figure()
#ax = fig.gca(projection='3d')
#surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm,linewidth=0, antialiased=False)
#ax.zaxis.set_major_locator(LinearLocator(10))
#ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
#fig.colorbar(surf, shrink=0.5, aspect=5)
self.logging.warning("Works with yeast data set only")
exps = self.get_data_matrix(identifier)
mask = np.ma.array(exps, mask=np.isnan(exps))
pylab.clf()
if vmax == None:
vmax = np.nanmax(exps)
if vmin == None:
vmin = np.nanmin(exps)
cmap = pylab.get_cmap(cmap)
cmap.set_bad("grey", 1)
pylab.pcolormesh(pylab.flipud(mask), vmin=vmin, vmax=vmax, cmap=cmap)
alpha = ['a0', 'a1', 'a5', 'a10', 'a20', 'a45']
salt = ['t0', 't1', 't5', 't10', 't20', 't45']
pylab.yticks([x+0.5 for x in range(0, 6)], alpha[::-1])
pylab.xticks([x+0.5 for x in range(0, 6)], salt)
pylab.colorbar()
pylab.title(identifier)
pylab.ylabel("Time after alpha-factor stimulation")
pylab.xlabel("Time after NaCl stimulation")
return exps
[docs] def show_sequence(self, protein, N=40, vmax=None,cmap="hot_r", fontsize=10):
"""Show peptide and phosphorylation distribution on a sequence
.. plot::
:include-source:
import os
from msdas import *
r = readers.MassSpecReader(get_yeast_raw_data())
if os.path.exists("YEAST_annotations.pkl")==False:
a = annotations.Annotations(r, "YEAST")
a.get_uniprot_entries()
a.set_annotations()
a.to_pickle()
r.read_annotations("YEAST_annotations.pkl")
r.show_sequence("DIG1",fontsize=8)
.. warning:: once merged, some phospho are tagged ith merged name so we use the colum Sequence (not Sequence_Phospho)
"""
import textwrap
self.warning("Requires annotations")
indices = self.metadata.groupby("Protein").groups[protein]
psites = self.metadata.Psite.ix[indices]
peptide_sequences = self.metadata.Sequence.ix[indices]
entry = self.metadata.Entry.ix[indices[0]]
sequence = self.annotations.ix[entry].Sequence
psites_location = set([w[1:] for x in psites for z in x.split("^") for w in z.split("+")])
psites_location = [int(x) for x in psites_location]
# build the matrix to plot. The matrix will have the following length and text
text = textwrap.wrap(sequence, N)
ncol = N
nrow = len(text)
# but for now, let us work with the data as an array. We will want a matrix,
# so let us complete the sequence with zeros such that length is the same as a
# matrix
values = [0] * ncol * nrow
# let us look at peptide coverage
for peptide in peptide_sequences:
i = sequence.index(peptide)
j = len(peptide)
for pos in range(i, i+j):
values[pos] += 1
if vmax == None:
M = max(values)
else:
M = vmax
# now we can build the matrix
m = np.array(values).reshape(nrow, ncol)
m = pylab.flipud(m)
pylab.clf();
pylab.pcolor(m, edgecolors="k", vmin=0, vmax=M, cmap=cmap, alpha=0.8);
pylab.colorbar()
for i,x in enumerate(sequence):
if i+1 in psites_location:
pylab.text(i%N+.2, -0.5+nrow-i/N, x, color="green",
bbox=dict(facecolor='white', alpha=0.5), fontsize=fontsize)
else:
pylab.text(i%N+.2, -0.5+nrow-i/N, x, fontsize=fontsize)
pylab.text(i%N+.6, -.9+nrow-i/N, i+1, fontsize=fontsize/1.5)
pylab.title(protein)
pylab.xticks([],[])
pylab.yticks([],[])
return text
def __eq__(self, this):
metadata = self.metadata.fillna(0).eq(this.metadata.fillna(0)).all().all()
if metadata == False:
return False
measures1 = self.measurements.fillna(0).apply(np.round, args=[10]) # 10 decimals
measures2 = this.measurements.fillna(0).apply(np.round, args=[10])
return measures1.eq(measures2).all().all()
def __getitem__(self, name):
if isinstance(name, str):
name = [name]
if len(name)==1:
name = name[0]
df = self.df[self.df.Protein == name].copy()
if len(df)==0:
# try someting else based on identifier instead of protein name
df = self.df[self.df.Identifier == name].copy()
elif len(name) == 2:
protein = name[0]
psite= name[1]
df = self.df[(self.df.Protein == protein) & (self.df.Psite == psite)]
else:
raise ValueError("Expects only 1 or 2 parameters.")
return df
def __str__(self):
import textwrap
N = len([x for x in self.df.columns if x not in ['Sequence', 'Psite', 'Protein']])
msg = "This dataframe contains {} columns in addition to the ".format(N) + \
"standard columns Protein, Sequence, Psite"
txt = "\n".join(textwrap.wrap(msg, 80))
txt += "\nYour data contains {} unique proteins\n".format(len(set(self.df.Protein)))
txt += "Your data contains {} combination of psites/proteins\n".format(self.df.shape[0])
return txt