Source code for msdas.readers

# -*- python -*-
#
#  This file is part of MS-DAS software
#
#  Copyright (c) 2014 - EBI-EMBL
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>, Marti Bernardo Faura
#  bernardo@ebi.ac.uk
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#
##############################################################################
"""Mass Spectrometry Readers (raw data)"""
import os
import pandas as pd
from easydev import Logging
import numpy as np
from msdas.psites import PSites
from msdas.tools import SequenceTools, Modification

import pylab

__all__ = ["MassSpecReader", "Cleaner"]


[docs]class Cleaner(object):
    """Tools inherited by :class:`msdas.readers.MassSpecReader` to perform some
    tasks on grouping the peptides. Allows also some manipulation on the NAs.

    .. note:: In principle you do not need to use this class, which is
        available via :class:`msdas.readers.MassSpecReader`. However, it can be
        useful as a standalone class.

    Requires the dataframe stored  in :attr:`df` to be populated with a dataframe
    similar to those used in :class:`msdas.readers.MassSpecReader`.


    .. rubric:: handling NA

    Here, we create a simple dataframe with two columns. The last row contains
    1 NA and 1 zero.::

        from msdas import Cleaner
        import pandas as pd

        c = Cleaner()
        c.df = pd.DataFrame({'A':[1,2,3,np.nan], 'B':[4,5,6,0]})

    In Mass Spectrometry, zeros have no meaning. There is always some
    measurements of abundance being made. Zeros are sometimes used to represent a
    NA. The :class:`Cleaner` class has a method :meth:`set_zero_to_na` to replace all
    zeros by NAs::

        >>> c.set_zero_to_na()
        >>> c.df
            A   B
        0   1   4
        1   2   5
        2   3   6
        3 NaN NaN

    Rows that contains all NAs are useless and one may want to remove them::

        c.drop_na_count(2)

    This statement removes all rows that contain 2 NAs. You can get the count
    of all NAs through the entire dataframe with the method :meth:`get_na_count`.

    """
    def __init__(self):
        self.df = None

[docs]    def drop_oxidation(self):
        """Drop rows where the Sequence_Phospho contains the Oxidation string


        That the the :meth:`merge_identical_peptide_sequence` renames
        some sequences and add the "MERGED" prefix. Those rows are not to be removed
        since they contains measurements that correspond to a peptide with an oxidation
        but also possible another peptide without oxidation. There is no way currently
        to know that. So the MERGED peptide are ignored. Ideally, :meth:`drop_oxidation`
        should be called before any merging.

        """
        indices = self.df.Sequence_Phospho.apply(lambda x: "Oxidation" in x and "MERGED" not in x)
        try:
            self.logging.warning("Removing {} rows that contain Oxidation string".format(sum(indices)))
        except:
            print("Removing {} rows that contain Oxidation string".format(sum(indices)))

        indices = self.df.Sequence_Phospho.apply(lambda x: "Oxidation" not in x and "MERGED" not in x)
        self.df = self.df[indices]

[docs]    def set_zero_to_na(self):
        """Replace all zeros with NA

        If you are manipulating raw mass spec data, a zero most probably means
        NA. Indeed, each measurement is the ion current measured, that is greater
        than zero. So, if you want to replace zeros by NA, just call this method.
        """
        self.df = self.df.where(self.df!=0, None)

[docs]    def get_na_count(self):
        """Return vector with number of NA per row (psite/protein)"""
        return len(self.df.columns) - pd.notnull(self.df).sum(axis=1)

[docs]    def drop_na_count(self, N=100000, inplace=True):
        """removes rows with at least N NA values

        A row may have replicates. In the YEAST data case, we have 3 replicates
        and 36 experiments; setting N=54 means removes
        all rows for which half of the measurements are NAs.

        Here below, we use :class:`msdas.replicates.ReplicatesYeast`, which
        inherit from :class:`MassSpecReader` and therefore from :class:`Cleaner`.
        Consequently, we can simplify the data by calling :meth:`drop_oxidation`
        and :meth:`merge_identical_peptides_sequence`. Then, we drop
        rows that have 108 NAs and plot the number of remaining rows. Then, we drop
        rows that have 2 NAs and so on until there is no more data. The plot shows
        the size of the dataframe as a function of the maximal number of NAs that
        is acceptable.

        .. plot::
            :include-source:
            :width: 80%

            >>> import pylab
            >>> from msdas import ReplicatesYeast, get_yeast_raw_data
            >>> r = ReplicatesYeast(get_yeast_raw_data())
            >>> r.drop_oxidation()
            >>> r.merge_identical_peptide_sequence()
            >>> size = []
            >>> for n in range(108,-1,-1):
            ...     r.drop_na_count(n)
            ...     size.append(r.df.shape[0])
            >>> pylab.plot(size, 'o-')
            >>> pylab.grid()
            >>> pylab.xlabel("Max number of NA per row")
            >>> pylab.ylabel("Number of rows")


        """
        data = self.get_na_count()
        try:
            self.logging.info("Removing {} rows where number of NA is larger than {}".format(len(data[data>=N]), N))
        except:
            print("Removing {} rows where number of NA is larger than {}".format(len(data[data>=N]), N))

        if inplace:
            self.df.drop(data[data>=N].index, inplace=inplace)
        else:
            return self.df.drop(data[data>N].index, inplace=False)

    # TODO rename into merge_same_psites
[docs]    def merge_peptides_with_same_psites(self):
        """Merge rows that are different peptides but same psites

        in other words same psites are presumably peptides including in each other.


        Rows may have same protein and psite but different peptides. This
        may happen when an enzyme cuts the peptide at different places. Consider
        for instance this case::

            KVASMNSASLQDEAEPYDS(Phospho)DEAISK,S49,TOP1,
             VASMNSASLQDEAEPYDS(Phospho)DEAISK,S49,TOP1

        In such cases, we add the data measurements and replace
        the 2 rows by a single one. The Peptide stored is the first one (arbitrary choice).


        Actually, you may also have the Oxidation present, which may be 2 different rows but
        there are added as expected::

                DS(Phospho)SLLFSPAAVAM(Oxidation)R       S515
                DS(Phospho)SLLFSPAAVAMR       S515

        In this case, the 2 rows are added.

        Identifier are unchanged

        .. note:: if two peptides are identical, there are also taken into account here and being added.

        .. seealso:: :class:`msdas.psites.PSites`
        """
        try:
            self.logging.info("Merging ambiguous peptides (different peptides but same psites")
        except:
            print("Merging ambiguous peptides (different peptides but same psites")
        tobeadded = []

        g = self.df.groupby(["Protein", "Psite"])
        groups = g.groups
        count = 0
        count_group = 0

        S = g.aggregate(sum)

        try:
            self.logging.info("Merging {} rows into {} groups.".format(len(self.df), len(groups)))
        except:
            pass

        todrop = []
        for k,v in groups.iteritems():
            if len(v)>1:
                # add the measurements in the group and replace the entire
                # group by this new data
                group = self.df.ix[groups[k]]
                # this does not work :  newrow = group.sum() because some
                # columns contains strings. Consequently, the entire row is
                # casted into strings. So, we need to use apply
                newrow = S.ix[k]
                metadata = group.ix[v[0]].copy()
                newrow['Psite'] = metadata['Psite']
                newrow['Sequence'] = metadata['Sequence']
                newrow['Protein'] = metadata['Protein']
                newrow['Sequence_Phospho'] = metadata['Sequence_Phospho']
                newrow['Identifier'] = metadata['Identifier']
                newrow['Entry'] = metadata['Entry']
                newrow['Entry_name'] = metadata['Entry_name']

                tobeadded.append(newrow)
                todrop.extend(v)

                count += len(v)
                count_group += 1

        self.df.drop(todrop, inplace=True)
        if len(tobeadded):
            self.df = self.df.append(tobeadded, ignore_index=True)

        try:
            self.logging.info("Merged {} groups (with more than 1 peptide) into {} new rows.".format(count, len(tobeadded)))
            self.logging.info("New data frame has {} rows".format(len(self.df)))
        except:
            print("Merged {} groups (with more than 1 peptide) into {} new rows.".format(count, len(tobeadded)))
            print("New data frame has {} rows".format(len(self.df)))


        self._rebuild_identifier()

[docs]    def merge_identical_peptide_sequence(self):
        """Merge (by adding) rows that have same protein and peptide sequence

        They may have different psite locations but their number must agree.
        A new name for the psite has to   be created to reflect what addition
        was performed. Instead of concatenating  the psites, we factorise if
        possible. For instance, consider the simple case with no common location
        but a single location::

            DVSQIT(Phospho)SSPK        T44
            DVSQITSS(Phospho)PK        S46

        This is a simple OR case  and the new name is **T44+S46**. Let us now consider a
        case with 2 psites, one being common::

            TRS(Phospho)S(Phospho)GTNNKDVSQITSSPK    S32^S33
            TRSS(Phospho)GT(Phospho)NNKDVSQITSSPK    S33^T35

        S33 is common to two sequence, so the new psite name is ::

            S32+T35^S33

        and so on.

        .. seealso:: :class:`msdas.psites.PSites`
        """
        try:
            self.logging.info("Merging identical peptides (but with psites at different locations")
        except:
            print("Merging identical peptides (but with psites at different locations")
        tobeadded = []
        # parentheses in (Phospho) are required to not clash with prefix added to the
        # sequence_original when there ambiguities
        self.df['count_phospho'] = self.df.Sequence_Phospho.apply(lambda x: x.count("(Phospho)"))

        # ignore sequence without peptide (if already result of a merge)
        df = self.df[self.df['count_phospho']>0]
        g = df.groupby(["Protein", "Sequence", "count_phospho"])
        groups = g.groups

        try:
            self.logging.info("grouping in progress")
        except:
            print("grouping in progress")
        count_group = 0
        count = 0

        pfactory = PSites(verbose=self.level)

        todrop = []

        S = g.aggregate(sum) # compute some before the loop. This is much 2,3 times faster than a apply function
        self.logging.info("Merging {} rows into {} groups.".format(len(self.df), len(g.groups)))

        #print("##############")
        #print(len(groups))
        NN = len(groups)
        count = 0
        for k, indices in groups.iteritems():
            #print(k, count)
            count += 1


            # if there is only one row for a given protein/sequence/number of phosphos,
            # then there is nothing to do otherwise, there are several cases as explained
            # below
            if len(indices)<=1:
                continue

            # S142^S150
            # S142^S151
            #
            # in this case, S142 is not ambiguous and final psite could be named S142^150+S151
            # note that there is no paraenthese. Yet, it means S142^S150 OR S142^S151
            #
            # k[2] conains the counting of phosphos
            #

            # TODO those 3 lines takes 3 seconds ...
            group = self.df.ix[groups[k]]
            Npsite = pfactory.remove_duplicated("^".join(list(self.df.ix[indices].Psite)))
            Npsite = len(Npsite.split("^"))

            if k[2] == 1:
                psite_name = "+".join(self.df.ix[indices].Psite)
                #psite_name = pfactory.sorted(psite_name.split("^"))
                # the apply/sum function is slow...
                newrow = S.ix[k].copy()
                #newrow = group.apply(lambda x: x.sum())
                metadata = group.ix[indices[0]]

                newrow['Psite'] = psite_name
                newrow['Sequence'] = metadata['Sequence']
                newrow['Protein'] = metadata['Protein']

                try:
                    newrow['Identifier'] = metadata['Identifier']
                    newrow['Entry'] = metadata['Entry']
                    newrow['Entry_name'] = metadata['Entry_name']
                except Exception:
                    raise Exception

                newrow['Sequence_Phospho'] = 'MERGED_{}Phosphos_{}locations_'.format(k[2], Npsite) + newrow['Sequence']
                newrow['count_phospho'] = k[2]
                todrop.extend(indices)
                tobeadded.append(newrow)
                count += len(indices)
                count_group += 1
            else:
                psites = list(self.df.ix[indices].Psite)
                common_psites = pfactory.get_common_psites(psites)
                if len(common_psites) == k[2]-1:
                    psite_name = pfactory.get_factorised_psites(psites)
                else:
                    # concatenate with +
                    psite_name  = "+".join(sorted(psites))

                newrow = S.ix[k].copy()
                #newrow = group.apply(lambda x: x.sum())
                newrow['Psite'] = psite_name
                metadata = group.ix[indices[0]]
                newrow['Sequence'] = metadata['Sequence']
                newrow['Protein'] = metadata['Protein']
                try:
                    newrow['Identifier'] = metadata['Identifier']
                    newrow['Entry'] = metadata['Entry']
                    newrow['Entry_name'] = metadata['Entry_name']
                except Exception:
                    raise Exception

                newrow['Sequence_Phospho'] = 'MERGED_{}Phosphos_{}locations_'.format(k[2], Npsite) + newrow['Sequence']
                newrow['count_phospho'] = k[2]
                todrop.extend(indices)
                #self.df.drop(indices, inplace=True)
                tobeadded.append(newrow)
                count += len(indices)
                count_group += 1

        self.df.drop(todrop, inplace=True)
        if len(tobeadded):
            self.df = self.df.append(tobeadded, ignore_index=True)
        try:
            self.logging.info("Merged {} groups (with more than 1 peptide) into {} new rows.".format(count, len(tobeadded)))
        except:
            print("Merged {} groups (with more than 1 peptide) into {} new rows.".format(count, len(tobeadded)))
        del self.df["count_phospho"]
        self._rebuild_identifier()

[docs]    def fix_duplicated_identifier(self):
        """Make sure identifierse are unique

        Some identifiers may be duplicates (same protein, same psites
        but different peptides) e.g., ::

             Protein                Sequence    Psite     Identifier
                DIG2  LSQKEEDHSGKPPTITTSPAEK  T83+S84   DIG2_T83+S84
                DIG2      EEDHSGKPPTITTSPAEK  T83+S84   DIG2_T83+S84

        In this case, we append an additional id, which is an integer
        starting from 1. First row found has the id 1.::

             Protein                Sequence    Psite     Identifier
                DIG2  LSQKEEDHSGKPPTITTSPAEK  T83+S84   DIG2_T83+S84_1
                DIG2      EEDHSGKPPTITTSPAEK  T83+S84   DIG2_T83+S84_2

        """
        groups = self.df.groupby("Identifier").groups
        for k, indices in groups.iteritems():
            if len(indices) > 1:
                identifiers = list(self.df.ix[indices,'Identifier'])
                identifiers = [identifier + "_" + str(i+1) for i,identifier in enumerate(identifiers)]
                self.df.ix[indices,'Identifier'] = identifiers





[docs]class MassSpecReader(Logging, SequenceTools, Cleaner):
    """Read a simple Mass Spec data CSV file

    First, the MassSpecReader reads a CSV file and stores it in the
    :attr:`df` attribute as a dataframe. At this stage, there is no
    manipulation except from the fact that columns found in the header are
    cleanup (e.g., spaces are removed).

    Second, the header is interpreted. The User Guide provides a draft
    convention of how the header should look like. It should contains the
    following columns:

    #. **Protein**: a valid uniprot protein name (e.g., DIG1)
    #. **Sequence**: a peptide sequence without annotations. If not provided,
       Sequence_Phospho must be provided
    #. **Sequence_Phospho**: a sequence with the phsphorylation position tagged
       as "(Phospho)" e.g., AVS(Phospho)KK means there was a Phophorylation
       on S at position 3 and possibly oxidation as (Oxidation)
    #. **Psite**: The absolute position of the phosphorylations. For instance S132^S140

    and possibly:

    #. **Entry_name**: a valid uniprot entry name (e.g., DIG1_YEAST)
    #. **Entry**: a valid uniprot entry (e.g., P23321)
    #. **Identifier**: concatenation of Protein and Psite. Maybe overwritten if
       :meth:`_rebuild_identifier` is called

    If you have another name (e.g., PeptideSequence instead of Sequence_Phospho),
    you can populate the :attr:`header_mapping` attribute.


    If "Accession" and "Modifications" are found in the header, the accession
    column will be expected to contain these kind of strings (FASTA header)::

        sp|Q8IYB3-2|SRRM1_HUMAN

    from which the Protein column can be extract. Besides, the Modifications
    will contain information that could be used to
    extract the Psite positions. Modifications entries look like::

        [5] S+7777| [1] T+5555

    Once the data is intrepreted. Protein, Sequence and Psite should be available.

    Finally, a third step performed by this class is to performs some cleanup.
    See :meth:`cleanup` for details.

    .. warning:: Psites are separated by the ^ character (AND). If there is
        an ambiguity, the + sign can be used as a logical OR.


    The creation of a MassSpecReader is quite versatile. You can read a CSV
    file as described above, or provide an instance of MassSpecReader, or an object
    that contains an attribute :attr:`df`, which is compatible.


    ::

        # Let us create a dummy dataframe
        import pandas as pd
        df = pd.DataFrame({
            'Psite': ['S1+S2', 'T3'],
            'Sequence_Phospho': ["S(Phospho)S(Phospho)TTT", "SST(Phospho)TT"],
            'Protein': ['DIG1', 'DIG2'],
            'X1':[1,2],
            'X2':[5,2],
            'X3':[4,5],
            'dummy'[1,2]
        }

        from msdas import *
        r = readers.MassSpecReader(df)

    There are some sanity checks performed. In the example above, the case Psite, S1+S2
    is acutally ambiguous: it means that there a psite on S1 OR S2 (i.e., there is one psite).
    Yet the sequence contains 2 phophoosites. So, there is a typo. If you check the content of
    **r.df** you would see only 1 row. Let us fix it

    ::

        >>> df['Psite'] = ['S1^S2', 'T3']
        >>> r = readers.MassSpecReader(df)
        >>> len(r.df)
        2

    .. seealso:: the User Guide and notebooks provided in the MSDAS documentation


    """
    #: symbol used to encode psites (AND case)
    AND = "^"
    #: symbol used to encode psites (OR case)
    OR = "+"

    header_mapping = {
        'Protein': ['Standard name', 'ProteinName'],
        'Psite': ['Phosphosites'],
        'Sequence_Phospho': ['Peptide sequence', 'PeptideSequence']
    }

    valid_header = ["Identifier", "Protein", "Sequence", "Sequence_Phospho",
                    "Psite", "Entry", "Entry_name"]
    def __init__(self, data=None, mode="default", verbose=True, quotechar="'", merge_peptides=False,
                 index_col=None, sep=",",cleanup=True):
        """.. rubric:: constructor

        :param str data: Can be (i) None, (ii) a valid filename with the appropiate format (CSV), (iii)
            an instance of MassSpecReader itself, (iv) a dataframe with the correct column names (v)
            any object that contains an attribute **df** that is a dataframe with the correct column names.
        :param bool verbose: Set to True/False or valid string (DEBUG,INFO,WARNING,ERROR)
        :param quotechar: Ignore the quotechar argument if found in the CSV file.
            Defaults to ' character. For Tcell data, set to " since
            the CSV file contains " characters.
        :param bool merge_peptides: calls :meth:`merge_peptides` is True (default to False)
        :param int index_col: may be required to read some CSV files
        :param str sep: the seprator to be used when reading the CSV file
        :param cleanup: calls :meth:`cleanup` method (default to True)

        """
        super(MassSpecReader, self).__init__(level=verbose)
        self._quotechar = quotechar
        self._index_col = index_col
        self._sep = sep
        self._info = {}

        self._merge_peptides = merge_peptides

        self.removed = {}
        # must be set to zero now
        self._mode = None
        self._df = None

        if isinstance(data, MassSpecReader):
            self.df = data.df.copy()
            self.mode = data.mode

        elif isinstance(data, pd.DataFrame):
            self.df = data.copy()
            data = True # to force the cleaup and avoid co,paring dataframe with None
            self.mode = mode
        elif hasattr(data,"df"):
            self.df = data.df.copy()
            self.mode = mode
        # comparison with None should be after dataframe case
        elif data == None:
            self.mode = mode
        else:
            # mode must be set now, before calling read_csv
            self.mode = mode
            if data != None and os.path.exists(data) == False:
                raise IOError("The file {} does not exist".format(data))
            if data!= None:
                self.read_csv(data, quotechar=self._quotechar,
                              index_col=self._index_col, sep=self._sep)

        # if this is failing, we want to raise an error. no try/except
        if cleanup == True and data!=None:
            self.cleanup()

        self._metadata_names = self.valid_header[:]

    def _set_df(self, df):
        self._df = df
        self._interpret_header()
    def _get_df(self):
        return self._df
    df = property(_get_df, _set_df, doc="getter/setter for the main dataframe")

    def _get_N(self):
        return len(self.measurements.columns)
    N = property(_get_N, doc="Return number of measurements.")

    def _get_metadata(self):
        return self.df[[x for x in self._metadata_names if x in self.df.columns]]
    metadata = property(_get_metadata,
                        doc="""read-only attribute. Returns subset of the :attr:`df` attribute, which contains
                        metadata (not measurements). The subset is selected based on the :attr:`_metadata_names`
                        attribute, which by default contains Sequence, Protein, Identifier, Entry, Entry_name,...
                        look at  `_metadata_names` for up-to-date list of column names""")

    def _get_measurements(self):
        col = [x for x in self.df.columns if x not in self._metadata_names]
        return self.df[col]
    measurements = property(_get_measurements,
        doc="getter to subset of the dataframe :attr:`df` that is not in :attr:`metadata`")

    def _get_mode(self):
        if self._mode == None:
            # try to figure out
            self.logging.warning("mode not provided. Trying to figure out")
            if "Entry_name" in self.df.columns:
                mode = self.df.Entry_name.ix[0].split("_")[1]
                self.mode = mode
            else:
                self.logging.warning("Could not figure out. please provide YEAST")
        return self._mode
    def _set_mode(self, mode):
        if mode == None:
            raise ValueError("mode cannot be None")
        assert mode.upper() in ["DEFAULT", "YEAST"], "mode can be yeast "
        if mode.upper() == "DEFAULT":
            mode = "YEAST"
        self._mode = mode.upper()
        if self._mode == "YEAST":
            self._quotechar = "\'"
    mode = property(_get_mode, _set_mode,
                    doc="mode of the Merger (e.g., yeast)")

    def _set_sequence(self):
        self.debug("set_sequence starting")
        if "Sequence_Phospho" not in self.df.columns:
            return

        tags = ["(Phospho)", "(Oxidation)"]

        if "Sequence" in self.df.columns:
            self.df.loc[:,'Sequence'] = list(self.df['Sequence_Phospho'].values)
        else:
            self.df['Sequence'] = list(self.df['Sequence_Phospho'].values)

        for index in self.df.index:
            seq = self.df.ix[index, 'Sequence']
            for tag in tags:
                seq = seq.replace(tag, "")
            if "(" in seq:
                raise ValueError("Found invalid pattern in the sequence: %s" % seq)
            self.df.ix[index, 'Sequence'] = seq
        self.debug("set_sequence done")

    def _get_sequences(self):
        return self.df.Sequence
    sequences = property(_get_sequences, doc="returns list of Sequences found in the DataFrame")

    def _get_psites(self):
        return self.df.Psite
    psites = property(_get_psites, doc="returns list of Psites found in the DataFrame")

    def _rebuild_identifier(self):
        self.logging.warning("Rebuilding identifier in the dataframe. MERGED prefixes will be lost")

        self.merged = self.df.Sequence_Phospho.apply(lambda x: "MERGED" in x)

        if "Identifier" in self.df.columns:
            del self.df['Identifier']

        if "Identifier" not in self.df.columns:
            self.df['Identifier'] = self.df.Protein + "_" + self.df.Psite

        #for k,v in self.merged.iterkv():
        #    if v == True:
        #        self.df.loc[k,'Identifier'] = "MERGED_" + self.df.loc[k,'Identifier']

        if len(set(self.df.Identifier)) != len(self.df.Identifier):
            self.logging.warning("Identifiers are not unique. Have you called merge_peptides() ?")

    def _check_sequence_name_header(self):

        if "Sequence_Phospho" not in self.df.columns and "Sequence" not in self.df.columns:
            raise ValueError("Expecting either Sequence of Sequence_Phospho in the header")

        # maybe sequence columns is not named properly:
        if "Sequence_Phospho" not in self.df.columns:
            if sum(self.df.Sequence.apply(lambda x:"Phospho" in x)) == 0:
                self._build_sequence_phospho()
            else:
                self.logging.warning("Some Phospho strings found in Sequence column. No Sequence_Phospho column found.Renaming Sequence into Sequence_Phospho")
                self.df.columns = [x if x!="Sequence" else "Sequence_Phospho" for x in self.df.columns]

        # if sequence_phospho privded, all sequence must have at least one phospho
        if "Sequence_Phospho" in self.df.columns:
            if sum(self.df.Sequence_Phospho.apply(lambda x:"Phospho" in x)) != len(self.df):
                self.logging.warning("Some Sequence in Sequence_Phospho column do not have nay phosphos.")

    def _build_sequence_phospho(self):
        """

        .. warning:: not implemented, for now just a copy of Sequence into Sequence_Phospho,
            but should be rebuilt from the protein Sequence and psites location.
        """
        self.df['Sequence_Phospho'] = self.df.Sequence

    def _interpret_header(self):
        """Interpret the header of the CSV file.

        #. Check that the Sequence_Phospho column and/or Sequence column are available
        #. Check that the expected column from :attr:`header_mapping` are present
        #. Set Sequence_Phospho and Sequence properly if this is not the case (see :meth:`_set_sequence`)
        #. Finally check that the header is valid

        """
        self._check_sequence_name_header()


        columns = self.df.columns
        # rename columns if needed
        for valid, invalids in self.header_mapping.iteritems():
            for invalid in invalids:
                if valid not in self.df.columns and invalid in columns:
                    txt = "Column *{}* not found. Please use the correct header. Trying to replace possible match called **{}**"
                    self.logging.warning(txt.format(valid, invalid))
                    columns = [valid if c==invalid else c for c in columns]
        self.df.columns = columns


        self._set_sequence()


        self._check_header()


    def _check_header(self):
        assert "Protein" in self.df.columns, "Protein must be found in header"
        assert "Psite" in self.df.columns, "Psite must be found in header"
        assert "Sequence_Phospho" in self.df.columns or "Sequence" in self.df.columns, \
            "either Sequence or Sequence_Phospho must be provided"

[docs]    def boxplot(self, index=None, logy=True):
        """A simple boxplot of the data using logy
     
        :param index: you can rearrange the ordering of the axis column or select a subset
            by providing a list of indices.
        :param logy: set y-axis with log scale
     
        .. plot::
    
            from msdas import *
            m = MassSpecReader(get_yeast_small_data())
            m.boxplot()
   
  
        """
        if index == None:
            self.df.boxplot(rot=90, return_type='axes')
        else:
            self.df[index].boxplot(rot=90, return_type='axes')

        if logy:
            pylab.semilogy()
 


[docs]    def cleanup(self):
        """Call functions that clean up the dataframe

        Here are the steps followed:

        #. set zeros to NA :meth:`msdas.readers.Cleaner.set_zero_to_na`
        #. Rename psites using MSDAS convention (see :meth:`rename_psites`).
           Here just get rid of spaces
        #. calls :meth:`remove_ambiguities`
        #. calls :meth:`merge_peptides` if :attr:`_merge_peptides` is True
        #. rebuild the identifier based on Protein and Psite columns.

        """
        #self.debugLevel = "ERROR"
        if self.df is None:
            self.logging.warning("No data loaded yet. Please populate the attribute *df*, or call read_csv")

        # psites seprated by spaces are renamed so that there are now seprated
        # by the ^ (and ) character
        # oxidation psite are also interpreted.
        self.logging.info("Renaming psites with ^ character")
        self.rename_psites()

        # get rid of zeros in the data
        # should be used for raw data, not normalised so should be called by user
        self.logging.info("Replacing zeros with NAs")
        self.set_zero_to_na()

        # see documentatoion of the method itself
        self.remove_ambiguities()

        if self._merge_peptides:
            self.merge_peptides()

        self._rebuild_identifier()

[docs]    def merge_peptides(self):
        """Merge data using :class:`Cleaner` class

        #. Merge peptide with same psites. See :meth:`~msdas.readers.Cleaner.merge_peptide_with_same_psites`
        #. Merge identical peptide sequences. See :meth:`~msdas.readers.Cleaner.merge_identical_peptide_sequence`

        Note that in the yeast case, we use only the merge_identical_peptide_sequence

        """
        self.merge_peptides_with_same_psites()
        self.merge_identical_peptide_sequence()

[docs]    def read_csv(self, filename=None, quotechar=None, index_col=None, sep=None):
        """Read a CSV file file Mass Spec data

        :param str filename: if not provided, uses the class attribute
            :attr:`filename`


        This function is used by the constructor of MassSpecReader if a filename
        is provided. Yet, you can either create an empty instance and read the file
        later on. If you read the file later, the difference with the constructor is
        that by default the :meth:`cleanup` function is called. So these two codes
        are equivalent only if you call :meth:`cleanup`::

            from msdas import *
            m1 = MassSpecReader(get_yeast_small_data())

        ::

            from msdas import *
            m2 = MassSpecReader()
            m2.read_csv(get_yeast_small_data())
            m2.cleanup()

        You can then check that the 2 files are equivalent::

            >>> m1 == m2
            True


        """
        self.level = self.level
        if self.mode is None:
            raise ValueError("mode is not set. mode must be provided as YEAST")
        self.logging.info("Reading %s" % filename)

        # Read the data. Must work
        if quotechar == None:
            quotechar = self._quotechar

        #print index_col, sep, quotechar, self.mode
        #sep=None
        df = pd.read_csv(filename, quotechar=quotechar, index_col=index_col, sep=sep, engine='python')
        df.dropna(how="all", inplace=True) # ignore empty lines
        
        # replaces None by NA
        import numpy as np
        df.fillna(np.NaN, inplace=True)

        # spaces need to be removed in the header
        df.columns = [col.strip() for col in df.columns]

        self.df = df
        self.rawdf = self.df.copy() # for bookkeeping

        # and columns that are made of strings should also be stripped
        def clean_func(x):
            try:
                return x.strip()
            except:
                return x
        for col in df.columns:
            self.df[col] = self.df[col].map(clean_func)

        self._interpret_header()

[docs]    def remove_ambiguities(self):
        """

        #. Remove rows where psites are ambiguous or could not be interpreted.
           See :meth:`remove_ambiguous_psites`
        #. Remove rows where number of proteins is > 2 :meth:`remove_ambiguous_proteins`

        """
        # remove psites where number of psites does not match phospho+oxidation
        # rename_psites (append_oxidation) must be called before otherwise all
        # sequences with oxidation are removed. converns 203 rows in yeast
        self.remove_ambiguous_psites()

        # not redundant with drop above. remove rows where protein names are > 1
        self.remove_ambiguous_proteins()


[docs]    def rename_psites(self):
        """Rename Psites using AND/OR convention.


        To store the Psite column in the header, we use the following conventions:

        * individual psite are separated by AND using the **^** character
        * ambiguous psite when position is not certain are separated by OR that is **+** character

        See :class:`msdas.psites.PSites` class documentation for details.

        Here, the CSV file may contain Psites stored with spaces, which are considered
        as ANDs. So, we first remove spaces, replace them by ANDs. There are also
        extra ; characters that are ignored. If there are duplicated psites, which occurs
        in the raw data set (yeast case), we remove them. Finally, in the yeast case,
        some sequence contain the (Oxidation) string but the corresponding Psite is not
        provided. This function extract this information and stores it in the Psite string.

        Psites are also sorted by position e.g., S5^S3 is renamed as S3^S5.
        """
        ptools = PSites(verbose=self.level) # important to provide level

        # removes spaces
        psites = [ptools.remove_spaces(p) for p in self.df.Psite]

        # add AND character
        psites = [self.AND.join(psite.split()) for psite in psites]

        # what to do with ; character? assume an AND for now
        psites = [x.replace(";", self.AND) for x in psites]

        psites = [ptools.remove_duplicated(x) for x in psites]
        self.df['Psite'] = psites

[docs]    def remove_ambiguous_proteins(self):
        """Remove rows where proteins are not uniquely defined

        Some rows may have the protein field incorrectly defined with several
        protein names, in which case the row is discarded by this function


        """
        # Removes all rows where we cannot understand what is going on.
        # e.g. protein or sites names are not unique
        toremove = self.df.index[self.df.Protein.apply(lambda x: " " in x)]
        self.logging.info("-- Removing {} rows with ambigous protein names:".format(len(toremove)))
        self._removed_ambiguous_proteins = toremove[:]
        #FIXME: here we use self.df.ix whereas in remove_ambigous_psites, we use self.df directly why?
        self.removed['ambiguous_proteins'] = self.df.ix[self._removed_ambiguous_proteins].copy()

        self._ambiguous_proteins_df = self.df.ix[toremove==True][['Protein', 'Sequence', 'Psite', 'Sequence_Phospho']].copy()

        for index in toremove:
            self.logging.debug(" {}:{}".format(index+1,self.df.Protein[index]))
        self.logging.info("--------------------------------------------------")
        self._info["N_unknown"] = len(toremove)
        self.df.drop(toremove, inplace=True)

[docs]    def remove_ambiguous_psites(self):
        """Removes Psites where number of Psite does not match content of Sequence

        Some sequences are different number of phospho as compared to the Psites
        columns. In such case, we remove the rows. The field used for the sequence
        is the Sequence_Phospho where psites are encoded with the (Phsospho) string

        ::

          ALY1,    S118^S216,   TPLPSSS(Phospho)R

        .. note:: Oxidation are ignored

        """
        if "Sequence_Phospho" not in self.df.columns:
            return
        # count number of phospho
        # IF MERGED, should be skipped
        l3 = self.df.Sequence_Phospho.apply(lambda x: "MERGED" not in x)

        count_phosphos = self.df.Sequence_Phospho.apply(lambda x: x.count("Phospho"))
        #count_oxis = self.df.Sequence_original.apply(lambda x: x.count("Oxidation"))
        count_psites = self.df.Psite.apply(lambda x: len(x.split(self.AND)))

        #toremove = count_phosphos+count_oxis != count_psites
        toremove = (count_phosphos != count_psites) & l3

        self._removed_ambiguous_psites = toremove[:]

        self.removed['ambiguous_psites'] = self.df.ix[self._removed_ambiguous_psites].copy()

        if sum(toremove) > 0:
            self._ambiguous_psites_df = self.df.ix[toremove==True][['Protein', 'Sequence', 'Psite', 'Sequence_Phospho']].copy()
            self.logging.info("-- %s rows have ambiguous psites and are removed" % sum(toremove))
            self.logging.info("save data in attribute _ambiguous_psites_df")
            self.df.drop(toremove.index[toremove==True], inplace=True)
            self.logging.info("--------------------------------------------------")

        #self._info["N_ambiguous_psites"] = sum(toremove==True)

    def _rename_measurements(self, tag):
        """Append prefix to all measurement columns


        Measurements columns are those that starts with the letter "t" for now.

        """
        oldcols = self.df.columns
        columns = [tag+"_" + c if c.startswith("t") else c for c in oldcols]
        self.df.columns = columns

[docs]    def to_csv(self, filename):
        """Save dataframe into a CSV file

        """
        #FIXME: why a copy ??
        df = self.df.copy()

        #df['Sequence'] = df['Sequence_Phospho']
        #del df['Sequence_original']
        df.to_csv(filename, index=False, sep=",")

[docs]    def sort_psites_ors_only(self):
        """Sort the psites location in psites string that contains OR, only

        e.g., S3+S1 is renamed as S1+S3 but S3+S1^S7 is unchanged.

        """
        func = PSites(verbose=False).sort_psites_ors_only
        self.df['Psite'] = self.df.Psite.apply(lambda x: func(x))
        self._rebuild_identifier()

[docs]    def plot_phospho_stats(self, **kargs):
        """Plots descriptive statistics about the Psites

        The data used comes from the DataFrame.

        There are 3 plots created:

            #. pie charts of number of sites of each category (e.g., S, T, ..)
            #. Histogram of number of phospho sites per protein
            #. Histogram of number of phospho sites per peptides

        .. plot::
            :include-source:
            :width: 45%

            from msdas import *
            filenames = get_yeast_filenames()
            p1 = readers.MassSpecReader(filenames[0], mode="YEAST")
            p1.plot_phospho_stats()

        """
        self._global_count = {}
        self._number_sites_per_peptide = []
        self._number_sites_per_protein = {}
        for sites in self.df.Psite:

            # then split on "-" in case there are ambiguities and select the
            # first letter in any case.
            sites = [this[0] for this in sites.split(self.AND)]
            for s in sites:
                if s not in self._global_count.keys():
                    self._global_count[s] = 0
                else:
                    self._global_count[s] += 1
            # how many phospho sites per peptide ?
            self._number_sites_per_peptide.append(len(sites))

        pylab.figure(1)
        pylab.clf()
        pylab.pie(self._global_count.values(), labels=[k+" ("+str(v)+")" for k,v in
            self._global_count.iteritems()])
        pylab.title("Type of of phospho sites (over all peptides)")

        pylab.figure(2)
        pylab.clf()
        pylab.hist(self._number_sites_per_peptide, [x+0.5 for x in
            range(0,1+max(self._number_sites_per_peptide))])
        pylab.title("Number of phospho sites per peptide")
        pylab.grid()

        g = self.df[['Protein', 'Psite']].groupby("Protein")
        groups = g.Psite.groups

        pfact = PSites()

        for group in groups:
            sites = [x for x in self.df.Psite[g.Psite.groups[group]]]
            #sites = list(pylab.flatten([[y[0] for y in x.split("_")[1:]] for x in sites]))
            # FIXME: if merge or not, the results will be different !
            sites = pfact.get_unique_psites(sites)
            self._number_sites_per_protein[group] = len(sites)

        pylab.figure(3)
        pylab.clf()
        bins = [x+0.5 for x in
            range(0,1+max(self._number_sites_per_protein.values()))]
        pylab.hist(self._number_sites_per_protein.values(), bins)
        #pylab.xticks(bins, )
        pylab.title("Number of phospho sites per protein")
        pylab.grid()


[docs]    def hist_number_peptides(self):
        """Create a bar plot with number of peptides per protein


        """
        groups = [(len(v),k) for k,v in self.df.groupby("Protein").groups.iteritems()]
        l = sorted(groups)
        l.reverse()
        y = [this[0] for this in l]
        xlabel = [this[1] for this in l]
        pylab.clf()
        pylab.bar(range(0,len(y)), y)
        pylab.xticks([this+.5 for this in range(0,len(y))], xlabel, rotation=90)
        for i in range(0,len(y)):
            pylab.text(i+0.5, y[i]+.5, y[i])
        pylab.ylabel("Number of peptide per protein")
        pylab.ylim([0,max(y)+1])


[docs]    def hist_peptide_sequence_length(self, drop_duplicates=True, **kargs):
        """Plot histogram of the peptide sequences length.

        :param bool drop_duplicates: duplicated peptide sequence are dropped by default
            Indeed, you may have two identical peptides with different psites. Since we are
            interested in the peptide sequence, we drop duplicated ones.
        :param kargs: optional keywords are passed to the dataframe.hist method.

        .. plot::

            >>> from msdas import *
            >>> m = MassSpecReader(get_yeast_filenames()[0], mode="YEAST")
            >>> m.hist_peptide_sequence_length()

        """
        if drop_duplicates:
            data = self.df.Sequence.drop_duplicates().apply(lambda x: len(x))
        else:
            data = self.df.Sequence.apply(lambda x: len(x))
        data.hist(**kargs)
        pylab.title("")
        pylab.xlabel("Sequence length")
        pylab.ylabel("#")

[docs]    def pcolor(self, protein=None, tag="t", fillna=None, vmax=None, vmin=None,
               cmap="hot_r", fontsize_yticks=8):
        """

        :param str protein: a protein name. If not provided, plot all the data.
        :param str tag: only column that starts with this tag are used
        :param fillna: if na are found, there are plot as grey pixels. You can set
            all NAs to a scalar if required.
        :param float vmin: vmin parameters used by the matplotlib.pcolor function
        :param float vmax: vmax parameters used by the matplotlib.pcolor function
        :param cmap: a valid color map from matplotlib

        .. plot::

            from msdas import *
            m = MassSpecReader(get_yeast_small_data(), verbose=False)
            m.pcolor("DIG1", tag="a") # pick up measurements with name starting with "a" letter


        """
        measures = [c for c in self.df.columns if c.startswith(tag)]
        if len(measures)==0:
            print("no column matches the provided tag (%s)"%tag)
            return
        pylab.clf()

        if protein:
            data = self[protein][measures]
            psites = list(self.df.Psite[self[protein].index])
        else:
            data = self.df[measures]
            psites = list(self.df.Psite)
            proteins = list(self.df.Protein)
            psites = [x+"_"+y for x,y in zip(proteins, psites)]

        if vmax == None:
            vmax = data.max().max()
        if vmin == None:
            vmin = data.min().min()
        mask = np.ma.array(data.as_matrix(), mask=data.isnull())

        cmap = pylab.get_cmap(cmap)
        cmap.set_bad("grey", 1)
        pylab.pcolormesh(mask, vmin=vmin, vmax=vmax, cmap=cmap)
        pylab.colorbar()
        pylab.xlabel("measurements")
        pylab.ylabel("peptides")
        pylab.xlim([0, len(measures)])
        pylab.ylim([0, data.shape[0]])
        pylab.yticks([x+0.5 for x in range(0,data.shape[0])], psites,
                      fontsize=fontsize_yticks)
        if protein:
            pylab.title(protein)
        pylab.tight_layout()
        return data

[docs]    def read_annotations(self, filename):
        """Read annotations saved in a dataframe.

        :param str filename: filename where annotations are saved (pickle file)

        See :class:`msdas.annotations.Annotations`

        This function is required to populate the annotatoins attribute used for example
        by :meth:`show_sequence`

        """
        self.annotations = pd.read_pickle(filename)

[docs]    def status(self):
        """Some basic sanity checks on the dataframe that contains the data"""
        # check header
        for this in self.valid_header:
            if this not in self.df.columns:
                print("Could not find %s in the dataframe columns" % this)

        # check Psites
        import psites
        pfactory = psites.PSites()
        if "Psite" in self.df.columns:
            for k,v in self.df.Psite.iteritems():
                if pfactory.isvalid(v) == False:
                    print("Invalid psite (%s) at index %s" % (k,v))
                    print("There are maybe others. break for now the psite checking")
                    break

        return True

[docs]    def plot_timeseries(self, identifier, replicate=None, **kargs):
        """This is for the yeast case with replicates

        :param str identifier:
        :param replicates: None means use the first replicate (the one without prefix).
            otherwise, provide the suffix corresponding to the replicate (see example below)
        :param kargs: any paraemter accepted by the matplotlib.plot function

        .. plot::
            :include-source:
            :width: 80%


            from msdas import *
            m = readers.MassSpecReader(get_yeast_small_data(), verbose=False)
            m.plot_timeseries("DIG1_S142")

        If you have replicates, only the first replicate is taken into account and
        you therefore need to call plot_timeseries several time providing the
        tag of the replicate. The first replicate has no tag. The second replicate
        has a suffix (**.1**), the third replicates has the tag **.2** and so on

        .. plot::
            :include-source:
            :width: 80%

            from msdas import *
            M = readers.MassSpecReader(get_yeast_raw_data(), verbose=False)
            M.plot_timeseries("DIG1_S142", replicate=None, color="r")
            M.plot_timeseries("DIG1_S142", replicate=1, color="g")
            M.plot_timeseries("DIG1_S142", replicate=2, color="b")

        """
        if replicate == None:
            measures = [x for x in self.measurements if "." not in x]
        else:
            measures = [x for x in self.measurements if x.endswith("."+str(replicate))]
        if len(measures)==0:
            raise ValueError("No measurement data found that correspond to the replicate provided" )

        try:
            # for the replicates; get_mu_df is required
            data = self.get_mu_df()[measures]
        except:
            data= self.df[measures]

        indices = self.df[self.df.Identifier == identifier].index
        data = data.ix[indices]
        kargs['color'] = kargs.get("color", "r")
        kargs['marker'] = kargs.get("marker", "o")
        kargs['markersize'] = kargs.get("markersize", 10)
        kargs['lw'] = kargs.get("lw", "2")
        if len(indices)>1:
            self.logging.warning("More than 1 row found. Consider calling merging_ambiguous_peptides method")
        for index in indices:
            pylab.plot(data.ix[index], '--', **kargs)

        N = len(measures)
        pylab.title(identifier)
        pylab.xticks(range(0,N), measures, rotation=90)
        pylab.grid()
        return data

[docs]    def get_data_matrix(self, identifier):
        """YEAST case with replicates

        .. todo:: to be moved to yeast module
        """
        alpha = ['a0', 'a1', 'a5', 'a10', 'a20', 'a45']
        salt = ['t0', 't1', 't5', 't10', 't20', 't45']

        try:
            data = self.get_mu_df()
            data = data[self.df.Identifier == identifier]
            if len(data)>1:
                self.logging.error("Found more than 1 row matching the psite")
            print('a')
        except:
            data = self.df[self.df.Identifier == identifier]
        exps = np.zeros((6,6))

        for i,a in enumerate(alpha):
            for j,s in enumerate(salt):
                exps[i,j] = data[a+"_"+s]

        exps = pd.DataFrame(exps)
        #TODO : is this the correct order
        exps.index = alpha
        exps.column = salt
        return exps

[docs]    def plot_experiments(self, identifier, cmap="hot_r", vmin=None,vmax=None):
        """

        should be for yeast only. Put inside a YEAST class ?

        if not merged, you could end up with several rows.
        may fail in such case...


        """
        #        fig = plt.figure()
        #ax = fig.gca(projection='3d')
        #surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm,linewidth=0, antialiased=False)
        #ax.zaxis.set_major_locator(LinearLocator(10))
        #ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
        #fig.colorbar(surf, shrink=0.5, aspect=5)

        self.logging.warning("Works with yeast data set only")

        exps = self.get_data_matrix(identifier)
        mask = np.ma.array(exps, mask=np.isnan(exps))
        pylab.clf()
        if vmax == None:
            vmax = np.nanmax(exps)
        if vmin == None:
            vmin = np.nanmin(exps)
        cmap = pylab.get_cmap(cmap)
        cmap.set_bad("grey", 1)

        pylab.pcolormesh(pylab.flipud(mask), vmin=vmin, vmax=vmax, cmap=cmap)
        alpha = ['a0', 'a1', 'a5', 'a10', 'a20', 'a45']
        salt = ['t0', 't1', 't5', 't10', 't20', 't45']
        pylab.yticks([x+0.5 for x in range(0, 6)], alpha[::-1])
        pylab.xticks([x+0.5 for x in range(0, 6)], salt)
        pylab.colorbar()
        pylab.title(identifier)
        pylab.ylabel("Time after alpha-factor stimulation")
        pylab.xlabel("Time after NaCl stimulation")

        return exps

[docs]    def show_sequence(self, protein, N=40, vmax=None,cmap="hot_r", fontsize=10):
        """Show peptide and phosphorylation distribution on a sequence

        .. plot::
            :include-source:

            import os
            from msdas import *
            r = readers.MassSpecReader(get_yeast_raw_data())
            if os.path.exists("YEAST_annotations.pkl")==False:
                a = annotations.Annotations(r, "YEAST")
                a.get_uniprot_entries()
                a.set_annotations()
                a.to_pickle()
            r.read_annotations("YEAST_annotations.pkl")
            r.show_sequence("DIG1",fontsize=8)

        .. warning:: once merged, some phospho are tagged ith merged name so we use the colum Sequence (not Sequence_Phospho)
        """
        import textwrap
        self.warning("Requires annotations")
        indices = self.metadata.groupby("Protein").groups[protein]
        psites = self.metadata.Psite.ix[indices]
        peptide_sequences = self.metadata.Sequence.ix[indices]
        entry = self.metadata.Entry.ix[indices[0]]
        sequence = self.annotations.ix[entry].Sequence

        psites_location = set([w[1:] for x in psites for z in x.split("^") for w in z.split("+")])
        psites_location = [int(x) for x in psites_location]
        # build the matrix to plot. The matrix will have the following length and text
        text = textwrap.wrap(sequence, N)
        ncol = N
        nrow = len(text)
        # but for now, let us work with the data as an array. We will want a matrix,
        # so let us complete the sequence with zeros such that length is the same as a
        # matrix
        values = [0] * ncol * nrow

        # let us look at peptide coverage
        for peptide in peptide_sequences:
            i = sequence.index(peptide)
            j = len(peptide)
            for pos in range(i, i+j):
                values[pos] += 1
        if vmax == None:
            M = max(values)
        else:
            M = vmax
        # now we can build the matrix
        m = np.array(values).reshape(nrow, ncol)
        m = pylab.flipud(m)

        pylab.clf();
        pylab.pcolor(m, edgecolors="k", vmin=0, vmax=M, cmap=cmap, alpha=0.8);
        pylab.colorbar()
        for i,x in enumerate(sequence):
            if i+1 in psites_location:
                pylab.text(i%N+.2, -0.5+nrow-i/N, x, color="green",
                           bbox=dict(facecolor='white', alpha=0.5), fontsize=fontsize)
            else:
                pylab.text(i%N+.2, -0.5+nrow-i/N, x, fontsize=fontsize)

            pylab.text(i%N+.6, -.9+nrow-i/N, i+1, fontsize=fontsize/1.5)
        pylab.title(protein)
        pylab.xticks([],[])
        pylab.yticks([],[])
        return text

    def __eq__(self, this):
        metadata = self.metadata.fillna(0).eq(this.metadata.fillna(0)).all().all()
        if metadata == False:
            return False
        measures1 = self.measurements.fillna(0).apply(np.round, args=[10]) # 10 decimals
        measures2 = this.measurements.fillna(0).apply(np.round, args=[10])
        return measures1.eq(measures2).all().all()

    def __getitem__(self, name):
        if isinstance(name, str):
            name = [name]
        if len(name)==1:
            name = name[0]
            df = self.df[self.df.Protein == name].copy()
            if len(df)==0:
                # try someting else based on identifier instead of protein name
                df =  self.df[self.df.Identifier == name].copy()
        elif len(name) == 2:
            protein = name[0]
            psite= name[1]
            df = self.df[(self.df.Protein == protein) & (self.df.Psite == psite)]
        else:
            raise ValueError("Expects only 1 or 2 parameters.")

        return df

    def __str__(self):
        import textwrap
        N = len([x for x in self.df.columns if x not in ['Sequence', 'Psite', 'Protein']])
        msg = "This dataframe contains {} columns in addition to the ".format(N) + \
            "standard columns Protein, Sequence, Psite"
        txt = "\n".join(textwrap.wrap(msg, 80))
        txt += "\nYour data contains {} unique proteins\n".format(len(set(self.df.Protein)))
        txt += "Your data contains {} combination of psites/proteins\n".format(self.df.shape[0])
        return txt
MS-DAS 0.9.0 documentation

Source code for msdas.readers

Search