Source code for msdas.annotations

# -*- python -*-
#
#  This file is part of MS-DAS software
#
#  Copyright (c) 2014 - EBI-EMBL
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>, Marti Bernardo Faura
#  bernardo@ebi.ac.uk
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#
##############################################################################
import os
import textwrap
import types

import pandas as pd
import pylab
from bioservices import UniProt

from msdas.readers import MassSpecReader



__all__ = ["Annotations", "AnnotationsYeast"]


class Annotations(MassSpecReader):
    """Create/store/read annotations from uniprot and figure out entry names

    The Annotations class allows one to populate the dataframe attribute
    :attr:`df` with the **Entry** and **Entry_name**  columns (UniProt entries).
    These columns are not strictly required but enable more tools when they are
    available. The class also creates a new dataframe called :attr:`annotations`
    that stores in particular the protein sequence and the GO terms. The former
    being used to check the peptide sequence and the latter to plot relevant
    histogram about GO terms.

    This class inherits from :class:`msdas.readers.MassSpecReader`. Consequently,
    input can be a MassSpecReader instance, or a filename or even nothing (data
    can be read at a later stage). The dataframe must contain the Protein column.

    One reason to fetch the entries from UniProt is that the protein column name
    may contain typos or non-uniprot entries, therefore it is quite useful to
    fetch all entries from uniprot based on the protein name provided. This can
    be done thanks to the :meth:`get_uniprot_entries`. This method fills a dictionary called
    :attr:`_mapping` (note the underscore), which is used to populate a new column
    in the dataframe called **Entry**.

    If your initial dataframe contains the columns "Entry" with all valid UniProt
    entries (e.g., P23300) then the :attr:`_mapping` attribute is populated during the
    initialisation and the call to :meth:`get_uniprot_entries` can be skipped.
    If called, it will also be faster but will overwrite the content of the Entry column.
    You can also fill/correct/complete the :attr:`_mapping` attribute before calling
    :meth:`get_uniprot_entries`

    .. doctest::

        >>> from msdas import annotations
        >>> import pandas as pd
        >>> df = pd.DataFrame({
            'Protein':['DIG1'],
            'Sequence_Phospho':['SS(Phospho)T'],
            'Psite':['S2']})
        >>> a = annotations.Annotations(df, "YEAST")
        >>> a._mapping
        {}
        >>> a.get_uniprot_entries()
        {'DIG1_YEAST': ['Q03063']}
        >>> a.df.Entry
        0    Q03063
        Name: Entry, dtype: object

    Then, call :meth:`set_annotations`, which will fetch all annotations from
    uniprot and store them in a new dataframe in the :attr:`annotations` attribute

    ::

        a.set_annotations()
        a.annotations

    A new field called **Entry_name** is also added to the dataframe itself::

        a.df.Entry_name

    On a big data set, it may take a few minutes to fetch all information from uniprot.
    So, we also provide tools to save and read back the relevant information (
    :meth:`read_annotations`, :meth:`to_pickle`, :meth:`to_csv`  ) ::

        from msdas import *
        r = readers.MassSpecReader(get_yeast_raw_data())
        # this takes about 10 minutes depending on the connection for 1600 unique protein names
        r.get_uniprot_entries()
        r.set_annotations()
        r.to_pickle(tag="test") # creates a file called YEAST_annotations_test.pkl
        r.to_csv("data.csv")

    Next time, just type::

        from msdas import *
        a = annotations.Annotations("data.csv", "YEAST")
        a.read_annotations("YEAST_annotations_test.pkl")

    To check that the entries are correct, one thing that can be done is to
    look for the peptide sequence into the FASTA sequence found in the annotations::

        a.check_entries_versus_sequence()

    This is a very good sanity check to verify that the entry names found
    correspond to the peptide provided. If not, the protein name was probably wrong
    or was a gene name that could not be mapped correctly to the correct protein.

    If some entries are not found or mapping was not found, you need to
    manually check the issues and update the :attr:`_mapping` attribute,
    update the uniprot entries and annotations::

        a._mapping[entry] = ['entry name']
        a.get_uniprot_entries()
        a.set_annotations()
        a.check_entries_versus_sequence()

    if you cannot find a mapping, we would recommend to delete the item from
    the dataframe :attr:`df`.
    """
    def __init__(self, data, organism=None, verbose=True, annotations=None, **kargs):
        """.. rubric:: Constructor

        :param data: any input accepted by :class:`msdas.readers.MassSpecReader`
            (e.g., a CSV filename, None, or a MassSpecReader instance).
        :param organism: UniProt organism identifier appended to protein names
            (e.g., HUMAN, YEAST). Mandatory despite the None default.
        :param verbose: forwarded to :class:`msdas.readers.MassSpecReader`
        :param annotations: optional filename of pickled annotations, as saved
            by :meth:`to_pickle`; read with :meth:`read_pickle` when provided.
        :param kargs: any other parameter recognised by
            :class:`msdas.readers.MassSpecReader`
        :raises ValueError: if *organism* is None

        """
        # The reader is initialised first; the organism check comes afterwards.
        super(Annotations, self).__init__(data=data, verbose=verbose, **kargs)
        if organism is None:
            raise ValueError("organism must be provided e.g. YEAST, HUMAN")
        self.organism = organism

        #: the dataframe where annotations from uniprot will be stored.
        self.annotations = None

        # "<protein>_<organism>" -> list of candidate UniProt entries.
        # Pre-filled from the dataframe when it already has an Entry column.
        self._mapping = {}
        self.build_mapping_from_df()

        self._init_uniprot()

        if annotations:
            self.read_pickle(annotations)

    def _init_uniprot(self):
        if hasattr(self, "_uniprot") == False:
            self._uniprot = UniProt(verbose=self.debugLevel)

    def _update_species_to_find(self):
        entry_names = [x + "_" + self.organism for x in self.df.Protein]
        #unique_entry_names = list(set(entry_names))

        species_to_find = [k for k in entry_names if k not in self._mapping.keys()]
        species_to_find = list(set(species_to_find))
        self._species_to_find = list(set(species_to_find))

[docs]    def build_mapping_from_df(self):
        """Populate the _mapping dictionary using the Uniprot Entry column"""
        if "Entry" in self.df.columns:
            for index in self.df.index:
                k = self.df.Protein.ix[index]
                if k.endswith("_"+self.organism) == False:
                    k += "_" + self.organism
                v = self.df.Entry.ix[index]
                self._mapping[k] = [v]
        else:
            self.warning("Entry column not found in the dataframe. call get_uniprot_entries")

[docs]    def get_uniprot_entries(self, Nmax=50):
        """Search for the uniprot entries and entry names given protein column.

        Protein names from the dataframe are first used to feed uniprot mapping tool.
        Some protein names won't be found as a uniprot entry because there are
        not uniprot entry name but gene names.  We therefore also scan
        missing entries by looking for gene names. Once found, the proposed
        items that contain the gene names and organism are candidates for the
        entry names. There may be several solutions though, which explain why
        the values in the :attr:`_mapping` dictionary  are made of lists. If several
        candidates are found, warning and raised.

        Results are stored in :attr:`_mapping` and in the dataframe itself.

        Let us show one example with 3 protein names that cover all cases:

            * DIG1, is a valid uniprot entry
            * ASC1 is not a uniprot entry. It is a gene name from which the entry
              may be retrieved automatically.
            * LEU1 is a gene name AND a uniprot entry. This is an ambiguous
              case. The default is to use the uniprot entry but if you call
              :meth:`check_entries_versus_sequence` (after meth:`set_annotations`)
              you will see that there is a mismatch meaning that LEU1_YEAST provided
              in the protein column is catually not the protein name but the gene name


        ::

            >>> import pandas as pd
            >>> from msdas import *
            >>> df = pd.DataFrame({'Protein':['DIG1', 'LEU1', 'ASC1'],
                'Sequence_Phospho':['S(Phospho)APAQVTQHSK', 'VEVTS(Phospho)EDEK',
                                    'DS(Phospho)VTIISAGNDK'],
                'Psite':['S142','S495', 'S166']})
            >>> a = Annotations(df, "YEAST)
            >>> a.get_uniprot_entries()
            >>> a._mapping
            {'ASC1_YEAST': ['P38011', 'P01120'],
             'DIG1_YEAST': ['Q03063'],
             'LEU1_YEAST': ['P06208-1', 'P06208']}

        Here, DIG1 has one unique entry. This is expected because DIG1 is in fact
        an entry name (unique by definition). ASC1 is a gene name. This method
        figures out that it correspond to either P38011 or P01120. There are
        several entries because mapping from gene to protein is not unique.
        By default, the entry with highest score appears first. There is no 100% guarantee
        that this mapping is correct and :meth:`check_entries_versus_sequence`
        should be called to check that the peptide sequence is contained in
        this entry sequence. The last case (LEU1) is even more problematic because
        it is a valid entry name even though the protein name provided is actually
        a gene name... again call :meth:`check_entries_versus_sequence`.


            >>> a.set_annotations()
            >>> a.check_entries_versus_sequence()
            P06208-1 not found in the annotations index

        So, here we are told that amongst the 3 entries, P06208-1 is not found.
        This the LEU1 case. If you were to use batch tool, you would figure out
        given the peptide sequence that this is actually LEUC_YEAST entry with
        uniprot entry LEUC_YEAST/P07264.

        So, you need to manually update the mapping:

            >>> a._mapping['LEU1_YEAST'] = ['P07264']
            >>> a.get_uniprot_entries() # to update the main df with new entries
            >>> a.set_annotations() # to retrieve the sequence of LEUC_YEAST
            >>> a.check_entries_versus_sequence()

        .. seealso:: :meth:`set_annotations`

        """
        # get the mapping using bioservices.uniprot

        # apply function is 3 times slower than list...
        # entry_names = self.df.Protein.apply(lambda x: x + "_" + self.organism)
        self._update_species_to_find()
        if len(self._species_to_find)>0:
            self.logging.info("Fetching uniprot accession numbers for %s entries" % len(self.df.Protein))
            self.logging.info("Fetching uniprot accession numbers for %s unique entries" % len(self.df.Protein.unique()))
            mapping = self._uniprot.multi_mapping(fr="ID", to="ACC",Nmax=Nmax,
                                              query=self._species_to_find)
            for k,v in mapping.iteritems():
                if k not in self._mapping.keys():
                    self._mapping[k] = v

        # some species may not be found (secondary accession number) if _human
        # appended in tcell case. so we may need to call again the mapping but
        # without the appended organism string.
        self._update_species_to_find()
        if len(self._species_to_find):
            self.logging.info("Some species were not found ({}). Using secondary accession:".format(len(self._species_to_find)))
            self.logging.info("Fetching uniprot without trailing species")
            self.logging.info("Fetching  %s new ones " % len(self._species_to_find))
            mapping = self._uniprot.multi_mapping(fr="ID", to="ACC",
                    query=[x.split("_")[0] for x in self._species_to_find], Nmax=Nmax)
            for k,v in mapping.iteritems():
                self._mapping[k+ "_" + self.organism] = v

        # Some are not yet found. this could be because the provided protein name is actually a
        # gene name...
        def func(x, tag):
            if len(x)==0:
                return False
            else:
                return tag in x[0].split()

        self._genes = {}
        self._update_species_to_find()
        if len(self._species_to_find):

            self.logging.info("Some species are still not found {}. Trying to use gene names".format(len(self._species_to_find)))
            self.logging.info("Fetching uniprot accession numbers for those without _species appended")
            self.logging.info("Fetching  %s new ones " % len(self._species_to_find))
            for i,this in enumerate(self._species_to_find):
                if " " in this:
                    continue
                self.logging.info("Searching for entry {}/{} for gene names".format(i+1,len(self._species_to_find)))
                df = self._uniprot.get_df(this.split("_")[0], organism=self.organism)
                l1 = df['Gene names'].apply(lambda x : func(x,this.split("_")[0] ))
                l2 = df['Entry name'].apply(lambda x: x.endswith(self.organism))

                if sum(l1&l2) >= 1:
                    k = list(df.ix[l2&l1]['Entry name'])
                    v = list(df.ix[l2&l1]['Entry'])
                    self.logging.debug(k, v, this)
                    if k in self._mapping.keys():
                        raise ValueError("!!!!!!!%s Already in the dictionary " % k)
                    #self._mapping[k] = v
                    self._mapping[this] = v
                else:
                    print("skipping %s... sum=%s" % (this, sum(l1&l2)))

        self._update_species_to_find()
        if len(self._species_to_find):
            self.logging.info("Some species were not found. Using gene names")
            self.logging.info("Fetching uniprot accession numbers for those without _species appended")
            self.logging.info("Fetching  %s new ones " % len(self._species_to_find))

        self._append_uniprot_entries_to_df()

    def _append_uniprot_entries_to_df(self):

        if "Entry" in self.df.columns:
            self.logging.warning("Overwritting column called Entry in the dataframe")

        # get list of unique entry names.
        entry_names = self.df.Protein.apply(lambda x: x + "_" + self.organism)
        #remapping = [(k,v[0]) for k,va.g in a._mapping.iteritems()]
        # add into dataframe the uniprot entries but must have same order as
        # in the dataframe (entry_names)
        uniprot_entries = []
        for name in entry_names:
            # if not found, let us use unknown as a label. could use NA?
            uniprot_entry = self._mapping.get(name, "")
            if uniprot_entry == "":
                uniprot_entries.append("")
                print("!! ", name, " not found")
            else:
                if len(uniprot_entry)>1:
                    self.logging.info("Found entry with several matches: %s %s . Only first one is selected (highest uniprot score)" % (name,
                                      uniprot_entry))
                uniprot_entries.append(uniprot_entry[0])

        # index=df.index is important to use the join afterwards
        thisdf = pd.DataFrame({'Entry': uniprot_entries}, index=self.df.index)
        if "Entry" in self.df.columns:
            del self.df['Entry']
        self.df = self.df.join(thisdf)

    def _append_uniprot_entry_names_to_df(self):

        if isinstance(self.annotations, types.NoneType) == True:
            self.error("must call set_annotations first")
            return
        # let us add the Entry_name column as well
        entry_names = [self.annotations.ix[e]['Entry name'] if e in self.annotations.index else "" for e in self.df.Entry]
        self.df['Entry_name'] = entry_names

[docs]    def plot_goid_histogram(self, drop_duplicates=True):
        """Histrogram of the number of GO terms per petide or protein

        :param drop_duplicates: ignore duplicates entries

        .. plot::
            :width: 80%
            :include-source:

            from msdas import *
            m = Annotations(get_yeast_small_data(), "YEAST", verbose=False)
            m.set_annotations()
            m.plot_goid_histogram()

        .. todo:: is this functional process or not

        """
        if self.annotations is False:
            raise AttributeError(self._error_messages['annotations'])
        if drop_duplicates:
            entries = self.df.Entry.drop_duplicates()
        counter = self.annotations.ix[entries]['Gene ontology IDs'].apply(lambda x: len(x))

        M = counter.max()
        # if we want the GO per peptides, then we need to look at the original
        # dataframe that contains several psites per peptide. UniProt_entry is
        # not a set so values from counter may be duplicated, which is what we
        # want for this first figure
        duplicated_counter = [counter[x] for x in self.df.Entry]
        pylab.figure(1)
        pylab.clf()
        pylab.hist(duplicated_counter, bins=[x+.5 for x in range(0,M+1)])
        pylab.title("Distribution of number of GO id terms per peptide")
        pylab.grid()

        # annotations contains the unique protein entry, so here we get the number of GO terms per protein
        counter = self.annotations['Gene ontology IDs'].apply(lambda x: len(x))
        M = counter.max()
        pylab.figure(2)
        pylab.clf()
        pylab.hist(counter, bins = [x+.5 for x in range(0,M+1)])
        pylab.title("Distribution of number of GO id terms per protein")
        pylab.grid()

[docs]    def set_annotations(self, Nmax=100):
        """Fetched all information from uniprot and set :attr:`annotations`
        as a pandas dataframe.


        Look into the dataframe Entry column and update the annotations dataframe
        to populate missing entries. The Entry column in the :attr:`df` should have been
        populated by :meth:`get_uniprot_entries` with valid entries from Uniprot.

        If you have thousand of entries, this is taking a few minutes. You can
        save the annotations and read them back using :meth:`msdas.MassSpecReader.read_annotations`
        and :meth:`to_pickle`.


        """
        self.logging.info("Fectching information from uniprot. Takes some time")

        #could split if too long
        entries = [this for this in list(set(self.df.Entry)) if this]

        # not need to search again if already present in the attribute
        if self.annotations is not None:
            entries = [x for x in entries if x not in list(self.annotations.index)]

        if len(entries)==0:
            self.warning("No new entries found. Your annotations dataframe is already up-to-date")
            self.annotations.drop_duplicates(subset="Entry name", inplace=True)
            self._append_uniprot_entry_names_to_df()
            return

        annotations = self._uniprot.get_df(entries, nChunk=Nmax)
        annotations = annotations[annotations.Entry.apply(lambda x: x in entries)]

        if len(annotations) == 0:
            raise ValueError("your list of protein is empty")
        self.logging.info("Fectching {}".format(len(annotations)))
        annotations.set_index(["Entry"], inplace=True)
        if self.annotations is None:
            self.annotations = annotations
        else:
            self.annotations = self.annotations.append(annotations)
        #self.annotations.set_index(["Entry"], inplace=True)
        self.logging.info("Annotations have been loaded. You can save the annotations" +
            " dataframe attribute using x.to_pickle('annotations.pkl') " +
            " Next time, you could just load if using \n\n" +
            "     >>> m = readers.MassSpecReader(filename, mode='yeast')\n" +
            "     >>>  m.read_annotations('annotations.pkl')")

        #indices are the uniprot entry. Some may be identical with slightly different columns
        # but the entry name should be unique. Here, we keep the first instance of each entry
        self.annotations.drop_duplicates(subset="Entry name", inplace=True)
        self._append_uniprot_entry_names_to_df()

[docs]    def to_pickle(self, tag=None, overwrite=False):
        """Save annotations dataframe as a pickle

        :param tag: a tag to append to the name of the annotations file.
        :param overwrite: overwrite file if it exists

        filename is going to be organism_annotations_tag.pkl
        """
        filename = self.organism + "_annotations"
        if tag != None and isinstance(tag, str):
            filename += "_" + tag
        filename += ".pkl"
        if overwrite == False:
            if os.path.exists(filename):
                raise IOError("file %s already exists" % filename)
        self.annotations.to_pickle(filename)

[docs]    def read_pickle(self, filename):
        """Read annotations in pickled format as saved by :meth:`to_pickle`

        :param str filename: filename to read
        """
        try:
            self.annotations = pd.read_pickle(filename)

            # update the mapping dictionary
            for k,v in self.annotations['Entry name'].iteritems():
                if k not in self._mapping.keys():
                    self._mapping[v] = [k]
        except:
            self.logging.error("Could not read your file. Expected a pkl \
            containing a dataframe with Entry name and index being uniprot \
            indices. ")

[docs]    def hist_most_relevant_goids(self, N=10, tight_layout=True, wrap_length=40,
                                 drop_duplicates=True, **kargs):
        """Plot histogram of the GO identifiers found in all proteins.

        :param int N: restrict histogram to terms that appear at least N times
        :param int wrap_length:  wrap text on the y-axis by wrap_length (defaults to 40)
        :param drop_duplicates: drop the duplicated entries
        :param kargs: pandas.plot arguments accepted.

        .. plot::
            :include-source:
            :width: 80%

            from msdas import *
            m = Annotations(get_yeast_small_data(), "YEAST", verbose=False)
            m.set_annotations()
            m.hist_most_relevant_goids(N=5)


        .. todo:: this is made on the annotations dataframe. Should be
            done based on the entry names in the dataframe

        """
        if self.annotations is False:
            raise AttributeError(self._error_messages['annotations'])
        kargs['legend'] = kargs.get("legend", False)


        if drop_duplicates:
            entries = self.df.Entry.drop_duplicates()

        goids = [y for x in self.annotations.ix[entries]['Gene ontology (GO)'] for y in x]
        uniq_goids = set(goids)
        names = [x for x in uniq_goids]

        # let us wrap the string by 40 character max to avoid long labels in the figure
        names = ["\n".join(textwrap.wrap(name, width=wrap_length)) for name in names]

        count = [goids.count(x) for x in uniq_goids]
        df = pd.DataFrame({'name':names, 'size':count}, index=range(0, len(uniq_goids)))
        if N:
            subdf = df[df['size']>N].set_index("name")
        subdf.sort("size").plot(kind="barh", **kargs)
        if tight_layout:
            pylab.tight_layout()

[docs]    def check_entries_versus_sequence(self):
        """Check that peptide sequence are contained in uniprot sequence

        This is a very good sanity check on the validity of the uniprot entry names
        found by :meth:`get_uniprot_entries` method

        If a peptide sequence is not found, it means that the protein name is
        not correct.

        See AnnotationsYeast class where the :meth:`AnnotationsYeast.update_mapping` is used to
        update the incorrect mapping.

        .. seealso:: :meth:`find_sequence_blast`

        """

        self.logging.info("Comparing peptide sequence in the attribute df with sequences in the annotations")
        self.logging.info("row index, protein name, uniprot entry")
        if isinstance(self.annotations, types.NoneType):
            raise Exception("annotations not set. call set_annotations")
        found = False
        for i in self.df.index:
            entry = self.df.ix[i].Entry
            if entry not in self.annotations.index:
                print("{} not found in the annotations index".format(entry))
                continue
            if self.df.ix[i].Sequence not in self.annotations.ix[entry].Sequence:
                if found == False:
                    print("Found unknown entries\nindex, protein name, uniprot entry ")
                    found = True
                print(i, self.df.ix[i].Protein, self.df.ix[i].Entry)

[docs]    def find_sequence_blast(self, seq, email):
        """Utility to search for a sequence using BLAST via bioservices

        :param str seq: the sequence
        :param email: a valid email address

        .. note:: This is using NCIBlast web service via
            `BioServices <https://pypi.python.org/pypi/bioservices>`_.
        """
        from bioservices import NCBIblast
        s = NCBIblast(verbose=self.level)
        jobid = s.run(program="blastp", sequence=seq, stype="protein",
                      database="uniprotkb", email=email)
        return s.getResult(jobid, "out")

[docs]    def to_csv(self, filename):
        """Export the dataframe with data and annotations into a CSV file

        :meth:`set_annotations` and :meth:`get_uniprot_entries` must have been called.


        """
        if "Entry" not in self.df.columns or "Entry_name" not in self.df.columns:
            raise ValueError("Entry or Entry_name missing in dataframe. You must call get_entries_uniprot and set_annotations methods")
        self.df.Identifier = self.df.Protein + "_" + self.df.Psite
        self.df.to_csv(filename, index=False, sep=",")




class AnnotationsYeast(Annotations):
    """Class dedicated to the YEAST data analysis

    This class is almost identical to :class:`Annotations`. It contains
    extra code to cleanup the mapping based on further manual investigations
    of the gene mapping to protein, which may be ambiguous (see
    :meth:`Annotations.get_uniprot_entries` for details).

    ::

        from msdas import *
        from easydev import gsf
        filename = gsf("msdas", "data", "YEAST_raw_sample.csv")
        r = MassSpecReader(filename)
        a = AnnotationsYeast(r)
        a.get_uniprot_entries()
        a.set_annotations()


    Only 80% of the protein names are found directly using UniProt.
    The 20% remaining are actually gene names on which a mapping to protein has to be done.
    Yet, sometimes, there is an ambiguity that remains either because the gene name
    is also a valid entry name or because the gene maps to several entry names.
    This list gives some of these ambiguities. The first one is used by default
    (based on highest score) but may not be correct.
    See for instance :meth:`Annotations.check_entries_versus_sequence` to help
    you figuring out which one is the correct one.

        * ALD3_YEAST ['P54114', 'P40047']
        * ALD4_YEAST ['P46367', 'P54114']
        * CKI1_YEAST ['P20485', 'P23292']
        * CPR1_YEAST ['P14832', 'P16603']
        * PRS5_YEAST ['Q12265', 'P23638']
        * RPL16B_YEAST ['P26785', 'Q3E757', 'P05739']
        * RPL32_YEAST ['P38061', 'P14120']
        * RPL6A_YEAST ['Q02326', 'P05737']
        * RPS7A_YEAST ['P26786', 'P0CX36']
        * RPS7B_YEAST ['P48164', 'P0CX35']
        * CPR1_YEAST ['P14832', 'P16603']
        * ECM17_YEAST ['P40458', 'P47169']
        * RPL16B_YEAST ['P26785', 'Q3E757', 'P05739']
        * RPS7B_YEAST ['P48164', 'P0CX35']
        * ASC1_YEAST ['P38011', 'P01120']
        * ECM17_YEAST ['P40458', 'P47169']


     :Notes on the data:

        * NPL3_356^S349 has a wrong psite name. Given the sequence, it should be NPL3_S349
        * Same issue with TIF3 (IF4B_YEAST) where trailing number without phospho was removed by hand.
        * FLO9 1004^554^464^374^T329^T779, T(Phospho)GTFTSTSTEM(Oxidation)TTVTGTNGQPTDETVIVI should be T779
        * One entry is ADE5;7, which is named this way so as not to clash with the CSV format. The proper
          name is indeed ADE5,7. We renamed it in the file as PUR2_YEAST. We checked the sequence
        * To find the mapping, we used blast from bioservices to figure out the sequence of
          the protein and checked on uniprot. See update_mapping function
        * Typo in the original code for ABP1 peptide: small e was found.
          KEPVKT eSP APAAK should be KEPVKT PSP APAAK
        * Possible typo is STE11_S323 location  should be S326
        * Also possible typoe IMP2 has 2 rows called IMP2' (note the quote)


    Here are proteins names provided in the Yeast_data_set that are actually gene
    names. Using bioservices, we figure out possible uniprot entries but similarlly
    to the YEF3 there is maybe an ambiguity on the name::

        77 P54114 ALD4   could be ALDH4_YEAST ([') whereas P54114 is ALD3
        2077 P23638 PRS5    KPR5_YEAST  '
        6664 P01120 ASC1  GBLP_YEAST  P38011

    there are 6 peptide labelled CTR9. In fact, it is 2 different peptides. first 4 are CTR9
    2 last are EAMAISEHNVKDDSDLSDKDNEYDEEQPR . This is defintely CTR9 but is not exactly in
    the sequence. Missing K at the end

    SIR1_YEAST exists but the peptide sequence cannot be found either using blast or manual search on uniprot
    possibly sirp1 but not yeast organism; it is yeasb. Actually may still be SIR1
    but Uniprot changed the sequence. See personal communication with uniprot.


    """
    def __init__(self, data,  verbose=True, annotations=None, **kargs):
        """.. rubric:: Constructor

        Same as :class:`Annotations` except that organism must not be provided
        (it is set to YEAST).

        If exactly one row of the dataframe has the protein name ``ADE5;7``,
        it is renamed into ``PUR2`` and the identifiers are rebuilt.

        :raises NotImplementedError: if several rows are named ``ADE5;7``
        """
        super(AnnotationsYeast, self).__init__(data=data, organism="YEAST",
            verbose=verbose, annotations=annotations, **kargs)

        to_rename = self.df.Protein == "ADE5;7"
        nb_found = int(to_rename.sum())
        if nb_found > 1:
            raise NotImplementedError
        if nb_found == 1:
            self.warning("renaming ADE5;7 (should be ADE5,7 anyway) into PUR2")
            # boolean-mask assignment replaces the removed DataFrame.set_value
            self.df.loc[to_rename, "Protein"] = "PUR2"
            self._rebuild_identifier()

[docs]    def update_mapping(self):
        """Update the mapping with known keys

        There are issues in the naming because of a mixing of protein and gene
        names. Methods in :class:`Annotations` found most of the mapping.
        However, there are some ambiguities and the mappind dictionary is corrected
        as follows. Checked with uniprot and blast.

        we remove "ADE5,7_YEAST" (replaced by PUR2 if found)

        ================ ======== ============================================
        ================ ======== ============================================
        ADE5,7_YEAST     P07244   PUR2_YEAST
        YEF3_YEAST       P16521
        FEN1_YEAST       P25358   FEN1 is actually ELO2_YEAST
        HIS7_YEAST       P33734   HIS7 is actually HIS5_YEAST
        LEU1_YEAST       P07264   this is LEUC_YEAST P07264
        NTH1_YEAST       P32356   TREA_YEAST
        NTH2_YEAST       P35172   TREB_YEAST
        ECM17_YEAST      P47169   MET5_YEAST
        RPL6A_YEAST      Q02326   RL6A_YEAST
        RPS7A_YEAST      P26786   RS7A_YEAST
        YJU2_YEAST       P28320   CWC16_YEAST no blast results but checked
        ASC1_YEAST       P38011   GBLP_YEAST
        PSA1_YEAST       P41940   no blast result; gene name MPG1_YEAST
        CTR9_YEAST       P89105   CTR9_YEAST correct  but see note below
        IMP2_YEAST       P32351   IMPX_YEAST no blast result;IMP2 is gene name
        ================ ======== ============================================


        .. note:: there is also a protein called IMP2' (note the quote), which
            presumably is also IMP2. Kept as it is fow now


         """
        if len(self._mapping) == 0:
            raise ValueError("You should call set_annotations and get_uniprot first")
        #self._mapping['ADE5,7_YEAST'] = ['P07244'] # PUR2_YEAST
        self._mapping['YEF3_YEAST'] = ["P16521"]  #
        self._mapping['FEN1_YEAST'] = ["P25358"]  # FEN1 is actually ELO2_YEAST
        self._mapping['HIS7_YEAST'] = ["P33734"]   # HIS5 is actually HIS5_YEAST
        self._mapping['LEU1_YEAST'] = ["P07264"]  # this is LEUC_YEAST P07264
        self._mapping['NTH1_YEAST'] = ['P32356']  # TREA_YEAST
        self._mapping['NTH2_YEAST'] = ['P35172']  # TREB_YEAST
        self._mapping['ECM17_YEAST'] = ['P47169'] # MET5_YEAST
        self._mapping['RPL6A_YEAST'] = ['Q02326'] # RL6A_YEAST
        self._mapping['RPS7A_YEAST'] = ['P26786'] # RS7A_YEAST
        self._mapping['YJU2_YEAST'] = ['P28320']  # CWC16_YEAST no blast results but checked
        self._mapping['ASC1_YEAST'] =  ['P38011'] # GBLP_YEAST
        self._mapping['PSA1_YEAST'] = ['P41940']  # no blast result; a gene name corresponding to MPG1_YEAST
        self._mapping['CTR9_YEAST'] = ['P89105']  # CTR9_YEAST  P89105 this is correct  but see note documetnation
        self._mapping['IMP2_YEAST'] = ['P32351'] # IMPX_YEAST no blast result; IMP" is gene name
MS-DAS 0.9.0 documentation

Source code for msdas.annotations

Search