MS-DAS 0.9.0 documentation

Source code for msdas.annotations

# -*- python -*-
#
#  This file is part of MS-DAS software
#
#  Copyright (c) 2014 - EBI-EMBL
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>, Marti Bernardo Faura
#  bernardo@ebi.ac.uk
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#
##############################################################################
import os
import textwrap
import types

import pandas as pd
import pylab
from bioservices import UniProt

from msdas.readers import MassSpecReader



# Public API of the module: only the two annotation classes are exported by
# ``from msdas.annotations import *``.
__all__ = ["Annotations", "AnnotationsYeast"]


class Annotations(MassSpecReader):
    """Create/store/read annotations from uniprot and figure out entry names

    The Annotations class allows one to populate the dataframe attribute
    :attr:`df` with the **Entry** and **Entry_name** columns (UniProt
    entries). These are not strictly speaking required columns but more
    tools are available when they are present.

    The class also creates a new dataframe called :attr:`annotations` that
    stores in particular the protein sequence and the GO terms. The former
    is used to check the peptide sequence and the latter to plot relevant
    histograms about GO terms.

    This class inherits from :class:`msdas.readers.MassSpecReader`.
    Consequently, input can be a MassSpecReader instance, or a filename or
    even nothing (data can be read at a later stage). The dataframe must
    contain the Protein column.

    One reason to fetch the entries from UniProt is that the protein column
    may contain typos or non-uniprot entries, therefore it is quite useful to
    fetch all entries from uniprot based on the protein name provided. This
    can be done thanks to :meth:`get_uniprot_entries`. This method fills a
    dictionary called :attr:`_mapping` (note the underscore), which is used
    to populate a new column in the dataframe called **Entry**.

    If your initial dataframe contains the column "Entry" with all valid
    UniProt entries (e.g., P23300) then the :attr:`_mapping` attribute is
    populated during the initialisation and the call to
    :meth:`get_uniprot_entries` can be skipped. If called, it will also be
    faster but will overwrite the content of the Entry column. You can also
    fill/correct/complete the :attr:`_mapping` attribute before calling
    :meth:`get_uniprot_entries`

    .. doctest::

        >>> from msdas import annotations
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...     'Protein':['DIG1'],
        ...     'Sequence_Phospho':['SS(Phospho)T'],
        ...     'Psite':['S2']})
        >>> a = annotations.Annotations(df, "YEAST")
        >>> a._mapping
        {}
        >>> a.get_uniprot_entries()
        >>> a._mapping
        {'DIG1_YEAST': ['Q03063']}
        >>> a.df.Entry
        0    Q03063
        Name: Entry, dtype: object

    Then, call :meth:`set_annotations`, which will fetch all annotations from
    uniprot and store them in a new dataframe in the :attr:`annotations`
    attribute ::

        a.set_annotations()
        a.annotations

    A new field called **Entry_name** is also added to the dataframe itself::

        a.df.Entry_name

    On a big data set, it may take a few minutes to fetch all information
    from uniprot. So, we also provide tools to save and read back the
    relevant information (:meth:`read_pickle`, :meth:`to_pickle`,
    :meth:`to_csv`) ::

        from msdas import *
        a = annotations.Annotations(get_yeast_raw_data(), "YEAST")
        # this takes about 10 minutes depending on the connection for 1600
        # unique protein names
        a.get_uniprot_entries()
        a.set_annotations()
        a.to_pickle(tag="test")  # creates YEAST_annotations_test.pkl
        a.to_csv("data.csv")

    Next time, just type::

        from msdas import *
        a = annotations.Annotations("data.csv", "YEAST")
        a.read_pickle("YEAST_annotations_test.pkl")

    To check that the entries are correct, one thing that can be done is to
    look for the peptide sequence into the FASTA sequence found in the
    annotations::

        a.check_entries_versus_sequence()

    This is a very good sanity check to verify that the entry names found
    correspond to the peptide provided. If not, the protein name was probably
    wrong or was a gene name that could not be mapped correctly to the
    correct protein.

    If some entries are not found or mapping was not found, you need to
    manually check the issues and update the :attr:`_mapping` attribute,
    update the uniprot entries and annotations::

        a._mapping[entry] = ['entry name']
        a.get_uniprot_entries()
        a.set_annotations()
        a.check_entries_versus_sequence()

    If you cannot find a mapping, we would recommend to delete the item from
    the dataframe :attr:`df`.

    """
    def __init__(self, data, organism=None, verbose=True, annotations=None,
                 **kargs):
        """.. rubric:: Constructor

        :param data: a MassSpecReader compatible input (e.g., CSV file, None,
            a MassSpecReader instance). See
            :class:`msdas.readers.MassSpecReader` documentation for details
        :param organism: valid uniprot identifier for the organism e.g.,
            HUMAN YEAST.
        :param verbose: forwarded to :class:`msdas.readers.MassSpecReader`
        :param annotations: a pickled file containing the annotations saved
            using :meth:`to_pickle`.
        :param kargs: valid parameter recognised by
            :class:`msdas.readers.MassSpecReader`
        :raises ValueError: if *organism* is not provided

        """
        super(Annotations, self).__init__(data=data, verbose=verbose, **kargs)
        if organism is None:
            raise ValueError("organism must be provided e.g. YEAST, HUMAN")
        self.organism = organism

        #: the dataframe where annotations from uniprot will be stored.
        self.annotations = None

        # entry name (e.g. DIG1_YEAST) -> list of candidate uniprot entries
        self._mapping = {}
        self.build_mapping_from_df()  # only effective if Entry is provided
        self._init_uniprot()
        if annotations:
            self.read_pickle(annotations)

    def _init_uniprot(self):
        """Create the UniProt service once and keep it in ``_uniprot``."""
        if not hasattr(self, "_uniprot"):
            self._uniprot = UniProt(verbose=self.debugLevel)

    def _update_species_to_find(self):
        """Store in ``_species_to_find`` the unique entry names
        (protein name + "_" + organism) that are not yet keys of
        :attr:`_mapping`."""
        entry_names = set(x + "_" + self.organism for x in self.df.Protein)
        self._species_to_find = [name for name in entry_names
                                 if name not in self._mapping]

    def _require_annotations(self):
        """Raise an AttributeError if :attr:`annotations` is not set yet."""
        if self.annotations is None:
            # _error_messages is not defined in this class; fall back on a
            # local message if the parent class does not provide one.
            messages = getattr(self, "_error_messages", {})
            raise AttributeError(messages.get(
                "annotations", "annotations not set. call set_annotations"))

    def build_mapping_from_df(self):
        """Populate the _mapping dictionary using the Uniprot Entry column

        Each protein name (suffixed by ``_<organism>`` if needed) is mapped
        to a single-element list made of the value found in the Entry
        column. A warning is issued if there is no Entry column.
        """
        if "Entry" not in self.df.columns:
            self.warning("Entry column not found in the dataframe. call get_uniprot_entries")
            return

        suffix = "_" + self.organism
        for protein, entry in zip(self.df.Protein, self.df.Entry):
            key = protein if protein.endswith(suffix) else protein + suffix
            self._mapping[key] = [entry]

    def get_uniprot_entries(self, Nmax=50):
        """Search for the uniprot entries and entry names given protein column.

        Protein names from the dataframe are first used to feed uniprot
        mapping tool. Some protein names won't be found as a uniprot entry
        because they are not uniprot entry names but gene names. We therefore
        also scan missing entries by looking for gene names. Once found, the
        proposed items that contain the gene names and organism are
        candidates for the entry names. There may be several solutions
        though, which explains why the values in the :attr:`_mapping`
        dictionary are made of lists. If several candidates are found, an
        information message is logged and the first one is used.

        Results are stored in :attr:`_mapping` and in the dataframe itself
        (**Entry** column). Nothing is returned.

        :param int Nmax: forwarded to the ``multi_mapping`` method of the
            bioservices UniProt instance.

        Let us show one example with 3 protein names that cover all cases:

            * DIG1, is a valid uniprot entry
            * ASC1 is not a uniprot entry. It is a gene name from which the
              entry may be retrieved automatically.
            * LEU1 is a gene name AND a uniprot entry. This is an ambiguous
              case. The default is to use the uniprot entry but if you call
              :meth:`check_entries_versus_sequence` (after
              :meth:`set_annotations`) you will see that there is a mismatch
              meaning that LEU1_YEAST provided in the protein column is
              actually not the protein name but the gene name

        ::

            >>> import pandas as pd
            >>> from msdas import *
            >>> df = pd.DataFrame({'Protein':['DIG1', 'LEU1', 'ASC1'],
            ...     'Sequence_Phospho':['S(Phospho)APAQVTQHSK',
            ...         'VEVTS(Phospho)EDEK', 'DS(Phospho)VTIISAGNDK'],
            ...     'Psite':['S142','S495', 'S166']})
            >>> a = Annotations(df, "YEAST")
            >>> a.get_uniprot_entries()
            >>> a._mapping
            {'ASC1_YEAST': ['P38011', 'P01120'],
             'DIG1_YEAST': ['Q03063'],
             'LEU1_YEAST': ['P06208-1', 'P06208']}

        Here, DIG1 has one unique entry. This is expected because DIG1 is in
        fact an entry name (unique by definition). ASC1 is a gene name. This
        method figures out that it corresponds to either P38011 or P01120.
        There are several entries because mapping from gene to protein is not
        unique. By default, the entry with highest score appears first. There
        is no 100% guarantee that this mapping is correct and
        :meth:`check_entries_versus_sequence` should be called to check that
        the peptide sequence is contained in this entry sequence. The last
        case (LEU1) is even more problematic because it is a valid entry name
        even though the protein name provided is actually a gene name...
        again call :meth:`check_entries_versus_sequence`::

            >>> a.set_annotations()
            >>> a.check_entries_versus_sequence()
            P06208-1 not found in the annotations index

        So, here we are told that amongst the 3 entries, P06208-1 is not
        found. This is the LEU1 case. If you were to use a blast tool, you
        would figure out given the peptide sequence that this is actually
        the LEUC_YEAST entry name with uniprot entry P07264. So, you need to
        manually update the mapping::

            >>> a._mapping['LEU1_YEAST'] = ['P07264']
            >>> a.get_uniprot_entries() # to update the main df with new entries
            >>> a.set_annotations() # to retrieve the sequence of LEUC_YEAST
            >>> a.check_entries_versus_sequence()

        .. seealso:: :meth:`set_annotations`

        """
        # Pass 1: consider the protein names as uniprot entry names
        # (NAME_ORGANISM) and use the uniprot mapping tool.
        self._update_species_to_find()
        if self._species_to_find:
            self.logging.info("Fetching uniprot accession numbers for %s entries" % len(self.df.Protein))
            self.logging.info("Fetching uniprot accession numbers for %s unique entries" % len(self.df.Protein.unique()))
            mapping = self._uniprot.multi_mapping(fr="ID", to="ACC",
                Nmax=Nmax, query=self._species_to_find)
            for name, entries in mapping.items():
                # never overwrite what is already known (e.g., set by user)
                if name not in self._mapping:
                    self._mapping[name] = entries

        # Pass 2: some species may not be found (secondary accession number)
        # if _human appended in tcell case. so we may need to call again the
        # mapping but without the appended organism string.
        self._update_species_to_find()
        if self._species_to_find:
            self.logging.info("Some species were not found ({}). Using secondary accession:".format(len(self._species_to_find)))
            self.logging.info("Fetching uniprot without trailing species")
            self.logging.info("Fetching %s new ones " % len(self._species_to_find))
            mapping = self._uniprot.multi_mapping(fr="ID", to="ACC",
                query=[x.split("_")[0] for x in self._species_to_find],
                Nmax=Nmax)
            for name, entries in mapping.items():
                self._mapping[name + "_" + self.organism] = entries

        # Pass 3: some are not yet found. This could be because the provided
        # protein name is actually a gene name...
        def has_gene(gene_names, tag):
            # only the first item of the 'Gene names' field is scanned
            if len(gene_names) == 0:
                return False
            return tag in gene_names[0].split()

        self._genes = {}
        self._update_species_to_find()
        if self._species_to_find:
            total = len(self._species_to_find)
            self.logging.info("Some species are still not found {}. Trying to use gene names".format(total))
            self.logging.info("Fetching uniprot accession numbers for those without _species appended")
            self.logging.info("Fetching %s new ones " % total)
            for i, this in enumerate(self._species_to_find):
                if " " in this:
                    continue
                self.logging.info("Searching for entry {}/{} for gene names".format(i+1, total))
                gene = this.split("_")[0]
                df = self._uniprot.get_df(gene, organism=self.organism)
                l1 = df['Gene names'].apply(lambda x: has_gene(x, gene))
                l2 = df['Entry name'].apply(lambda x: x.endswith(self.organism))
                selected = l1 & l2
                nfound = sum(selected)
                if nfound >= 1:
                    k = list(df.loc[selected, 'Entry name'])
                    v = list(df.loc[selected, 'Entry'])
                    # the message is formatted here: passing k, v, this as
                    # positional arguments made the logger try "k % (v, this)"
                    self.logging.debug("%s %s %s" % (k, v, this))
                    # The candidates are stored under the name provided by
                    # the user (not under the entry names in k). The former
                    # test "k in self._mapping.keys()" compared a list with
                    # the keys: it could never match (and is a TypeError
                    # with Python 3), so it has been dropped.
                    self._mapping[this] = v
                else:
                    print("skipping %s... sum=%s" % (this, nfound))

        # Final report on what is still missing after the 3 passes
        self._update_species_to_find()
        if self._species_to_find:
            self.logging.info("Some species were not found. Using gene names")
            self.logging.info("Fetching uniprot accession numbers for those without _species appended")
            self.logging.info("Fetching %s new ones " % len(self._species_to_find))

        self._append_uniprot_entries_to_df()

    def _append_uniprot_entries_to_df(self):
        """(Re)build the Entry column of :attr:`df` from :attr:`_mapping`.

        For each protein, the first candidate in the mapping is used. An
        empty string is used if there is no candidate.
        """
        if "Entry" in self.df.columns:
            self.logging.warning("Overwritting column called Entry in the dataframe")

        # add into dataframe the uniprot entries but must have same order as
        # in the dataframe
        uniprot_entries = []
        for protein in self.df.Protein:
            name = protein + "_" + self.organism
            candidates = self._mapping.get(name, "")
            # a missing key and an empty list of candidates are both
            # considered as "not found" (empty label).
            if not candidates:
                uniprot_entries.append("")
                print("!! ", name, " not found")
                continue
            if len(candidates) > 1:
                self.logging.info("Found entry with several matches: %s %s . Only first one is selected (highest uniprot score)" % (name, candidates))
            uniprot_entries.append(candidates[0])

        # index=df.index is important to use the join afterwards
        thisdf = pd.DataFrame({'Entry': uniprot_entries}, index=self.df.index)
        if "Entry" in self.df.columns:
            del self.df['Entry']
        self.df = self.df.join(thisdf)

    def _append_uniprot_entry_names_to_df(self):
        """Add the Entry_name column to :attr:`df` using the 'Entry name'
        column of :attr:`annotations` (empty string for unknown entries)."""
        if self.annotations is None:
            self.error("must call set_annotations first")
            return
        names = self.annotations['Entry name']
        known = self.annotations.index
        self.df['Entry_name'] = [names[e] if e in known else ""
                                 for e in self.df.Entry]

    def plot_goid_histogram(self, drop_duplicates=True):
        """Histogram of the number of GO terms per peptide or protein

        :param drop_duplicates: kept for backward compatibility. The number
            of GO terms is always looked up on unique entries, so this
            parameter does not change the figures (the value False used to
            fail with a NameError).
        :raises AttributeError: if :attr:`annotations` is not set

        Entries of the dataframe that are not in the annotations index
        (e.g., empty entries) are ignored.

        .. plot::
            :width: 80%
            :include-source:

            from msdas import *
            m = Annotations(get_yeast_small_data(), "YEAST", verbose=False)
            m.set_annotations()
            m.plot_goid_histogram()

        .. todo:: is this functional process or not

        """
        self._require_annotations()

        known = self.annotations.index
        entries = [e for e in self.df.Entry.drop_duplicates() if e in known]
        counter = self.annotations.loc[entries, 'Gene ontology IDs'].apply(len)
        M = counter.max()

        # if we want the GO per peptides, then we need to look at the original
        # dataframe that contains several psites per peptide. The Entry
        # column is not a set so values from counter may be duplicated, which
        # is what we want for this first figure
        duplicated_counter = [counter[x] for x in self.df.Entry
                              if x in counter.index]
        pylab.figure(1)
        pylab.clf()
        pylab.hist(duplicated_counter, bins=[x+.5 for x in range(0, M+1)])
        pylab.title("Distribution of number of GO id terms per peptide")
        pylab.grid()

        # annotations contains the unique protein entry, so here we get the
        # number of GO terms per protein
        counter = self.annotations['Gene ontology IDs'].apply(len)
        M = counter.max()
        pylab.figure(2)
        pylab.clf()
        pylab.hist(counter, bins=[x+.5 for x in range(0, M+1)])
        pylab.title("Distribution of number of GO id terms per protein")
        pylab.grid()

    def set_annotations(self, Nmax=100):
        """Fetch all information from uniprot and set :attr:`annotations` as
        a pandas dataframe.

        Look into the dataframe Entry column and update the annotations
        dataframe to populate missing entries. The Entry column in the
        :attr:`df` should have been populated by :meth:`get_uniprot_entries`
        with valid entries from Uniprot.

        If you have thousands of entries, this is taking a few minutes. You
        can save the annotations and read them back using :meth:`to_pickle`
        and :meth:`read_pickle`.

        :param int Nmax: forwarded as ``nChunk`` to the ``get_df`` method of
            the bioservices UniProt instance.
        :raises ValueError: if there is nothing to fetch and no annotations
            yet, or if uniprot returns none of the requested entries.

        """
        self.logging.info("Fectching information from uniprot. Takes some time")
        # could split if too long
        entries = [this for this in set(self.df.Entry) if this]

        # not need to search again if already present in the attribute
        if self.annotations is not None:
            known = set(self.annotations.index)
            entries = [x for x in entries if x not in known]

        if len(entries) == 0:
            if self.annotations is None:
                # nothing to fetch and nothing stored yet
                raise ValueError("your list of protein is empty")
            self.warning("No new entries found. Your annotations dataframe is already up-to-date")
            self.annotations.drop_duplicates(subset="Entry name", inplace=True)
            self._append_uniprot_entry_names_to_df()
            return

        annotations = self._uniprot.get_df(entries, nChunk=Nmax)
        # keep only the rows that were explicitly requested
        annotations = annotations[annotations.Entry.isin(entries)]
        if len(annotations) == 0:
            raise ValueError("your list of protein is empty")
        self.logging.info("Fectching {}".format(len(annotations)))
        annotations = annotations.set_index(["Entry"])

        if self.annotations is None:
            self.annotations = annotations
        else:
            self.annotations = pd.concat([self.annotations, annotations])

        self.logging.info("Annotations have been loaded. You can save the annotations" +
            " dataframe attribute using x.to_pickle('annotations.pkl') " +
            " Next time, you could just load if using \n\n" +
            " >>> m = readers.MassSpecReader(filename, mode='yeast')\n" +
            " >>> m.read_annotations('annotations.pkl')")

        # indices are the uniprot entry. Some may be identical with slightly
        # different columns but the entry name should be unique. Here, we
        # keep the first instance of each entry
        self.annotations.drop_duplicates(subset="Entry name", inplace=True)
        self._append_uniprot_entry_names_to_df()

    def to_pickle(self, tag=None, overwrite=False):
        """Save annotations dataframe as a pickle

        :param tag: a tag (string) to append to the name of the annotations
            file. Ignored if not a string.
        :param overwrite: overwrite file if it exists
        :raises IOError: if the file exists and *overwrite* is False
        :raises AttributeError: if :attr:`annotations` is not set

        filename is going to be organism_annotations_tag.pkl
        """
        filename = self.organism + "_annotations"
        if isinstance(tag, str):
            filename += "_" + tag
        filename += ".pkl"
        if not overwrite and os.path.exists(filename):
            raise IOError("file %s already exists" % filename)
        if self.annotations is None:
            raise AttributeError("annotations not set. call set_annotations")
        self.annotations.to_pickle(filename)

    def read_pickle(self, filename):
        """Read annotations in pickled format as saved by :meth:`to_pickle`

        :param str filename: filename to read

        The :attr:`_mapping` dictionary is completed with the entry names
        found in the file; existing items are not overwritten. If the file
        cannot be read or has no 'Entry name' column, an error is logged and
        the current annotations are left untouched.

        .. warning:: unpickling may execute arbitrary code. Only read files
            that you trust.
        """
        try:
            annotations = pd.read_pickle(filename)
            entry_names = annotations['Entry name']
        except Exception:
            self.logging.error("Could not read your file. Expected a pkl "
                "containing a dataframe with Entry name and index being "
                "uniprot indices. ")
            return

        self.annotations = annotations
        # update the mapping dictionary. Its keys are entry names (values of
        # the dataframe) and not the uniprot entries (index of the dataframe)
        for entry, entry_name in zip(entry_names.index, entry_names):
            if entry_name not in self._mapping:
                self._mapping[entry_name] = [entry]

    def hist_most_relevant_goids(self, N=10, tight_layout=True,
            wrap_length=40, drop_duplicates=True, **kargs):
        """Plot histogram of the GO identifiers found in all proteins.

        :param int N: restrict histogram to terms that appear more than N
            times. Use 0 or None to keep all terms.
        :param bool tight_layout: call pylab.tight_layout at the end
        :param int wrap_length: wrap text on the y-axis by wrap_length
            (defaults to 40)
        :param drop_duplicates: if True, each entry of the dataframe is
            counted once, otherwise once per row of the dataframe.
        :param kargs: pandas.plot arguments accepted.
        :raises AttributeError: if :attr:`annotations` is not set

        Entries of the dataframe that are not in the annotations index
        (e.g., empty entries) are ignored.

        .. plot::
            :include-source:
            :width: 80%

            from msdas import *
            m = Annotations(get_yeast_small_data(), "YEAST", verbose=False)
            m.set_annotations()
            m.hist_most_relevant_goids(N=5)

        .. todo:: this is made on the annotations dataframe. Should be done
            based on the entry names in the dataframe

        """
        self._require_annotations()
        kargs['legend'] = kargs.get("legend", False)

        entries = self.df.Entry
        if drop_duplicates:
            entries = entries.drop_duplicates()
        known = self.annotations.index
        entries = [e for e in entries if e in known]

        # single pass counting of the GO terms
        counts = {}
        for goids in self.annotations.loc[entries, 'Gene ontology (GO)']:
            for goid in goids:
                counts[goid] = counts.get(goid, 0) + 1

        # sorted in pure Python so as not to depend on the pandas sorting API
        ordered = sorted(counts.items(), key=lambda item: item[1])
        if N:
            ordered = [item for item in ordered if item[1] > N]

        # let us wrap the labels to avoid long labels in the figure
        names = ["\n".join(textwrap.wrap(goid, width=wrap_length))
                 for goid, _ in ordered]
        sizes = [size for _, size in ordered]
        subdf = pd.DataFrame({'size': sizes},
                             index=pd.Index(names, name="name"))
        subdf.plot(kind="barh", **kargs)
        if tight_layout:
            pylab.tight_layout()

    def check_entries_versus_sequence(self):
        """Check that peptide sequence are contained in uniprot sequence

        This is a very good sanity check on the validity of the uniprot entry
        names found by :meth:`get_uniprot_entries` method

        If a peptide sequence is not found, it means that the protein name is
        not correct. The row index, protein name and uniprot entry of the
        faulty rows are printed. Entries missing from the annotations index
        are reported as well.

        See AnnotationsYeast class where the
        :meth:`AnnotationsYeast.update_mapping` is used to update the
        incorrect mapping.

        :raises Exception: if :attr:`annotations` is not set

        .. seealso:: :meth:`find_sequence_blast`
        """
        self.logging.info("Comparing peptide sequence in the attribute df with sequences in the annotations")
        self.logging.info("row index, protein name, uniprot entry")
        if self.annotations is None:
            raise Exception("annotations not set. call set_annotations")

        known = self.annotations.index
        sequences = self.annotations["Sequence"]
        found = False
        rows = zip(self.df.index, self.df.Protein, self.df.Entry,
                   self.df.Sequence)
        for i, protein, entry, peptide in rows:
            if entry not in known:
                print("{} not found in the annotations index".format(entry))
                continue
            if peptide not in sequences[entry]:
                if not found:
                    # header printed once, before the first faulty row
                    print("Found unknown entries\nindex, protein name, uniprot entry ")
                    found = True
                print(i, protein, entry)

    def find_sequence_blast(self, seq, email):
        """Utility to search for a sequence using BLAST via bioservices

        :param str seq: the sequence
        :param email: a valid email address
        :return: the "out" result of the blast job

        .. note:: This is using NCBIblast web service via
            `BioServices <https://pypi.python.org/pypi/bioservices>`_.

        """
        from bioservices import NCBIblast
        s = NCBIblast(verbose=self.level)
        jobid = s.run(program="blastp", sequence=seq, stype="protein",
            database="uniprotkb", email=email)
        return s.getResult(jobid, "out")

    def to_csv(self, filename):
        """Export the dataframe with data and annotations into a CSV file

        :param str filename: name of the output file
        :raises ValueError: if the Entry or Entry_name column is missing

        :meth:`set_annotations` and :meth:`get_uniprot_entries` must have
        been called. The Identifier column is rebuilt from the Protein and
        Psite columns before saving.
        """
        if "Entry" not in self.df.columns or "Entry_name" not in self.df.columns:
            raise ValueError("Entry or Entry_name missing in dataframe. You must call get_uniprot_entries and set_annotations methods")
        # item assignment: attribute assignment does not create the column
        # if it does not exist yet
        self.df["Identifier"] = self.df.Protein + "_" + self.df.Psite
        self.df.to_csv(filename, index=False, sep=",")
class AnnotationsYeast(Annotations):
    """Class dedicated to the YEAST data analysis

    This class is almost identical to :class:`Annotations`. It contains extra
    code to cleanup the mapping based on further manual investigations of the
    gene mapping to protein, which may be ambiguous (see
    :meth:`Annotations.get_uniprot_entries` for details).

    ::

        from msdas import *
        from easydev import gsf
        filename = gsf("msdas", "data", "YEAST_raw_sample.csv")
        r = MassSpecReader(filename)
        a = AnnotationsYeast(r)
        a.get_uniprot_entries()
        a.set_annotations()

    Only 80% of the protein names are found directly using UniProt. The 20%
    remaining are actually gene names on which a mapping to protein has to
    be done. Yet, sometimes, there is an ambiguity that remains either
    because the gene name is also a valid entry name or because the gene maps
    to several entry names. This list gives some of these ambiguities. The
    first one is used by default (based on highest score) but may not be
    correct. See for instance
    :meth:`Annotations.check_entries_versus_sequence` to help you figuring
    out which one is the correct one.

    * ALD3_YEAST ['P54114', 'P40047']
    * ALD4_YEAST ['P46367', 'P54114']
    * CKI1_YEAST ['P20485', 'P23292']
    * CPR1_YEAST ['P14832', 'P16603']
    * PRS5_YEAST ['Q12265', 'P23638']
    * RPL16B_YEAST ['P26785', 'Q3E757', 'P05739']
    * RPL32_YEAST ['P38061', 'P14120']
    * RPL6A_YEAST ['Q02326', 'P05737']
    * RPS7A_YEAST ['P26786', 'P0CX36']
    * RPS7B_YEAST ['P48164', 'P0CX35']
    * ECM17_YEAST ['P40458', 'P47169']
    * ASC1_YEAST ['P38011', 'P01120']

    :Notes on the data:

    * NPL3_356^S349 has a wrong psite name. Given the sequence, it should be
      NPL3_S349
    * Same issue with TIF3 (IF4B_YEAST) where trailing number without phospho
      was removed by hand.
    * FLO9 1004^554^464^374^T329^T779,
      T(Phospho)GTFTSTSTEM(Oxidation)TTVTGTNGQPTDETVIVI should be T779
    * One entry is ADE5;7, which is wrongly named to not clash with CSV
      format. The proper name is indeed ADE5,7. We renamed it as PUR2
      (PUR2_YEAST). We checked the sequence
    * To find the mapping, we used blast from bioservices to figure out the
      sequence of the protein and checked on uniprot. See
      :meth:`update_mapping`
    * Typo in the original data for ABP1 peptide: small e was found.
      KEPVKT eSP APAAK should be KEPVKT PSP APAAK
    * Possible typo: STE11_S323 location should be S326
    * Another possible typo: IMP2 has 2 rows called IMP2' (note the quote)

    Here are protein names provided in the Yeast data set that are actually
    gene names. Using bioservices, we figure out possible uniprot entries but
    similarly to YEF3 there is maybe an ambiguity on the name::

        77   P54114 ALD4 could be ALDH4_YEAST whereas P54114 is ALD3
        2077 P23638 PRS5 KPR5_YEAST
        6664 P01120 ASC1 GBLP_YEAST P38011

    There are 6 peptides labelled CTR9. In fact, these are 2 different
    peptides. The first 4 are CTR9; the 2 last are
    EAMAISEHNVKDDSDLSDKDNEYDEEQPR. This is definitely CTR9 but is not exactly
    in the sequence. Missing K at the end.

    SIR1_YEAST exists but the peptide sequence cannot be found either using
    blast or manual search on uniprot. Possibly sirp1 but not yeast organism;
    it is yeasb. Actually, it may still be SIR1 but Uniprot changed the
    sequence. See personal communication with uniprot.

    """
    def __init__(self, data, verbose=True, annotations=None, **kargs):
        """.. rubric:: Constructor

        Same as :class:`Annotations` except that organism must not be
        provided (set to YEAST).

        A single row whose protein name is ADE5;7 is renamed into PUR2.

        :raises NotImplementedError: if several rows are named ADE5;7
        """
        super(AnnotationsYeast, self).__init__(data=data, organism="YEAST",
            verbose=verbose, annotations=annotations, **kargs)

        index = list(self.df[self.df.Protein == "ADE5;7"].index)
        if len(index) == 1:
            self.warning("renaming ADE5;7 (should be ADE5,7 anyway) into PUR2")
            # .loc instead of DataFrame.set_value, which has been removed
            # from pandas. The dataframe is assigned back as before.
            df = self.df
            df.loc[index[0], "Protein"] = "PUR2"
            self.df = df
            self._rebuild_identifier()
        if len(index) > 1:
            raise NotImplementedError

    def update_mapping(self):
        """Update the mapping with known keys

        There are issues in the naming because of a mixing of protein and
        gene names. Methods in :class:`Annotations` found most of the
        mapping. However, there are some ambiguities and the mapping
        dictionary is corrected as follows. Checked with uniprot and blast.

        We remove "ADE5,7_YEAST" (replaced by PUR2 if found)

        ================ ======== ============================================
        key              entry    comment
        ================ ======== ============================================
        ADE5,7_YEAST     P07244   PUR2_YEAST
        YEF3_YEAST       P16521
        FEN1_YEAST       P25358   FEN1 is actually ELO2_YEAST
        HIS7_YEAST       P33734   HIS7 is actually HIS5_YEAST
        LEU1_YEAST       P07264   this is LEUC_YEAST P07264
        NTH1_YEAST       P32356   TREA_YEAST
        NTH2_YEAST       P35172   TREB_YEAST
        ECM17_YEAST      P47169   MET5_YEAST
        RPL6A_YEAST      Q02326   RL6A_YEAST
        RPS7A_YEAST      P26786   RS7A_YEAST
        YJU2_YEAST       P28320   CWC16_YEAST no blast results but checked
        ASC1_YEAST       P38011   GBLP_YEAST
        PSA1_YEAST       P41940   no blast result; gene name MPG1_YEAST
        CTR9_YEAST       P89105   CTR9_YEAST correct but see note below
        IMP2_YEAST       P32351   IMPX_YEAST no blast result;IMP2 is gene name
        ================ ======== ============================================

        Only :attr:`_mapping` is changed here: call
        :meth:`Annotations.get_uniprot_entries` afterwards to update the
        Entry column of the dataframe.

        :raises ValueError: if the mapping is empty

        .. note:: there is also a protein called IMP2' (note the quote),
            which presumably is also IMP2. Kept as it is for now

        """
        if len(self._mapping) == 0:
            raise ValueError("You should call set_annotations and get_uniprot_entries first")

        # ADE5,7_YEAST (P07244, PUR2_YEAST) is not set on purpose: the
        # protein is renamed into PUR2 in the constructor.
        self._mapping.update({
            'YEF3_YEAST': ["P16521"],
            'FEN1_YEAST': ["P25358"],   # FEN1 is actually ELO2_YEAST
            'HIS7_YEAST': ["P33734"],   # HIS7 is actually HIS5_YEAST
            'LEU1_YEAST': ["P07264"],   # this is LEUC_YEAST P07264
            'NTH1_YEAST': ['P32356'],   # TREA_YEAST
            'NTH2_YEAST': ['P35172'],   # TREB_YEAST
            'ECM17_YEAST': ['P47169'],  # MET5_YEAST
            'RPL6A_YEAST': ['Q02326'],  # RL6A_YEAST
            'RPS7A_YEAST': ['P26786'],  # RS7A_YEAST
            'YJU2_YEAST': ['P28320'],   # CWC16_YEAST no blast results but checked
            'ASC1_YEAST': ['P38011'],   # GBLP_YEAST
            'PSA1_YEAST': ['P41940'],   # no blast result; a gene name corresponding to MPG1_YEAST
            'CTR9_YEAST': ['P89105'],   # CTR9_YEAST P89105 this is correct but see class documentation
            'IMP2_YEAST': ['P32351'],   # IMPX_YEAST no blast result; IMP2 is gene name
        })