# -*- python -*-
#
# This file is part of MS-DAS software
#
# Copyright (c) 2014 - EBI-EMBL
#
# File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>, Marti Bernardo Faura
# bernardo@ebi.ac.uk
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# http://www.gnu.org/licenses/gpl-3.0.html
#
#
##############################################################################
import re
from easydev import Logging
__all__ = ['PSites']
class PSites(Logging):
    """Utilities to manipulate and validate phosphorylation sites (psites) encoding

    Psites stands for phosphorylation sites. They are encoded in different
    ways depending on the input data files. This class provides methods to
    help decipher the different formats and convert them into a common one.

    Psites indicate the position of a phosphorylation on a given sequence of
    amino acids as well as the amino acid on which the phosphorylation
    occurred. For instance::

        S20

    means there is a phosphorylation on Serine at position 20 of the protein
    sequence. Possibly, you may have more than one phosphorylation. In order
    to encode several locations, we use the **^** character to represent an
    **AND**. Note that sometimes a location may be ambiguous. In such case,
    we use the **+** character to represent an **OR**.

    We use the following conventions.

    #. no spaces
    #. amino acids are encoded with capitalised letters
    #. S25^T40 means there are 2 psites at position 25 AND 40
    #. S25+S26 means there is an ambiguity and the psite is located at
       position 25 OR 26
    #. S25+S26^T40 means there are 2 psites, one of which is ambiguous (S25).

    The number of psites is equal to the number of ^ characters plus one.
    """
    # separator between alternative (ambiguous) locations of one psite
    OR = "+"
    # separator between distinct psites
    AND = "^"

    # modification tags as they appear inside an annotated peptide sequence
    _PHOSPHO_TAG = "(Phospho)"
    _OXIDATION_TAG = "(Oxidation)"

    def __init__(self, verbose=True):
        """
        :param verbose: forwarded as the ``level`` argument of the
            :class:`Logging` base class.
        """
        super(PSites, self).__init__(level=verbose)
        # valid phosphos are S, T, Y; M is an oxidation that may not be required
        self.valid_letters = ['S', 'T', 'Y', 'M']

    def _position(self, psite):
        """Return the integer location used to order a psite.

        For an ambiguous psite (e.g. ``S4+S5``) only the first alternative is
        used. Raises :class:`ValueError` if the location is not numeric.
        """
        return int(psite[1:].split(self.OR)[0])

    def append_oxidation(self, sequence, psites):
        """Update psite string with Oxidation location if present in the sequence

        Sometimes, the tag Oxidation is present in the sequence but not
        reflected in the psite string. This function updates the psites
        according to the Oxidation tags.

        :param str sequence: sequence of amino acids with position of phospho
            and oxidation as in the following example:
            KVS(Phospho)VGSM(Oxidation)G
        :param str psites: a valid psite string (capitalised letters)
        :return: Updated psite string with oxidation locations. If the
            sequence has no ``(Oxidation)`` tag, *psites* is returned
            unchanged. See example below.
        :raises ValueError: if *psites* is not valid, or if the sequence has
            an ``(Oxidation)`` tag but no ``(Phospho)`` tag to anchor the
            absolute positions.

        .. doctest::

            >>> from msdas.psites import PSites
            >>> p = PSites()
            >>> sequence = "KVS(Phospho)VGSM(Oxidation)GS(Phospho)GK"
            >>> psites = "S956^S962"
            >>> p.append_oxidation(sequence, psites)
            'S956^M960^S962'

        .. todo:: somehow we do not need the psite string as a parameter.
            Can be extracted from the sequence anyway.
        """
        if not self.isvalid(psites):
            raise ValueError(
                "invalid sites provided (%s) must be separated by ^ character"
                % psites)
        if self._OXIDATION_TAG not in sequence:
            return psites

        sites = psites.split(self.AND)

        # The first psite is assumed to correspond to the first (Phospho) tag;
        # this gives the offset between a residue index within the peptide and
        # its absolute position in the protein.
        without_oxidation = sequence.replace(self._OXIDATION_TAG, "")
        if self._PHOSPHO_TAG not in without_oxidation:
            raise ValueError(
                "sequence %s contains no %s tag; cannot locate oxidations"
                % (sequence, self._PHOSPHO_TAG))
        offset = int(sites[0][1:]) - without_oxidation.index(self._PHOSPHO_TAG)

        # re.escape is required: unescaped parentheses would be a regex group
        # and the match would start after the opening parenthesis.
        for occ in re.finditer(re.escape(self._OXIDATION_TAG), sequence):
            # residues preceding the tag, with modification tags removed
            residues = sequence[:occ.start()]
            residues = residues.replace(self._PHOSPHO_TAG, "")
            residues = residues.replace(self._OXIDATION_TAG, "")
            # the oxidised residue is the one just before the tag
            sites.append(residues[-1] + str(offset + len(residues)))

        # finally, order all psites by location
        return self.sorted(sites)

    def get_unique_psites(self, psites_list):
        """Return the individual psites found in a list of psite strings

        Handles both AND and OR characters: every string is split on both
        separators.

        :param list psites_list: list of psite strings
        :return: list of unique single psites (order is not guaranteed)
        """
        singles = set()
        for psites in psites_list:
            for group in psites.split(self.AND):
                singles.update(group.split(self.OR))
        return list(singles)

    def get_factorised_psites(self, psites_list):
        """Given a list of psites, find common and ambiguous parts and return new string

        Only the AND psites are factorised. Psites that are not shared by all
        the items of the list are merged into one ambiguous (OR) group.

        :param list psites_list: list of psite strings
        :return: a psite string (empty string if the list is empty)

        ::

            >>> p = PSites()
            >>> p.get_factorised_psites(['S177^Y182', 'T180^Y182'])
            'S177+T180^Y182'
            >>> p.get_factorised_psites(["S272^T277^S279", "S272^S275^S279"])
            'S272^S275+T277^S279'

        The returned string in the last example tells us that there are 3
        phosphosites (2 ^ character + 1). The second psite is ambiguous
        (S275+T277).

        Note this example::

            >>> p.get_factorised_psites(["S1+S4^S3", "S2+S1^S3"])
            'S1+S1+S2+S4^S3'

        .. warning:: in this example, be aware that (i), even though psites
            are not ordered by location in the input, they are in the output
            and (ii) if there are duplicated sites (e.g., S1) they are not
            simplified for now.
        """
        all_sites = set(
            site for psites in psites_list for site in psites.split(self.AND))
        # this is the set of common psites across all psites
        common = self.get_common_psites(psites_list)
        # and the ones that are not
        not_common = all_sites.difference(common)
        # sorted() concatenates with AND; the non-common sites are alternatives
        ambiguous = self.sorted(list(not_common)).replace(self.AND, self.OR)
        # drop the ambiguous group when it is empty (all sites are common)
        groups = [group for group in [ambiguous] + list(common) if group]
        return self.sorted(groups)

    def get_common_psites(self, psites_list):
        """Given a list of psites of the form Y182^T180, returns common factor

        ::

            >>> p = PSites()
            >>> p.get_common_psites(['S177^Y182', 'T180^Y182'])
            {'Y182'}

        The OR characters are ignored (strings are split on AND only).

        :param list psites_list: list of psite strings
        :return: set of the psites present in every item of the list

        .. note:: used by :meth:`get_factorised_psites`
        """
        sets = [set(psites.split(self.AND)) for psites in psites_list]
        all_sites = set().union(*sets)
        # note the * in front of the list of sets to make it a list of arguments
        return all_sites.intersection(*sets)

    def isvalid(self, psites):
        """Return True if the psite string is using a valid format

        Checks that:

        #. letters are capitalised and in the :attr:`valid_letters`
           (default T, Y, S, M)
        #. there is no spaces
        #. number of amino acids is compatible with separators (+/^).
           e.g, T4T5 is not correct. It should be either T4+T5 or T4^T5.

        A warning is logged for the first problem found.

        :param str psites: a psites string (e.g., S2^T5)
        :return: boolean

        .. seealso:: the convention is explained in the class documentation.

        ::

            >>> from msdas import PSites
            >>> p = PSites()
            >>> p.isvalid("T1+S2^T")
            False
        """
        if " " in psites:
            self.logging.warning("Invalid psite with spaces inside")
            return False

        psites = [x for y in psites.split(self.OR) for x in y.split(self.AND)]

        # check letters: each single psite must start with a valid letter
        for psite in psites:
            if not psite or psite[0] not in self.valid_letters:
                self.logging.warning("Found an invalid letter (must be captialised and in {}). You provided {},{}".format(self.valid_letters, psite, psites))
                return False

        # check that there are numbers (and only numbers) after the letter
        for psite in psites:
            if len(psite) == 1:
                self.logging.warning("Invalid psite missing position in {}, {}?".format(psite, psites))
                return False
            if not psite[1:].isdigit():
                self.logging.warning("Invalid location; should be numeric after Phospho {},{}?".format(psite, psites))
                return False
        return True

    def remove_duplicated(self, psites):
        """Remove duplicated psites in a psite string

        :param str psites: psite string whose items are separated by AND
        :return: psite string without duplicates, sorted by location

        .. doctest::

            >>> from msdas.psites import PSites
            >>> p = PSites()
            >>> p.remove_duplicated("S3^S4^S3")
            'S3^S4'
        """
        unique = set(psites.split(self.AND))
        return self.sorted(list(unique))

    def remove_spaces(self, psite):
        """Removes spaces found around the items of a psite string

        Leading and trailing whitespace is stripped from the string and from
        each of its ;-separated items.

        Sometimes, psites are separated by ;. These characters are kept
        in this function but should be replaced once the meaning is known
        by AND or OR character.

        :param str psite: a psite string
        :return: the cleaned psite string
        """
        items = psite.strip().split(";")
        return ";".join(item.strip() for item in items)

    def sort_psites_ors_only(self, psites):
        """Sort psites that contain OR characters only

        :param str psites: a valid psites string.
        :return: modified psites

        ::

            >>> from msdas import PSites
            >>> p = PSites()
            >>> p.sort_psites_ors_only("S2+S1")
            'S1+S2'

        Psites that contain AND character are not processed (returned as is).
        """
        if self.AND in psites:
            return psites
        return self.OR.join(sorted(psites.split(self.OR), key=self._position))

    def sorted(self, psites_list):
        """Sort the psites found in the psites list using their positions

        If there is an ambiguous site, the + is ignored and only the first
        psite is used for the position.

        :param list psites_list: list of psites; an item may contain ORs
        :return: psite string with the sorted items joined by AND

        An error is logged for each invalid item; sorting is still attempted
        and raises :class:`ValueError` if a location is not numeric.

        .. doctest::

            >>> from msdas.psites import PSites
            >>> p = PSites()
            >>> p.sorted(["S3", "S1", "S4+S5"])
            'S1^S3^S4+S5'
            >>> p.sorted(["S3", "S1", "S8+S7"])
            'S1^S3^S7+S8'
        """
        # check validity
        for psite in psites_list:
            if not self.isvalid(psite):
                self.logging.error("Invalid psite %s provided" % psite)

        # First, we need to order internally the psites that contain ORs
        # e.g., S8+S7 must be re-ordered as S7+S8: each item is split,
        # sorted and joined back
        ordered = [
            self.OR.join(sorted(psite.split(self.OR), key=self._position))
            for psite in psites_list
        ]

        # now, we do the actual sorting of the list
        return self.AND.join(sorted(ordered, key=self._position))