import re
from collections import deque
from glypy.structure import Monosaccharide, Glycan, constants, named_structures, Substituent
from glypy.composition.structure_composition import substituent_compositions
from glypy.composition.composition_transform import has_derivatization, derivatize
from glypy.io import format_constants_map
from glypy.io.nomenclature import identity
from glypy.utils import invert_dict
from glypy.io.file_utils import ParserInterface
# A static copy of monosaccharide names to structures for copy-free comparison
monosaccharide_reference = {k: v for k, v in named_structures.monosaccharides.items()}
# Symbol -> anomer table consulted while parsing; starts as a copy of the
# shared format map so the edit below does not touch the original mapping.
anomer_map_from = dict(format_constants_map.anomer_map)
# Re-key the entry stored under 'x' so that it is found under '?' instead.
anomer_map_from['?'] = anomer_map_from.pop('x')
# Built from the table above with `invert_dict`; consulted while serializing.
anomer_map_to = invert_dict(anomer_map_from)
# Short aliases for the constant namespaces used throughout this module.
Stem = constants.Stem
Configuration = constants.Configuration
Modification = constants.Modification
SuperClass = constants.SuperClass
def tryint(i):
    """Convert `i` to an :class:`int`, falling back to ``-1``.

    Parameters
    ----------
    i : object
        Value to convert, e.g. a linkage position token such as ``"3"`` or ``"?"``.

    Returns
    -------
    int
        ``int(i)`` when the conversion succeeds, otherwise ``-1``.
    """
    try:
        return int(i)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to -1; unrelated exceptions such as
        # KeyboardInterrupt are no longer swallowed.
        return -1
class IUPACError(Exception):
    """Error raised by this module when IUPAC text cannot be interpreted."""
def _make_substituent_name(name):
return ''.join(t.title() for t in name.split("_")).replace("(", "")
# Default symbol for every name in `substituent_compositions`, derived
# mechanically by `_make_substituent_name`.
substituents_map_to = {
name: _make_substituent_name(name) for name in substituent_compositions
}
# Special Cases
# These hand-picked symbols override the mechanically derived defaults above.
substituents_map_to['n_acetyl'] = "NAc"
substituents_map_to['n_glycolyl'] = "NGc"
substituents_map_to['sulfate'] = "S"
substituents_map_to["methyl"] = "Me"
substituents_map_to["acetyl"] = "Ac"
substituents_map_to["glycolyl"] = "Gc"
substituents_map_to["fluoro"] = "F"
substituents_map_to["amino"] = "N"
# Built with `invert_dict` after the overrides are applied; used for
# symbol -> name lookups when parsing.
substituents_map_from = invert_dict(substituents_map_to)
class SubstituentSerializer(object):
    """Render the substituents of a residue as the suffix of its IUPAC name.

    Parameters
    ----------
    monosaccharides : mapping, optional
        Name-to-structure lookup used to recognize ``"NeuAc"``, ``"NeuGc"``
        and ``"Neu"`` residues. Defaults to the module-level
        ``monosaccharide_reference``.
    """

    def __init__(self, monosaccharides=None):
        if monosaccharides is None:
            monosaccharides = monosaccharide_reference
        self.monosaccharide_reference = monosaccharides

    def __call__(self, residue, monosaccharide_reference=None, **kwargs):
        """Call :meth:`resolve_substituents`; extra keyword arguments are ignored."""
        return self.resolve_substituents(residue, monosaccharide_reference)

    def _symbol_for_name(self, name):
        """Return the symbol for the substituent `name`, registering it if new.

        Unknown names get a symbol from ``_make_substituent_name`` and are
        recorded in both module-level tables so the symbol can be parsed back.
        """
        if name not in substituents_map_to:
            symbol = _make_substituent_name(name)
            substituents_map_to[name] = symbol
            substituents_map_from[symbol] = name
        return substituents_map_to[name]

    def serialize_substituent(self, substituent):
        """Return the symbol for a single substituent object (looked up by ``.name``)."""
        return self._symbol_for_name(substituent.name)

    def resolve_substituents(self, residue, monosaccharides=None):
        """Build the substituent suffix string for `residue`.

        Parameters
        ----------
        residue : Monosaccharide
            The residue whose substituents are serialized.
        monosaccharides : mapping, optional
            Overrides the instance's reference table for this call.

        Returns
        -------
        str
            Concatenated ``{position}{symbol}`` entries. A position of ``-1``
            or ``None`` is written as an empty string.
        """
        if monosaccharides is None:
            monosaccharides = self.monosaccharide_reference
        pieces = []
        for name, pos in self.get_relevant_substituents(residue, monosaccharides):
            if pos in {-1, None}:
                pos = ""
            part = self._symbol_for_name(name)
            # If there is a substituent after the first, successive ones are placed in parentheses
            if pieces:
                pieces.append("({}{})".format(pos, part))
            else:
                pieces.append("{}{}".format(pos, part))
        return ''.join(pieces)

    def get_relevant_substituents(self, residue, monosaccharides=None):
        '''
        Retrieve the set of substituents not implicitly included
        in the base type's symbol name.

        Substituents flagged ``_derivatize`` are skipped. The result is an
        iterable of ``(name, position)`` pairs.
        '''
        if monosaccharides is None:
            monosaccharides = self.monosaccharide_reference
        positions = []
        substituents = []
        for position, sub in residue.substituents():
            if not sub._derivatize:
                positions.append(position)
                substituents.append(sub.name)
        if identity.is_a(residue, monosaccharides["NeuAc"], exact=False, short_circuit=True):
            # The base symbol is written "Neu", so the n_acetyl group is
            # emitted as a plain "acetyl" at the same position.
            try:
                i = substituents.index("n_acetyl")
            except ValueError:  # pragma: no cover
                pass
            else:
                substituents[i] = "acetyl"
        elif identity.is_a(residue, monosaccharides["NeuGc"], exact=False, short_circuit=True):
            # Nothing is removed or renamed for NeuGc.
            pass
        elif identity.is_a(residue, monosaccharides["Neu"], exact=False, short_circuit=True):
            # The amino group is dropped from the suffix for a Neu residue.
            i = substituents.index("amino")
            substituents.pop(i)
            positions.pop(i)
        return zip(substituents, positions)
# Module-level instance bound to the default monosaccharide reference table.
resolve_substituent = SubstituentSerializer()
class ModificationSerializer(object):
    """Render a residue's modifications as the prefix of its IUPAC base name."""

    def extract_modifications(self, modifications, base_type):
        """Serialize the modifications not already implied by `base_type`.

        Parameters
        ----------
        modifications : mapping
            Provides ``.items()`` yielding ``(position, modification)`` pairs,
            where each modification has a ``.name``.
        base_type : str
            The base symbol being written. When it contains ``"Neu"`` or
            ``"Kd"``, the first ``d``, ``keto`` and ``a`` modifications are
            omitted; when it contains ``"Fuc"``, the first ``d`` is omitted.

        Returns
        -------
        str
            Comma-separated ``{position}-{name}`` entries; an entry whose
            position is ``-1`` is written as the bare name.

        Raises
        ------
        ValueError
            If `base_type` contains ``"Fuc"`` but no ``d`` modification exists.
        """
        template = '{position}-{name}'
        pos_mod_pairs = list(modifications.items())
        try:
            pos, mods = map(list, zip(*pos_mod_pairs))
        except ValueError:
            # No modifications at all: zip(*[]) yields nothing to unpack.
            pos, mods = [], []
        if "Neu" in base_type or "Kd" in base_type:
            for mod in (Modification.d, Modification.keto, Modification.a):
                try:
                    pop_ix = mods.index(mod)
                except ValueError:  # pragma: no cover
                    # This implied modification is absent; nothing to hide.
                    continue
                pos.pop(pop_ix)
                mods.pop(pop_ix)
        elif "Fuc" in base_type:
            pop_ix = mods.index(Modification.d)
            pos.pop(pop_ix)
            mods.pop(pop_ix)
        buff = []
        for position, mod in zip(pos, mods):
            if position != -1:
                buff.append(template.format(position=position, name=mod.name))
            else:
                buff.append(mod.name)
        return ','.join(buff)

    def __call__(self, modifications, base_type):
        """Call :meth:`extract_modifications`."""
        return self.extract_modifications(modifications, base_type)
# Module-level callable instance of the modification serializer.
extract_modifications = ModificationSerializer()
class ModificationDeserializer(object):
    """Parse the modification prefix of an IUPAC monosaccharide name."""

    def parse_modifications(self, modification_string):
        """Split `modification_string` into ``(position, Modification)`` pairs.

        Parameters
        ----------
        modification_string : str
            Comma-separated tokens, each either ``{position}-{name}`` or a
            bare ``{name}``. Empty tokens are skipped.

        Returns
        -------
        list of tuple
            ``(int position, Modification)`` pairs in input order. A token
            that does not split into exactly two parts on ``"-"`` is treated
            as a bare name at position ``-1``.
        """
        pairs = []
        for token in modification_string.split(","):
            if token == '':
                continue
            try:
                pos, mod = token.split("-")
            except ValueError:
                # Unpacking failed: the token is not of the form "pos-name".
                pos = -1
                mod = token
            pairs.append((int(pos), Modification[mod]))
        return pairs

    def __call__(self, modification_string):
        """Call :meth:`parse_modifications`."""
        return self.parse_modifications(modification_string)
# Module-level callable instance of the modification parser.
parse_modifications = ModificationDeserializer()
class MonosaccharideSerializer(object):
    """Write a single residue as IUPAC three-letter code.

    Parameters
    ----------
    monosaccharides : mapping, optional
        Name-to-structure reference table; defaults to the module-level
        ``monosaccharide_reference``.
    substituent_resolver : callable, optional
        Called with the residue to produce the substituent suffix; defaults
        to a :class:`SubstituentSerializer` built on `monosaccharides`.
    modification_extractor : callable, optional
        Called with ``(residue.modifications, base_type)`` to produce the
        modification prefix; defaults to a :class:`ModificationSerializer`.
    """

    def __init__(self, monosaccharides=None, substituent_resolver=None, modification_extractor=None):
        self.monosaccharide_reference = (
            monosaccharide_reference if monosaccharides is None else monosaccharides)
        if substituent_resolver is None:
            substituent_resolver = SubstituentSerializer(self.monosaccharide_reference)
        self.substituent_resolver = substituent_resolver
        if modification_extractor is None:
            modification_extractor = ModificationSerializer()
        self.modification_extractor = modification_extractor

    def resolve_special_base_type(self, residue, monosaccharide_reference=None):
        """Return ``"Kdn"``, ``"Neu"``, ``"Kdo"`` or ``"Fuc"`` when `residue`
        matches one of those patterns, otherwise ``None``.
        """
        if monosaccharide_reference is None:
            monosaccharide_reference = self.monosaccharide_reference
        if residue.superclass == SuperClass.non:
            if residue.stem == (Stem.gro, Stem.gal):
                names = [sub.name for p, sub in residue.substituents() if not sub._derivatize]
                present = [mod for p, mod in residue.modifications.items()]
                required = (Modification.a, Modification.keto, Modification.d)
                if all(mod in present for mod in required):
                    if len(names) == 0:
                        return "Kdn"
                    # n_acetyl -> Neu(Ac), n_glycolyl -> Neu(Gc), amino -> Neu
                    if any(tag in names for tag in ("n_acetyl", "n_glycolyl", "amino")):
                        return "Neu"
        elif residue.superclass == SuperClass.oct:
            if residue.stem == (Stem.man,):
                expected = ((1, Modification.a), (2, Modification.keto), (3, Modification.d))
                if all(mod in residue.modifications[site] for site, mod in expected):
                    return "Kdo"
        elif residue.stem == (Stem.gal,) and residue.configuration == (Configuration.l,):
            if Modification.d in residue.modifications.values():
                return "Fuc"
        return None

    def monosaccharide_to_iupac(self, residue):
        """Return the IUPAC string for `residue`.

        The result has the layout
        ``{anomer}-{configuration}-{modification}{base_type}{ring_type}{substituent}``.
        """
        template = "{anomer}-{configuration}-{modification}{base_type}{ring_type}{substituent}"
        anomer_symbol = anomer_map_to[residue.anomer]
        first_configuration = residue.configuration[0]
        if first_configuration is Configuration.Unknown:
            configuration_symbol = "?"
        else:
            configuration_symbol = first_configuration.name.upper()
        base_symbol = self.resolve_special_base_type(residue)
        if base_symbol is None:
            # No special name applies: use the single known stem, else the superclass.
            if len(residue.stem) == 1 and residue.stem[0] is not Stem.Unknown:
                base_symbol = residue.stem[0].name.title()
            else:
                base_symbol = residue.superclass.name.title()
        modification_prefix = self.modification_extractor(residue.modifications, base_symbol)
        ring_symbol = residue.ring_type.name[0]
        substituent_suffix = self.substituent_resolver(residue)
        return template.format(
            anomer=anomer_symbol,
            configuration=configuration_symbol,
            modification=modification_prefix,
            base_type=base_symbol,
            ring_type=ring_symbol,
            substituent=substituent_suffix,
        )

    def __call__(self, residue):
        """Call :meth:`monosaccharide_to_iupac`."""
        return self.monosaccharide_to_iupac(residue)
class DerivatizationAwareMonosaccharideSerializer(MonosaccharideSerializer):
    """Monosaccharide serializer that appends a ``^{symbol}`` derivatization tag."""

    def monosaccharide_to_iupac(self, residue):
        """Return the base IUPAC string, followed by ``^`` and the symbol of
        the derivatization when ``has_derivatization(residue)`` is truthy.
        """
        encoded = super(DerivatizationAwareMonosaccharideSerializer, self).monosaccharide_to_iupac(residue)
        derivatization = has_derivatization(residue)
        if not derivatization:
            return encoded
        symbol = self.substituent_resolver.serialize_substituent(derivatization)
        return "%s^%s" % (encoded, symbol)
# Module-level serializer instance built with the default collaborators.
monosaccharide_to_iupac = MonosaccharideSerializer()
# Bound-method alias so the special-name lookup can be called directly.
resolve_special_base_type = monosaccharide_to_iupac.resolve_special_base_type
class GlycanSerializer(object):
    """Write a whole structure as IUPAC three-letter code.

    Parameters
    ----------
    monosaccharide_serializer : callable, optional
        Called with each residue to produce its text; defaults to a new
        :class:`MonosaccharideSerializer`.
    open_edge, close_edge : str
        Text written before and after each linkage's position pair.
    open_branch, close_branch : str
        Text written around each branch.
    """

    def __init__(self, monosaccharide_serializer=None, open_edge='-(', close_edge=')-',
                 open_branch='[', close_branch=']'):
        if monosaccharide_serializer is None:
            monosaccharide_serializer = MonosaccharideSerializer()
        self.monosaccharide_serializer = monosaccharide_serializer
        self.open_edge = open_edge
        self.close_edge = close_edge
        self.open_branch = open_branch
        self.close_branch = close_branch

    def glycan_to_iupac(self, structure=None, attach=None, is_branch=False):
        '''
        Translate a |Glycan| structure into IUPAC Three Letter Code.
        Recursively operates on branches.

        Parameters
        ----------
        structure: Glycan or Monosaccharide
            The glycan to be translated. Translation starts from `glycan.root` if `structure`
            is a |Glycan|.
        attach: int
            The point from the structure tree is attached to its parent. Used for recursively
            handling branches. Defaults to |None|.
        is_branch: bool
            Whether `structure` is the first residue of a branch, in which case the
            trailing dash of its linkage is dropped.

        Returns
        -------
        deque
        '''
        base = structure.root if isinstance(structure, Glycan) else structure
        stack = [(attach, base)]
        outstack = deque()
        while stack:
            outedge, node = stack.pop()
            link_text = ""
            if outedge is not None:
                link_text = "{oe}{attach}-{outedge_pos}{ce}".format(
                    outedge_pos=outedge.parent_position,
                    attach=outedge.child_position,
                    oe=self.open_edge, ce=self.close_edge)
            # Branch linkage does not start with leading dash.
            # `endswith` is safe on an empty string, which indexing [-1] was not.
            if is_branch and link_text.endswith('-'):
                link_text = link_text[:-1]
            outstack.appendleft('{node}{link}'.format(
                node=self.monosaccharide_serializer(node), link=link_text))
            # Reset for next pass through the loop
            is_branch = False
            children = [(p, child_link) for p, child_link in node.links.items()
                        if child_link.is_parent(node)]
            if children:
                # Every child but the last is rendered recursively as a bracketed branch.
                for _, branch_link in children[:-1]:
                    branch = '{ob}{branch}{cb}'.format(
                        branch=''.join(self.glycan_to_iupac(
                            branch_link.child, branch_link, is_branch=True)),
                        ob=self.open_branch,
                        cb=self.close_branch
                    )
                    outstack.appendleft(branch)
                # The last child continues the current chain.
                _, main_link = children[-1]
                stack.append((main_link, main_link.child))
        return outstack

    def __call__(self, structure):
        """Return the IUPAC text of `structure` as a single string."""
        return ''.join(self.glycan_to_iupac(structure))
# Module-level glycan serializer instance with the default delimiters.
glycan_to_iupac = GlycanSerializer()
def to_iupac(structure):
    """Translate `structure` into its textual representation using IUPAC Three Letter Code

    Parameters
    ----------
    structure : |Glycan| or |Monosaccharide|
        The structure to be translated

    Returns
    -------
    |str|
    """
    if isinstance(structure, Monosaccharide):
        return monosaccharide_to_iupac(structure)
    else:
        return glycan_to_iupac(structure)
def aminate_substituent(substituent):
    """Return the ``n_``-prefixed counterpart of `substituent`.

    A substituent whose name already starts with ``"n_"`` is returned
    unchanged. Otherwise a new ``Substituent`` named ``"n_" + name`` is
    built.

    Raises
    ------
    ValueError
        If the new substituent's composition equals ``{}``.
    """
    if not substituent.name.startswith("n_"):
        aminated = Substituent("n_" + substituent.name)
        if aminated.composition == {}:
            raise ValueError("Could not aminate substituent")
        return aminated
    # already aminated
    return substituent
# Verbose-mode pattern for one monosaccharide token anchored at the end of the
# text. Named groups: anomer, configuration, modification, base_type (exactly
# three characters), ring_type, substituent, and an optional trailing linkage
# of the form "-(c-p)" with an optional final dash.
monosaccharide_parser = re.compile(r'''(?P<anomer>[abo?])-
(?P<configuration>[LD?])-
(?P<modification>[a-z0-9_\-,]*)
(?P<base_type>[^-]{3}?)
(?P<ring_type>[xpfo?])
(?P<substituent>[^-]*?)
(?P<linkage>-\([0-9?]-[0-9?]\)-?)?$''', re.VERBOSE)
class SubstituentDeserializer(object):
    """Parse the substituent suffix of an IUPAC monosaccharide name.

    Parameters
    ----------
    substituent_map : mapping, optional
        Symbol-to-name lookup table. Defaults to the module-level
        ``substituents_map_from``.
    """

    # Optional leading digits (the position) followed by the symbol text.
    # A single anchored match replaces ``re.split(r"(\d+)?", part)``, which
    # splits on every empty match from Python 3.7 onward.
    _position_and_symbol = re.compile(r"(\d+)?(.*)", re.DOTALL)

    def __init__(self, substituent_map=None):
        if substituent_map is None:
            substituent_map = substituents_map_from
        self.substituent_map = substituent_map

    def substituent_from_iupac(self, substituents):
        """Yield ``(position, name)`` for each substituent in `substituents`.

        Parameters
        ----------
        substituents : str
            Suffix text such as ``"5Ac(9S)"``; entries are separated by
            parentheses and each may start with a numeric position.

        Yields
        ------
        tuple of (int, str)
            The position (``-1`` when none is written) and the translated
            substituent name. The symbol ``"A"`` is yielded untranslated.
            Other unknown symbols emit a warning and are skipped.
        """
        for part in re.split(r"\(|\)", substituents):
            if part == "":
                continue
            position, name = self._position_and_symbol.match(part).groups()
            if position is None:
                position = -1
            try:
                name = self.substituent_map[name]
            except KeyError:
                # Acidic special case:
                # Often, acidic monosaccharides are written with a trailing A like a substituent while
                # GlycoCT treats acidic groups as modifications. If an A appears in the substituent suffix
                # it will fail to be cast as a substituent, but pass it along raw and it will be handled
                # downstream by :func:`monosaccharide_from_iupac`.
                if name != "A":  # pragma: no cover
                    import warnings
                    warnings.warn("No translation rule found to convert %s into a Substituent" % name)
                    continue
            yield int(position), name

    def symbol_to_name(self, symbol):
        """Return the substituent name for `symbol`; raises ``KeyError`` if unknown."""
        return self.substituent_map[symbol]

    def __call__(self, substituents):
        """Call :meth:`substituent_from_iupac`."""
        return self.substituent_from_iupac(substituents)
# Module-level callable instance of the substituent parser.
substituent_from_iupac = SubstituentDeserializer()
class MonosaccharideDeserializer(object):
    """Parse a single IUPAC monosaccharide token into a residue.

    Parameters
    ----------
    modification_parser : callable, optional
        Turns the modification prefix into ``(position, modification)``
        pairs; defaults to a :class:`ModificationDeserializer`.
    substituent_parser : callable, optional
        Turns the substituent suffix into ``(position, name)`` pairs;
        defaults to a :class:`SubstituentDeserializer`.
    """

    # One monosaccharide token anchored at the end of the text, with an
    # optional trailing linkage. Whitespace is insignificant (re.VERBOSE).
    pattern = re.compile(r'''(?P<anomer>[abo?])-
                             (?P<configuration>[LD?])-
                             (?P<modification>[a-z0-9_\-,]*)
                             (?P<base_type>[^-]{3}?)
                             (?P<ring_type>[xpfo?])
                             (?P<substituent>[^-]*?)
                             (?P<linkage>-\([0-9?]-[0-9?]\)-?)?$''', re.VERBOSE)

    def __init__(self, modification_parser=None, substituent_parser=None):
        if modification_parser is None:
            modification_parser = ModificationDeserializer()
        self.modification_parser = modification_parser
        if substituent_parser is None:
            substituent_parser = SubstituentDeserializer()
        self.substituent_parser = substituent_parser

    def has_pattern(self, string):
        """Return the match object for the last residue in `string`, or ``None``."""
        return self.pattern.search(string)

    def extract_pattern(self, monosaccharide_str):
        """Return the named groups matched in `monosaccharide_str` as a dict.

        Raises
        ------
        IUPACError
            If the text does not contain the monosaccharide pattern.
        """
        match = self.pattern.search(monosaccharide_str)
        if match is None:
            raise IUPACError("Cannot find monosaccharide pattern in {}".format(monosaccharide_str))
        return match.groupdict()

    def ring_bounds(self, residue, ring_type):
        """Set ``ring_start``/``ring_end`` on `residue` from the ring symbol.

        ``'p'`` ends the ring four positions after ``ring_start``, ``'f'``
        three positions after; ``'o'`` sets both bounds to ``0`` and any
        other symbol sets both to ``None``.
        """
        if ring_type == 'p':
            residue.ring_end = residue.ring_start + 4
        elif ring_type == 'f':
            residue.ring_end = residue.ring_start + 3
        elif ring_type == 'o':
            residue.ring_end = residue.ring_start = 0
        else:
            residue.ring_end = residue.ring_start = None

    def build_residue(self, match_dict):
        """Construct a residue from the groups produced by :meth:`extract_pattern`.

        Returns
        -------
        tuple
            ``(residue, linkage)`` where `linkage` is the list of digit or
            ``"?"`` characters found in the linkage group (empty when the
            token has no linkage).
        """
        anomer = anomer_map_from[match_dict['anomer']]
        base_type = match_dict["base_type"]
        configuration = match_dict["configuration"].lower()
        ring_type = match_dict['ring_type']
        modification = match_dict['modification']
        linkage = [d for d in match_dict.get('linkage') or "" if d.isdigit() or d == "?"]
        residue = named_structures.monosaccharides[base_type]
        # Recorded before this token's own modifications/substituents are applied.
        base_is_modified = len(residue.substituent_links) + len(residue.modifications) > 0
        if len(residue.configuration) == 1:
            residue.configuration = (configuration,)
        residue.anomer = anomer
        self.ring_bounds(residue, ring_type)
        self.set_modifications(residue, modification)
        self.set_substituents(residue, match_dict['substituent'], base_is_modified, base_type)
        return residue, linkage

    def set_substituents(self, residue, substituent_string, base_is_modified, base_type):
        """Attach the substituents described by `substituent_string` to `residue`.

        Raises
        ------
        ValueError
            If a substituent has no position on a base that already carries
            modifications or substituents (except the first ``acetyl`` /
            ``glycolyl`` on ``"Neu"``, which is placed at position 5), or if
            attaching fails on a base other than ``"Neu"`` / ``"Kdo"``.
        IUPACError
            If a ``"Neu"`` / ``"Kdo"`` substituent cannot be placed even
            after the fallback handling.
        """
        i = 0
        for position, substituent in self.substituent_parser(substituent_string):
            i += 1
            if position == -1 and base_is_modified:
                # Guess at what the user might mean using base_type
                if base_type == "Neu" and substituent in ["acetyl", "glycolyl"] and i == 1:
                    position = 5
                else:
                    raise ValueError(
                        "Cannot have ambiguous location of substituents on a base type which"
                        " has default modifications or substituents. {} {}".format(
                            residue, (position, substituent)))
            # Often, acidic monosaccharides will be suffixed "A" instead of prefixed "a".
            # Handle this here.
            if substituent == "A":
                residue.add_modification(Modification.a, position)
                continue
            substituent = Substituent(substituent)
            try:
                residue.add_substituent(
                    substituent, position,
                    parent_loss=substituent.attachment_composition_loss(), child_loss='H')
            except ValueError:
                # Highly modified large bases have a degenerate encoding, where additional qualifications following
                # base name *replace* an existing substituent. This behavior may not be expected in other more
                # common cases.
                if base_type in {"Neu", "Kdo"}:
                    occupancy = 0
                    try:
                        unplaced = residue.substituent_links[position][0].child
                        residue.drop_substituent(position)
                        if unplaced.name == "amino":
                            try:
                                substituent = aminate_substituent(substituent)
                            except ValueError:
                                pass
                    except ValueError:
                        # The site contains a modification which can be present alongside the substituent
                        occupancy = 1
                    except IndexError:
                        occupancy = 1
                    try:
                        residue.add_substituent(
                            substituent, position, occupancy,
                            parent_loss=substituent.attachment_composition_loss(), child_loss='H')
                    except ValueError:
                        raise IUPACError("Can't resolve %s" % substituent)
                else:
                    raise

    def set_modifications(self, residue, modification_string):
        """Add every modification parsed from `modification_string` to `residue`."""
        for pos, mod in self.modification_parser(modification_string):
            residue.add_modification(mod, pos)

    def monosaccharide_from_iupac(self, monosaccharide_str, parent=None):
        """Parse `monosaccharide_str`, optionally bonding the result to `parent`.

        Returns
        -------
        tuple
            ``(residue, linkage)`` with `linkage` as a list of ints where
            each unparseable position (e.g. ``"?"``) becomes ``-1``.
        """
        match_dict = self.extract_pattern(monosaccharide_str)
        residue, linkage = self.build_residue(match_dict)
        linkage = list(map(tryint, linkage))
        self.add_monosaccharide_bond(residue, parent, linkage)
        return residue, linkage

    def add_monosaccharide_bond(self, residue, parent, linkage):
        """Attach `residue` to `parent` at the positions given by `linkage`.

        `linkage` is ``(child_position, parent_position)``. Nothing happens
        when `parent` is ``None`` or `linkage` is empty. The emptiness test
        replaces ``linkage != ()``, which was always true for the list
        passed by :meth:`monosaccharide_from_iupac`.
        """
        if parent is not None and linkage:
            parent.add_monosaccharide(
                residue, position=linkage[1], child_position=linkage[0])

    def __call__(self, monosaccharide_str, parent=None):
        """Call :meth:`monosaccharide_from_iupac`."""
        return self.monosaccharide_from_iupac(monosaccharide_str, parent=parent)

    def finalize(self, glycan):
        """Hook run on the completed glycan; does nothing in this class."""
        pass
class DerivatizationAwareMonosaccharideDeserializer(MonosaccharideDeserializer):
    """Monosaccharide parser that also understands a ``^{symbol}`` derivatization tag."""

    # Same layout as the base pattern plus an optional ``^...`` derivatization
    # group before the linkage. Whitespace is insignificant (re.VERBOSE).
    pattern = re.compile(r'''(?P<anomer>[abo?])-
                             (?P<configuration>[LD?])-
                             (?P<modification>[a-z0-9_\-,]*)
                             (?P<base_type>[^-]{3}?)
                             (?P<ring_type>[xpfo?])
                             (?P<substituent>[^-]*?)
                             (?P<derivatization>\^[^\s-]*?)?
                             (?P<linkage>-\([0-9?]-[0-9?]\)-?)?$''', re.VERBOSE)

    def add_monosaccharide_bond(self, residue, parent, linkage):
        """Attach `residue` to `parent`, clearing derivatization groups if needed.

        `linkage` is ``(child_position, parent_position)``. Nothing happens
        when `parent` is ``None`` or `linkage` is empty. If the first attempt
        raises ``ValueError``, a substituent flagged ``_derivatize`` at the
        bonding site of either residue is dropped and the bond is retried.
        The emptiness test replaces ``linkage != ()``, which was always true
        for the list passed by :meth:`monosaccharide_from_iupac`.
        """
        if parent is not None and linkage:
            try:
                parent.add_monosaccharide(residue, position=linkage[1], child_position=linkage[0])
            except ValueError:
                parent_substituent_links_at_site = parent.substituent_links[linkage[1]]
                if (parent_substituent_links_at_site and parent_substituent_links_at_site[0].child._derivatize):
                    parent.drop_substituent(linkage[1], parent_substituent_links_at_site[0].child)
                residue_substituent_links_at_site = residue.substituent_links[linkage[0]]
                if residue_substituent_links_at_site and residue_substituent_links_at_site[0].child._derivatize:
                    residue.drop_substituent(linkage[0], residue_substituent_links_at_site[0].child)
                parent.add_monosaccharide(residue, position=linkage[1], child_position=linkage[0])

    def apply_derivatization(self, residue, deriv):
        """Derivatize `residue` with the substituent named by the tag `deriv`.

        Raises
        ------
        IUPACError
            If `deriv` does not start with ``"^"``.
        """
        if deriv.startswith("^"):
            deriv = deriv[1:]
            deriv = self.substituent_parser.symbol_to_name(deriv)
            derivatize(residue, deriv)
        else:
            raise IUPACError("Derivatization Extension Must Start with '^'")

    def monosaccharide_from_iupac(self, monosaccharide_str, parent=None):
        """Parse `monosaccharide_str`, bond it to `parent`, then apply any
        derivatization tag found in the text.

        Returns
        -------
        tuple
            ``(residue, linkage)`` with `linkage` as a list of ints.
        """
        match_dict = self.extract_pattern(monosaccharide_str)
        residue, linkage = self.build_residue(match_dict)
        linkage = list(map(tryint, linkage))
        self.add_monosaccharide_bond(residue, parent, linkage)
        deriv = match_dict.get("derivatization", '')
        if deriv is not None and deriv != "":
            self.apply_derivatization(residue, deriv)
        return residue, linkage

    def finalize(self, glycan):
        """Remove surplus derivatization groups from over-occupied residues.

        For every node whose ``_remaining_capacity()`` is negative, links at
        position ``-1`` whose child is a substituent flagged ``_derivatize``
        are broken until the deficit is covered.

        Raises
        ------
        ValueError
            If the deficit cannot be fully removed.
        """
        for node in glycan:
            neg_capacity = -node._remaining_capacity()
            if neg_capacity > 0:
                unknowns = node.substituent_links[-1]
                to_remove = []
                # Collect first and break afterwards, so the link collection
                # is not modified while it is being iterated.
                for unknown in unknowns:
                    if unknown.child.node_type is Substituent.node_type and unknown.child._derivatize:
                        if neg_capacity > 0:
                            to_remove.append(unknown)
                            neg_capacity -= 1
                        else:
                            break
                for link_to_remove in to_remove:
                    link_to_remove.break_link(refund=True)
                if neg_capacity > 0:
                    raise ValueError("Could not completely remove overload from %s" % (node,))
# Module-level monosaccharide parser instance with the default sub-parsers.
monosaccharide_from_iupac = MonosaccharideDeserializer()
class GlycanDeserializer(object):
    """Parse IUPAC three-letter-code text into a |Glycan|.

    Parameters
    ----------
    monosaccharide_deserializer : object, optional
        Supplies ``has_pattern``, ``__call__``, ``add_monosaccharide_bond``
        and ``finalize``; defaults to a new :class:`MonosaccharideDeserializer`.
    """

    def __init__(self, monosaccharide_deserializer=None):
        if monosaccharide_deserializer is None:
            monosaccharide_deserializer = MonosaccharideDeserializer()
        self.monosaccharide_deserializer = monosaccharide_deserializer

    def add_monosaccharide(self, parent_node, child_node, parent_position, child_position):
        """Bond `child_node` to `parent_node` through the monosaccharide deserializer."""
        self.monosaccharide_deserializer.add_monosaccharide_bond(
            child_node, parent_node, (child_position, parent_position))

    def glycan_from_iupac(self, text, **kwargs):
        """Build a |Glycan| from `text`, reading residues from right to left.

        Parameters
        ----------
        text : str
            The IUPAC text. ``]`` opens a branch and ``[`` closes it, since
            the text is consumed from its end.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        Glycan

        Raises
        ------
        IUPACError
            If a residue cannot be identified, a ``[`` has no matching ``]``,
            or a ``]`` is never closed by a ``[``.
        """
        last_outedge = None
        root = None
        last_residue = None
        branch_stack = []
        while text:
            # If starting a new branch
            if text[-1] == ']':
                branch_stack.append((last_residue, root, last_outedge))
                root = None
                last_residue = None
                last_outedge = None
                text = text[:-1]
            # If ending a branch
            elif text[-1] == '[':
                try:
                    branch_parent, old_root, old_last_outedge = branch_stack.pop()
                    child_position, parent_position = last_outedge
                    self.add_monosaccharide(branch_parent, root, parent_position, child_position)
                    root = old_root
                    last_residue = branch_parent
                    last_outedge = old_last_outedge
                    text = text[:-1]
                except IndexError:
                    raise IUPACError("Bad branching at {}".format(len(text)))
            # Parsing a residue
            else:
                match = self.monosaccharide_deserializer.has_pattern(text)
                if match:
                    next_residue, outedge = self.monosaccharide_deserializer(
                        text[match.start(): match.end()], last_residue)
                    if root is None:
                        # First residue of the structure or of the current branch.
                        last_outedge = outedge
                        root = next_residue
                    last_residue = next_residue
                    text = text[:match.start()]
                else:
                    raise IUPACError("Could not identify residue '...{}' at {}".format(text[-30:], len(text)))
        if branch_stack:
            # A ']' was never matched by a '[': the saved outer structure
            # would otherwise be discarded without any error.
            raise IUPACError("Unclosed branch: {} ']' without a matching '['".format(len(branch_stack)))
        res = Glycan(root)
        self.monosaccharide_deserializer.finalize(res)
        return res

    def __call__(self, text, **kwargs):
        """Call :meth:`glycan_from_iupac`."""
        return self.glycan_from_iupac(text, **kwargs)
# Module-level glycan parser instance with the default monosaccharide parser.
glycan_from_iupac = GlycanDeserializer()
def from_iupac(text, **kwargs):
    """Parse the given text into an instance of |Glycan|. If there is only a single monosaccharide
    in the output, just the Monosaccharide instance is returned.

    Parameters
    ----------
    text : |str|
    **kwargs
        Forwarded to the module-level ``glycan_from_iupac`` parser.

    Returns
    -------
    |Glycan| or |Monosaccharide|
        If the resulting structure is just a single monosaccharide, the returned value is a Monosaccharide.
    """
    res = glycan_from_iupac(text, **kwargs)
    if len(res) > 1:
        return res
    else:
        return res.root
# Aliases for the parse and serialize entry points.
loads = from_iupac
dumps = to_iupac
class IUPACParser(ParserInterface):
    """``ParserInterface`` subclass that parses each line as IUPAC text."""

    def process_result(self, line):
        """Return the structure obtained by passing `line` to ``loads``."""
        return loads(line)
# Register `dumps` under the "iupac" key on both structure classes.
Monosaccharide.register_serializer("iupac", dumps)
Glycan.register_serializer("iupac", dumps)