# Source code for glypy.io.iupac

import re
from collections import deque

from glypy.structure import Monosaccharide, Glycan, constants, named_structures, Substituent
from glypy.composition.structure_composition import substituent_compositions
from glypy.composition.composition_transform import has_derivatization, derivatize
from glypy.io import format_constants_map
from glypy.io.nomenclature import identity
from glypy.utils import invert_dict

from glypy.io.file_utils import ParserInterface


# A static copy of monosaccharide names to structures for copy-free comparison.
# The table is snapshotted once, when this module is imported.
monosaccharide_reference = dict(named_structures.monosaccharides.items())


# IUPAC text writes the anomer stored under 'x' in the shared map as '?'.
anomer_map_from = dict(format_constants_map.anomer_map)
anomer_map_from['?'] = anomer_map_from.pop('x')
# Reverse direction of the table above, used when writing text.
anomer_map_to = invert_dict(anomer_map_from)


# Local shorthands for the enumerations referenced throughout this module.
Stem = constants.Stem
Configuration = constants.Configuration
Modification = constants.Modification
SuperClass = constants.SuperClass


def tryint(i):
    """Convert `i` to an :class:`int`, falling back to ``-1``.

    Parameters
    ----------
    i : object
        Any value accepted by :func:`int`, e.g. a linkage position read
        from text.

    Returns
    -------
    int
        ``int(i)``, or ``-1`` when the value cannot be converted (for
        instance the ``"?"`` placeholder or |None|).
    """
    try:
        return int(i)
    # Only conversion failures map to -1; KeyboardInterrupt, SystemExit and
    # unrelated errors are allowed to propagate.
    except (TypeError, ValueError, OverflowError):
        return -1


class IUPACError(Exception):
    """Error raised by this module when IUPAC text cannot be interpreted."""


def _make_substituent_name(name):
    return ''.join(t.title() for t in name.split("_")).replace("(", "")


# Every known substituent starts out with a symbol generated from its name.
substituents_map_to = dict(
    (name, _make_substituent_name(name)) for name in substituent_compositions
)

# Special Cases: these abbreviations replace the generated symbols.
substituents_map_to.update([
    ('n_acetyl', "NAc"),
    ('n_glycolyl', "NGc"),
    ('sulfate', "S"),
    ("methyl", "Me"),
    ("acetyl", "Ac"),
    ("glycolyl", "Gc"),
    ("fluoro", "F"),
    ("amino", "N"),
])

# Symbol -> substituent name, used when reading text.
substituents_map_from = invert_dict(substituents_map_to)


class SubstituentSerializer(object):
    """Render the substituents of a monosaccharide as IUPAC suffix text.

    Instances are callable; calling one is equivalent to
    :meth:`resolve_substituents`.

    Parameters
    ----------
    monosaccharides : mapping, optional
        Name -> reference structure table used to recognize ``NeuAc``,
        ``NeuGc`` and ``Neu``. Defaults to the module-level
        ``monosaccharide_reference``.
    """

    def __init__(self, monosaccharides=None):
        if monosaccharides is None:
            monosaccharides = monosaccharide_reference
        self.monosaccharide_reference = monosaccharides

    def __call__(self, residue, monosaccharide_reference=None, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        return self.resolve_substituents(residue, monosaccharide_reference)

    def _symbol_for(self, name):
        """Return the symbol for substituent `name`, registering a new one if needed.

        Unknown names get a symbol generated by ``_make_substituent_name``;
        the new pair is stored in both module-level lookup tables so the
        symbol can be read back later.
        """
        try:
            return substituents_map_to[name]
        except KeyError:
            symbol = _make_substituent_name(name)
            substituents_map_to[name] = symbol
            substituents_map_from[symbol] = name
            return symbol

    def serialize_substituent(self, substituent):
        """Return the symbol for a single substituent object (uses its ``name``)."""
        return self._symbol_for(substituent.name)

    def resolve_substituents(self, residue, monosaccharides=None):
        """Build the substituent suffix for `residue`.

        Parameters
        ----------
        residue : Monosaccharide
        monosaccharides : mapping, optional
            Reference table; defaults to the one given at construction.

        Returns
        -------
        str
            The first substituent is written bare as ``{position}{symbol}``;
            every later one is wrapped in parentheses. Positions equal to
            ``-1`` or |None| are written as an empty string.
        """
        if monosaccharides is None:
            monosaccharides = self.monosaccharide_reference
        chunks = []
        for name, pos in self.get_relevant_substituents(residue, monosaccharides):
            if pos in {-1, None}:
                pos = ""
            token = "{}{}".format(pos, self._symbol_for(name))
            # If there is a substituent after the first, successive ones are placed in parentheses
            if chunks:
                token = "({})".format(token)
            chunks.append(token)
        return "".join(chunks)

    def get_relevant_substituents(self, residue, monosaccharides=None):
        '''
        Retrieve the set of substituents not implicitly included
        in the base type's symbol name.

        Substituents flagged ``_derivatize`` are always skipped.

        Returns
        -------
        iterable of (name, position) pairs

        Raises
        ------
        ValueError
            If the residue matches the ``Neu`` reference but carries no
            ``amino`` substituent.
        '''
        if monosaccharides is None:
            monosaccharides = self.monosaccharide_reference
        positions = []
        substituents = []
        for position, sub in residue.substituents():
            if not sub._derivatize:
                positions.append(position)
                substituents.append(sub.name)

        if identity.is_a(residue, monosaccharides["NeuAc"], exact=False, short_circuit=True):
            # The n_acetyl group is written as a plain acetyl at the same position.
            try:
                i = substituents.index("n_acetyl")
            except ValueError:  # pragma: no cover
                pass
            else:
                substituents[i] = "acetyl"
        elif identity.is_a(residue, monosaccharides["NeuGc"], exact=False, short_circuit=True):
            # The substituent list is left unchanged for NeuGc.
            pass
        elif identity.is_a(residue, monosaccharides["Neu"], exact=False, short_circuit=True):
            # The amino group is dropped from the written substituents.
            i = substituents.index("amino")
            substituents.pop(i)
            positions.pop(i)

        return zip(substituents, positions)


# Module-level default instance, callable like a function on a residue.
resolve_substituent = SubstituentSerializer()


class ModificationSerializer(object):
    """Render a residue's modifications as the comma-separated IUPAC prefix.

    Instances are callable; calling one is equivalent to
    :meth:`extract_modifications`.
    """

    @staticmethod
    def _discard_first(positions, mods, mod):
        """Remove the first occurrence of `mod` and its paired position, in place.

        Raises :class:`ValueError` when `mod` is not present.
        """
        ix = mods.index(mod)
        positions.pop(ix)
        mods.pop(ix)

    def extract_modifications(self, modifications, base_type):
        """Build the modification prefix for a residue.

        Parameters
        ----------
        modifications : mapping
            Position -> modification; only ``.items()`` is used, and each
            modification must expose a ``name`` attribute.
        base_type : str
            The symbol chosen for the residue. Modifications treated as part
            of that symbol are left out: ``d``, ``keto`` and ``a`` when the
            symbol contains ``"Neu"`` or ``"Kd"``, and ``d`` when it contains
            ``"Fuc"``.

        Returns
        -------
        str
            Comma-joined ``{position}-{name}`` entries; an entry whose
            position is ``-1`` is written as the bare name.

        Raises
        ------
        ValueError
            If `base_type` contains ``"Fuc"`` but no ``d`` modification exists.
        """
        positions, mods = [], []
        for position, mod in modifications.items():
            positions.append(position)
            mods.append(mod)

        if "Neu" in base_type or "Kd" in base_type:
            for implied in [Modification.d, Modification.keto, Modification.a]:
                try:
                    self._discard_first(positions, mods, implied)
                except ValueError:  # pragma: no cover
                    # Best effort: a missing implied modification is fine here.
                    pass
        elif "Fuc" in base_type:
            self._discard_first(positions, mods, Modification.d)

        buff = []
        for position, mod in zip(positions, mods):
            if position != -1:
                buff.append('{position}-{name}'.format(position=position, name=mod.name))
            else:
                buff.append(mod.name)
        return ','.join(buff)

    def __call__(self, modifications, base_type):
        return self.extract_modifications(modifications, base_type)


# Module-level default instance, callable as extract_modifications(modifications, base_type).
extract_modifications = ModificationSerializer()


class ModificationDeserializer(object):
    """Parse the modification prefix of an IUPAC monosaccharide.

    Instances are callable; calling one is equivalent to
    :meth:`parse_modifications`.
    """

    def parse_modifications(self, modification_string):
        """Split `modification_string` into ``(position, Modification)`` pairs.

        Parameters
        ----------
        modification_string : str
            Comma-separated tokens, each either ``{position}-{name}`` or a
            bare ``{name}``. Empty tokens are skipped.

        Returns
        -------
        list of (int, Modification)
            A token that does not split into exactly two parts on ``-`` is
            treated as a bare name with position ``-1``.

        Raises
        ------
        ValueError
            If the position part is not an integer (raised by :func:`int`).
        """
        pairs = []
        for token in modification_string.split(","):
            if token == '':
                continue
            try:
                pos, mod = token.split("-")
            except ValueError:
                # Unpacking failed: the token is not of the form "pos-name".
                pos = -1
                mod = token
            pairs.append((int(pos), Modification[mod]))
        return pairs

    def __call__(self, modification_string):
        return self.parse_modifications(modification_string)


# Module-level default instance, callable as parse_modifications(modification_string).
parse_modifications = ModificationDeserializer()


class MonosaccharideSerializer(object):
    """Write a single monosaccharide as IUPAC text.

    Parameters
    ----------
    monosaccharides : mapping, optional
        Reference structures; defaults to the module-level
        ``monosaccharide_reference``.
    substituent_resolver : callable, optional
        Called with the residue, returns the substituent suffix. Defaults to
        a ``SubstituentSerializer`` built on the same reference table.
    modification_extractor : callable, optional
        Called with ``(residue.modifications, base_type)``, returns the
        modification prefix. Defaults to a ``ModificationSerializer``.
    """

    def __init__(self, monosaccharides=None, substituent_resolver=None, modification_extractor=None):
        if monosaccharides is None:
            monosaccharides = monosaccharide_reference
        if substituent_resolver is None:
            substituent_resolver = SubstituentSerializer(monosaccharides)
        if modification_extractor is None:
            modification_extractor = ModificationSerializer()
        self.monosaccharide_reference = monosaccharides
        self.substituent_resolver = substituent_resolver
        self.modification_extractor = modification_extractor

    def resolve_special_base_type(self, residue, monosaccharide_reference=None):
        """Return a dedicated symbol for `residue`, or |None| if it has none.

        The symbols produced here are ``"Kdn"``, ``"Neu"``, ``"Kdo"`` and
        ``"Fuc"``; each one is selected by the residue's superclass, stem,
        configuration, modifications and non-derivatized substituents.
        """
        if monosaccharide_reference is None:
            monosaccharide_reference = self.monosaccharide_reference

        if residue.superclass == SuperClass.non:
            if not residue.stem == (Stem.gro, Stem.gal):
                return None
            substituents = [sub.name for p, sub in residue.substituents() if not sub._derivatize]
            modifications = [mod for p, mod in residue.modifications.items()]
            required = (Modification.a, Modification.keto, Modification.d)
            if not all(needed in modifications for needed in required):
                return None
            if len(substituents) == 0:
                return "Kdn"
            # n_acetyl (Ac), n_glycolyl (Gc) and amino all share the Neu symbol
            for marker in ("n_acetyl", "n_glycolyl", "amino"):
                if marker in substituents:
                    return "Neu"
            return None

        if residue.superclass == SuperClass.oct:
            if residue.stem == (Stem.man,):
                by_position = residue.modifications
                if (Modification.a in by_position[1] and
                        Modification.keto in by_position[2] and
                        Modification.d in by_position[3]):
                    return "Kdo"
            return None

        if residue.stem == (Stem.gal,) and residue.configuration == (Configuration.l,):
            if Modification.d in residue.modifications.values():
                return "Fuc"

        return None

    def monosaccharide_to_iupac(self, residue):
        """Return the IUPAC text for `residue`.

        The result is laid out as
        ``{anomer}-{configuration}-{modification}{base_type}{ring_type}{substituent}``.
        """
        anomer = anomer_map_to[residue.anomer]

        leading_configuration = residue.configuration[0]
        if leading_configuration is Configuration.Unknown:
            configuration = "?"
        else:
            configuration = leading_configuration.name.upper()

        base_type = self.resolve_special_base_type(residue)
        if base_type is None:
            # No dedicated symbol: use the single known stem, else the superclass.
            stem = residue.stem
            if len(stem) == 1 and stem[0] is not Stem.Unknown:
                base_type = stem[0].name.title()
            else:
                base_type = residue.superclass.name.title()

        modification = self.modification_extractor(residue.modifications, base_type)
        ring_type = residue.ring_type.name[0]
        substituent = self.substituent_resolver(residue)

        template = "{anomer}-{configuration}-{modification}{base_type}{ring_type}{substituent}"
        return template.format(
            anomer=anomer,
            configuration=configuration,
            modification=modification,
            base_type=base_type,
            ring_type=ring_type,
            substituent=substituent
        )

    def __call__(self, residue):
        return self.monosaccharide_to_iupac(residue)


class DerivatizationAwareMonosaccharideSerializer(MonosaccharideSerializer):
    """Serializer that also records a residue's derivatization.

    When ``has_derivatization`` returns a truthy value for the residue, its
    symbol is appended to the normal IUPAC text after a ``^`` separator.
    """

    def monosaccharide_to_iupac(self, residue):
        plain = super(DerivatizationAwareMonosaccharideSerializer, self).monosaccharide_to_iupac(residue)
        deriv = has_derivatization(residue)
        if not deriv:
            return plain
        symbol = self.substituent_resolver.serialize_substituent(deriv)
        return "%s^%s" % (plain, symbol)


# Module-level default serializer; `resolve_special_base_type` is exposed as
# an alias of that instance's bound method.
monosaccharide_to_iupac = MonosaccharideSerializer()
resolve_special_base_type = monosaccharide_to_iupac.resolve_special_base_type


class GlycanSerializer(object):
    """Write a glycan as IUPAC text, one residue at a time.

    Parameters
    ----------
    monosaccharide_serializer : callable, optional
        Called with a residue, returns its text. Defaults to a
        ``MonosaccharideSerializer``.
    open_edge, close_edge : str
        Text placed before and after each linkage (``-(`` and ``)-``).
    open_branch, close_branch : str
        Text placed around each branch (``[`` and ``]``).
    """

    def __init__(self, monosaccharide_serializer=None, open_edge='-(', close_edge=')-',
                 open_branch='[', close_branch=']'):
        if monosaccharide_serializer is None:
            monosaccharide_serializer = MonosaccharideSerializer()
        self.monosaccharide_serializer = monosaccharide_serializer

        self.open_edge = open_edge
        self.close_edge = close_edge
        self.open_branch = open_branch
        self.close_branch = close_branch

    def glycan_to_iupac(self, structure=None, attach=None, is_branch=False):
        '''
        Translate a |Glycan| structure into IUPAC Three Letter Code.
        Recursively operates on branches.

        Parameters
        ----------
        structure: Glycan or Monosaccharide
            The glycan to be translated. Translation starts from `glycan.root` if `structure`
            is a |Glycan|.
        attach: link object or None
            The link joining the first residue to its parent; its
            ``child_position`` and ``parent_position`` are written into the
            text. Used for recursively handling branches. Defaults to |None|.
        is_branch: bool
            |True| when the first residue is the start of a branch; its
            linkage text then loses a trailing ``-``.

        Returns
        -------
        deque
            Text fragments in output order; join them to get the final string.
        '''
        base = structure.root if isinstance(structure, Glycan) else structure
        stack = [(attach, base)]
        outstack = deque()
        while stack:
            outedge, node = stack.pop()
            link_text = ""
            if outedge is not None:
                link_text = "{oe}{attach}-{outedge_pos}{ce}".format(
                    outedge_pos=outedge.parent_position,
                    attach=outedge.child_position,
                    oe=self.open_edge, ce=self.close_edge)
            # The linkage that ends a branch is written without its trailing
            # dash. `endswith` keeps this safe when there is no linkage text.
            if is_branch and link_text.endswith('-'):
                link_text = link_text[:-1]
            outstack.appendleft('{node}{link}'.format(
                node=self.monosaccharide_serializer(node), link=link_text))
            # Reset for next pass through the loop
            is_branch = False
            children = [(p, link) for p, link in node.links.items() if link.is_parent(node)]
            if children:
                # All but the last child are emitted as bracketed branches;
                # the last child continues the main chain.
                for pos, child_link in children[:-1]:
                    branch = '{ob}{branch}{cb}'.format(
                        branch=''.join(self.glycan_to_iupac(child_link.child, child_link, is_branch=True)),
                        ob=self.open_branch,
                        cb=self.close_branch
                    )
                    outstack.appendleft(branch)
                pos, child_link = children[-1]
                stack.append((child_link, child_link.child))
        return outstack

    def __call__(self, structure):
        return ''.join(self.glycan_to_iupac(structure))


# Module-level default instance; calling it returns the joined IUPAC string.
glycan_to_iupac = GlycanSerializer()


def to_iupac(structure):
    """Translate `structure` into its textual representation using IUPAC Three Letter Code

    Parameters
    ----------
    structure : |Glycan| or |Monosaccharide|
        The structure to be translated. A |Monosaccharide| is written on its
        own; anything else is handed to the glycan serializer.

    Returns
    -------
    |str|
    """
    if isinstance(structure, Monosaccharide):
        return monosaccharide_to_iupac(structure)
    return glycan_to_iupac(structure)


def aminate_substituent(substituent):
    """Return the ``n_``-prefixed counterpart of `substituent`.

    A substituent whose name already starts with ``n_`` is returned as is.
    Otherwise a new ``Substituent`` named ``"n_" + name`` is built.

    Raises
    ------
    ValueError
        If the new substituent's composition compares equal to ``{}``.
    """
    if not substituent.name.startswith("n_"):
        aminated = Substituent("n_" + substituent.name)
        if aminated.composition == {}:
            raise ValueError("Could not aminate substituent")
        return aminated
    # already aminated
    return substituent


# Verbose-mode pattern for one residue anchored at the END of the text.
# Named groups, in order: anomer, configuration, modification (lower-case
# letters, digits, '_', '-', ','), base_type (three characters), ring_type,
# substituent (no '-'), and an optional linkage such as "-(1-4)-".
# MonosaccharideDeserializer.pattern holds the same expression.
monosaccharide_parser = re.compile(r'''(?P<anomer>[abo?])-
                                       (?P<configuration>[LD?])-
                                       (?P<modification>[a-z0-9_\-,]*)
                                       (?P<base_type>[^-]{3}?)
                                       (?P<ring_type>[xpfo?])
                                       (?P<substituent>[^-]*?)
                                       (?P<linkage>-\([0-9?]-[0-9?]\)-?)?$''', re.VERBOSE)


class SubstituentDeserializer(object):
    """Parse the substituent suffix of an IUPAC monosaccharide.

    Instances are callable; calling one is equivalent to
    :meth:`substituent_from_iupac`.
    """

    # Optional leading digits (the position) followed by the symbol itself.
    # An explicit match is used rather than ``re.split(r"(\d+)?", ...)``:
    # since Python 3.7 ``re.split`` also splits on empty matches, which breaks
    # the symbol into single characters.
    _position_and_symbol = re.compile(r"(\d*)(.*)", re.DOTALL)

    def substituent_from_iupac(self, substituents):
        """Yield ``(position, name)`` for each substituent in `substituents`.

        Parameters
        ----------
        substituents : str
            Suffix text such as ``"5Ac"`` or ``"5Ac(9S)"``; entries are
            separated by parentheses.

        Yields
        ------
        (int, str)
            The position (``-1`` when no leading digits are given) and the
            substituent name. The symbol ``"A"`` is not a substituent and is
            yielded unchanged for the caller to handle. Any other unknown
            symbol triggers a warning and is skipped.
        """
        import warnings

        for part in re.split(r"\(|\)", substituents):
            if part == "":
                continue
            digits, name = self._position_and_symbol.match(part).groups()
            position = int(digits) if digits else -1
            try:
                name = self.symbol_to_name(name)
            except KeyError:
                # Acidic special case:
                # Often, acidic monosaccharides are written with a trailing A like a substituent while
                # GlycoCT treats acidic groups as modifications. If an A appears in the substituent suffix
                # it will fail to be cast as a substituent, but pass it along raw and it will be handled
                # downstream by :func:`monosaccharide_from_iupac`.
                if name != "A":  # pragma: no cover
                    warnings.warn("No translation rule found to convert %s into a Substituent" % name)
                    continue
            yield position, name

    def symbol_to_name(self, symbol):
        """Return the substituent name for `symbol`; raises KeyError if unknown."""
        name = substituents_map_from[symbol]
        return name

    def __call__(self, substituents):
        return self.substituent_from_iupac(substituents)


# Module-level default instance, callable as substituent_from_iupac(text).
substituent_from_iupac = SubstituentDeserializer()


class MonosaccharideDeserializer(object):
    """Build a monosaccharide (and its linkage to a parent) from IUPAC text.

    Parameters
    ----------
    modification_parser : callable, optional
        Called with the modification prefix, returns ``(position, modification)``
        pairs. Defaults to a ``ModificationDeserializer``.
    substituent_parser : callable, optional
        Called with the substituent suffix, yields ``(position, name)`` pairs.
        Defaults to a ``SubstituentDeserializer``.
    """

    # Verbose-mode pattern for one residue anchored at the end of the text.
    # Named groups: anomer, configuration, modification, base_type,
    # ring_type, substituent and an optional linkage.
    pattern = re.compile(r'''(?P<anomer>[abo?])-
                             (?P<configuration>[LD?])-
                             (?P<modification>[a-z0-9_\-,]*)
                             (?P<base_type>[^-]{3}?)
                             (?P<ring_type>[xpfo?])
                             (?P<substituent>[^-]*?)
                             (?P<linkage>-\([0-9?]-[0-9?]\)-?)?$''', re.VERBOSE)

    def __init__(self, modification_parser=None, substituent_parser=None):
        if modification_parser is None:
            modification_parser = ModificationDeserializer()
        self.modification_parser = modification_parser
        if substituent_parser is None:
            substituent_parser = SubstituentDeserializer()
        self.substituent_parser = substituent_parser

    def has_pattern(self, string):
        """Return the match object for the last residue in `string`, or |None|."""
        return self.pattern.search(string)

    def extract_pattern(self, monosaccharide_str):
        """Return the named groups of the residue pattern as a dict.

        Raises
        ------
        IUPACError
            If `monosaccharide_str` does not contain the pattern.
        """
        match = self.pattern.search(monosaccharide_str)
        if match is None:
            raise IUPACError("Cannot find monosaccharide pattern in {}".format(monosaccharide_str))
        match_dict = match.groupdict()
        return match_dict

    def ring_bounds(self, residue, ring_type):
        """Set ``residue.ring_end`` (and possibly ``ring_start``) from the ring symbol.

        ``'p'`` ends the ring 4 past ``ring_start``, ``'f'`` 3 past it,
        ``'o'`` sets both bounds to 0, and any other symbol sets both to |None|.
        """
        if ring_type == 'p':
            residue.ring_end = residue.ring_start + 4
        elif ring_type == 'f':
            residue.ring_end = residue.ring_start + 3
        elif ring_type == 'o':
            residue.ring_end = residue.ring_start = 0
        else:
            residue.ring_end = residue.ring_start = None

    def build_residue(self, match_dict):
        """Create and configure a residue from the groups of a pattern match.

        Returns
        -------
        (residue, linkage)
            `linkage` is the list of position characters (digits or ``?``)
            found in the linkage group, still as strings; it is empty when
            the text carried no linkage.
        """
        anomer = anomer_map_from[match_dict['anomer']]
        base_type = match_dict["base_type"]
        configuration = match_dict["configuration"].lower()
        ring_type = match_dict['ring_type']

        modification = match_dict['modification']

        linkage = [d for d in match_dict.get('linkage') or "" if d.isdigit() or d == "?"]

        # NOTE(review): the looked-up structure is mutated below, so this
        # presumably returns a fresh copy on each access -- confirm.
        residue = named_structures.monosaccharides[base_type]
        # Recorded before anything is added, so it reflects the base type alone.
        base_is_modified = len(residue.substituent_links) + len(residue.modifications) > 0

        # Only single-configuration residues take the configuration from the text.
        if len(residue.configuration) == 1:
            residue.configuration = (configuration,)

        residue.anomer = anomer
        self.ring_bounds(residue, ring_type)
        self.set_modifications(residue, modification)
        self.set_substituents(residue, match_dict['substituent'], base_is_modified, base_type)
        return residue, linkage

    def set_substituents(self, residue, substituent_string, base_is_modified, base_type):
        """Attach the substituents described by `substituent_string` to `residue`.

        Raises
        ------
        ValueError
            If a substituent has no position on a base type that already has
            modifications or substituents (except the ``Neu`` guess below),
            or if adding fails on a base type other than ``Neu``/``Kdo``.
        IUPACError
            If the substituent still cannot be added to a ``Neu``/``Kdo``
            residue after the occupied site has been handled.
        """
        i = 0
        for position, substituent in self.substituent_parser(substituent_string):
            i += 1
            if position == -1 and base_is_modified:
                # Guess at what the user might mean using base_type
                if base_type == "Neu" and substituent in ["acetyl", "glycolyl"] and i == 1:
                    position = 5
                else:
                    raise ValueError(
                        "Cannot have ambiguous location of substituents on a base type which"
                        " has default modifications or substituents. {} {}".format(
                            residue, (position, substituent)))
            # Often, acidic monosaccharides will be suffixed "A" instead of prefixed "a".
            # Handle this here.
            if substituent == "A":
                residue.add_modification(Modification.a, position)
                continue

            substituent = Substituent(substituent)
            try:
                residue.add_substituent(
                    substituent, position,
                    parent_loss=substituent.attachment_composition_loss(), child_loss='H')
            except ValueError:
                # Highly modified large bases have a degenerate encoding, where additional qualifications following
                # base name *replace* an existing substituent. This behavior may not be expected in other more
                # common cases.
                if base_type in {"Neu", "Kdo"}:
                    occupancy = 0
                    try:
                        unplaced = residue.substituent_links[position][0].child
                        residue.drop_substituent(position)
                        # A replaced amino group turns the new substituent
                        # into its "n_" form when one exists.
                        if unplaced.name == "amino":
                            try:
                                substituent = aminate_substituent(substituent)
                            except ValueError:
                                pass
                    except ValueError:
                        # The site contains a modification which can be present alongside the substituent
                        occupancy = 1
                    except IndexError:
                        # No substituent link was found at this position.
                        occupancy = 1
                    try:
                        residue.add_substituent(
                            substituent, position, occupancy,
                            parent_loss=substituent.attachment_composition_loss(), child_loss='H')
                    except ValueError:
                        raise IUPACError("Can't resolve %s" % substituent)
                else:
                    raise

    def set_modifications(self, residue, modification_string):
        """Add every modification parsed from `modification_string` to `residue`."""
        for pos, mod in self.modification_parser(modification_string):
            residue.add_modification(mod, pos)

    def monosaccharide_from_iupac(self, monosaccharide_str, parent=None):
        """Parse one residue and, when `parent` is given, bond it to `parent`.

        Returns
        -------
        (residue, linkage)
            `linkage` is a list of ints; ``?`` positions become ``-1``.
        """
        match_dict = self.extract_pattern(monosaccharide_str)
        residue, linkage = self.build_residue(match_dict)
        linkage = list(map(tryint, linkage))

        self.add_monosaccharide_bond(residue, parent, linkage)
        return residue, linkage

    def add_monosaccharide_bond(self, residue, parent, linkage):
        """Attach `residue` to `parent`; `linkage` is ``(child_position, parent_position)``."""
        # NOTE(review): monosaccharide_from_iupac passes `linkage` as a list,
        # and a list never equals `()`, so this test is always true on that
        # path; an empty list with a parent reaches the indexing below and
        # raises IndexError. Confirm whether skipping or raising is intended.
        if parent is not None and linkage != ():
            parent.add_monosaccharide(
                residue, position=linkage[1], child_position=linkage[0])

    def __call__(self, monosaccharide_str, parent=None):
        return self.monosaccharide_from_iupac(monosaccharide_str, parent=parent)

    def finalize(self, glycan):
        """Hook called once the whole glycan is built; does nothing here."""
        pass


class DerivatizationAwareMonosaccharideDeserializer(MonosaccharideDeserializer):
    """Deserializer that understands a ``^{symbol}`` derivatization suffix.

    Derivatization substituents sitting on a linkage site are dropped when
    they block a glycosidic bond, and :meth:`finalize` removes the ones that
    overload a residue once the whole glycan has been assembled.
    """

    # Same residue pattern as the parent class plus an optional
    # ``derivatization`` group between the substituents and the linkage.
    pattern = re.compile(r'''(?P<anomer>[abo?])-
                             (?P<configuration>[LD?])-
                             (?P<modification>[a-z0-9_\-,]*)
                             (?P<base_type>[^-]{3}?)
                             (?P<ring_type>[xpfo?])
                             (?P<substituent>[^-]*?)
                             (?P<derivatization>\^[^\s-]*?)?
                             (?P<linkage>-\([0-9?]-[0-9?]\)-?)?$''', re.VERBOSE)

    @staticmethod
    def _release_derivatized_site(node, position):
        """Drop the first substituent at `position` if it is a derivatization."""
        links_at_site = node.substituent_links[position]
        if links_at_site and links_at_site[0].child._derivatize:
            node.drop_substituent(position, links_at_site[0].child)

    def add_monosaccharide_bond(self, residue, parent, linkage):
        """Bond `residue` to `parent`, clearing derivatized sites if the first try fails."""
        if parent is None or linkage == ():
            return
        try:
            parent.add_monosaccharide(residue, position=linkage[1], child_position=linkage[0])
        except ValueError:
            # Free the parent's site first, then the child's, and try once more.
            self._release_derivatized_site(parent, linkage[1])
            self._release_derivatized_site(residue, linkage[0])

            parent.add_monosaccharide(residue, position=linkage[1], child_position=linkage[0])

    def apply_derivatization(self, residue, deriv):
        """Derivatize `residue` with the substituent named by ``^{symbol}`` text.

        Raises
        ------
        IUPACError
            If `deriv` does not start with ``^``.
        """
        if not deriv.startswith("^"):
            raise IUPACError("Derivatization Extension Must Start with '^'")
        symbol = deriv[1:]
        derivatize(residue, self.substituent_parser.symbol_to_name(symbol))

    def monosaccharide_from_iupac(self, monosaccharide_str, parent=None):
        """Parse one residue, bond it to `parent`, then apply any derivatization."""
        match_dict = self.extract_pattern(monosaccharide_str)
        residue, raw_linkage = self.build_residue(match_dict)
        linkage = [tryint(position) for position in raw_linkage]

        self.add_monosaccharide_bond(residue, parent, linkage)

        deriv = match_dict.get("derivatization", '')
        if deriv not in (None, ""):
            self.apply_derivatization(residue, deriv)

        return residue, linkage

    def finalize(self, glycan):
        """Remove derivatization substituents from overloaded residues.

        Only links stored under position ``-1`` are considered.

        Raises
        ------
        ValueError
            If a residue is still overloaded after all eligible links are removed.
        """
        for node in glycan:
            overload = -node._remaining_capacity()
            if overload <= 0:
                continue
            removable = []
            for candidate in node.substituent_links[-1]:
                child = candidate.child
                if not (child.node_type is Substituent.node_type and child._derivatize):
                    continue
                if overload <= 0:
                    break
                removable.append(candidate)
                overload -= 1
            for link_to_remove in removable:
                link_to_remove.break_link(refund=True)
            if overload > 0:
                raise ValueError("Could not completely remove overload from %s" % (node,))


# Module-level default instance; calling it returns (residue, linkage).
monosaccharide_from_iupac = MonosaccharideDeserializer()


class GlycanDeserializer(object):
    """Parse IUPAC text into a |Glycan|.

    Parameters
    ----------
    monosaccharide_deserializer : callable, optional
        Object providing ``has_pattern``, ``add_monosaccharide_bond``,
        ``finalize`` and a call interface returning ``(residue, linkage)``.
        Defaults to a ``MonosaccharideDeserializer``.
    """

    def __init__(self, monosaccharide_deserializer=None):
        if monosaccharide_deserializer is None:
            monosaccharide_deserializer = MonosaccharideDeserializer()
        self.monosaccharide_deserializer = monosaccharide_deserializer

    def add_monosaccharide(self, parent_node, child_node, parent_position, child_position):
        """Bond `child_node` to `parent_node` through the residue deserializer."""
        self.monosaccharide_deserializer.add_monosaccharide_bond(
            child_node, parent_node, (child_position, parent_position))

    def glycan_from_iupac(self, text, **kwargs):
        """Parse `text` from its END towards its start and return a |Glycan|.

        Because the text is consumed right to left, a ``]`` opens a branch
        and a ``[`` closes it. Extra keyword arguments are accepted and ignored.

        Raises
        ------
        IUPACError
            If no residue can be recognized at the current end of the text,
            if a ``[`` has no matching ``]``, or if a ``]`` is never closed
            by a ``[``.
        """
        last_outedge = None
        root = None
        last_residue = None
        branch_stack = []

        while len(text) > 0:

            # If starting a new branch
            if text[-1] == ']':
                # Save the current chain so it can be resumed after the branch.
                branch_stack.append((last_residue, root, last_outedge))
                root = None
                last_residue = None
                last_outedge = None
                text = text[:-1]
            # If ending a branch
            elif text[-1] == '[':
                try:
                    branch_parent, old_root, old_last_outedge = branch_stack.pop()
                    # `last_outedge` is the linkage of the branch's own root.
                    child_position, parent_position = last_outedge
                    self.add_monosaccharide(branch_parent, root, parent_position, child_position)
                    root = old_root
                    last_residue = branch_parent
                    last_outedge = old_last_outedge
                    text = text[:-1]
                except IndexError:
                    raise IUPACError("Bad branching at {}".format(len(text)))
            # Parsing a residue
            else:
                match = self.monosaccharide_deserializer.has_pattern(text)
                if match:
                    next_residue, outedge = self.monosaccharide_deserializer(
                        text[match.start(): match.end()], last_residue)
                    if root is None:
                        last_outedge = outedge
                        root = next_residue
                    last_residue = next_residue
                    text = text[:match.start()]
                else:
                    raise IUPACError("Could not identify residue '...{}' at {}".format(text[-30:], len(text)))

        # A ']' that never met its '[' leaves a saved chain behind; without
        # this check only the unfinished branch would be returned.
        if branch_stack:
            raise IUPACError("Bad branching: {} unclosed branch(es)".format(len(branch_stack)))

        res = Glycan(root)
        self.monosaccharide_deserializer.finalize(res)
        return res

    def __call__(self, text, **kwargs):
        return self.glycan_from_iupac(text, **kwargs)


# Module-level default instance, callable as glycan_from_iupac(text).
glycan_from_iupac = GlycanDeserializer()


def from_iupac(text, **kwargs):
    """Parse the given text into an instance of |Glycan|. If there is only a single monosaccharide
    in the output, just the Monosaccharide instance is returned.

    Parameters
    ----------
    text : |str|
    **kwargs
        Passed through to the glycan deserializer.

    Returns
    -------
    |Glycan| or |Monosaccharide|
        If the resulting structure is just a single monosaccharide, the returned value is a Monosaccharide.
    """
    res = glycan_from_iupac(text, **kwargs)
    if len(res) > 1:
        return res
    return res.root


# Short aliases for the module's reading and writing entry points.
loads = from_iupac
dumps = to_iupac


class IUPACParser(ParserInterface):
    """``ParserInterface`` subclass whose ``process_result`` parses IUPAC text."""

    def process_result(self, line):
        """Return the structure obtained by passing `line` to ``loads``."""
        return loads(line)


# Register `dumps` under the name "iupac" on both structure classes.
Monosaccharide.register_serializer("iupac", dumps)
Glycan.register_serializer("iupac", dumps)