Source code for glypy.composition.composition

# Credit to Pyteomics - http://pythonhosted.org/pyteomics - for majority of design
import re
import math
from collections import defaultdict
from .mass_dict import nist_mass
from .base import ChemicalCompositionError, composition_factory

import os

use_cython = bool(int(os.environ.get("USE_CYTHON_COMPOSITION", 1)))

try:
    from .ccomposition import CComposition, calculate_mass as ccalculate_mass
except ImportError:  # pragma: no cover
    use_cython = False

# Forward Declaration

_isotope_string = r'^([A-Z][a-z+]*)(?:\[(\d+)\])?$'
_atom = r'([A-Z][a-z+]*)(?:\[(\d+)\])?([+-]?\d+)?'
_formula = r'^({})*$'.format(_atom)
formula_pattern = re.compile(_formula)


[docs]def _make_isotope_string(element_name, isotope_num): """Form a string label for an isotope.""" if isotope_num == 0: return element_name else: return '%s[%d]' % (element_name, isotope_num)
[docs]def _parse_isotope_string(label): """Parse an string with an isotope label and return the element name and the isotope number. >>> _parse_isotope_string('C') ('C', 0) >>> _parse_isotope_string('C[12]') ('C', 12) """ element_name, num = re.match(_isotope_string, label).groups() isotope_num = int(num) if num else 0 return element_name, isotope_num
[docs]class PComposition(defaultdict): '''Represent arbitrary elemental compositions''' def __str__(self): # pragma: no cover return 'Composition({})'.format(dict.__repr__(self)) def __repr__(self): # pragma: no cover return str(self) def __iadd__(self, other): for elem, cnt in (other.items()): self[elem] += cnt self._mass_args = None return self def __add__(self, other): result = self.clone() for elem, cnt in other.items(): result[elem] += cnt return result def __radd__(self, other): return self + other def __isub__(self, other): for elem, cnt in other.items(): self[elem] -= cnt self._mass_args = None return self def __sub__(self, other): result = self.clone() for elem, cnt in other.items(): result[elem] -= cnt return result def __rsub__(self, other): return (self - other) * (-1) def __mul__(self, other): if not isinstance(other, int): raise ChemicalCompositionError( 'Cannot multiply Composition by non-integer', other) prod = {} for k, v in self.items(): prod[k] = v * other return Composition(prod) def __rmul__(self, other): return self * other def __eq__(self, other): if not isinstance(other, dict): return False self_items = set([i for i in self.items() if i[1]]) other_items = set([i for i in other.items() if i[1]]) return self_items == other_items def __neg__(self): return -1 * self def __reduce__(self): return composition_factory, (list(self),), self.__getstate__() def __getstate__(self): return dict(self) def __setstate__(self, d): self._from_dict(d) # Override the default behavior, if a key is not present # do not initialize it to 0. def __missing__(self, key): return 0 def __setitem__(self, key, value): if isinstance(value, float): value = int(round(value)) elif not isinstance(value, (int)): raise ChemicalCompositionError( 'Only integers allowed as values in Composition, got {}.'.format(type(value).__name__)) if value: # Will not occur on 0 as 0 is falsey AND an integer super(PComposition, self).__setitem__(key, value) elif key in self: del self[key] self._mass_args = None def clone(self): return PComposition(self) copy = clone def update(self, *args, **kwargs): dict.update(self, *args, **kwargs) self._mass_args = None def _from_formula(self, formula, mass_data): if '(' in formula: return self._from_formula_parens(formula, mass_data) if not formula_pattern.match(formula): raise ChemicalCompositionError('Invalid formula: ' + formula) for elem, isotope, number in re.findall(_atom, formula): if elem not in mass_data: raise ChemicalCompositionError('Unknown chemical element: ' + elem) self[_make_isotope_string(elem, int(isotope) if isotope else 0) ] += int(number) if number else 1 def _from_formula_parens(self, formula, mass_data): # pragma: no cover # Parsing a formula backwards. prev_chem_symbol_start = len(formula) i = len(formula) - 1 seek_mode = 0 parse_stack = "" resolve_stack = [] group_coef = None while i >= 0: if seek_mode < 1: if (formula[i] == ")"): seek_mode += 1 if i + 1 == prev_chem_symbol_start: group_coef = 1 elif formula[i + 1].isdigit(): group_coef = int(formula[i + 1:prev_chem_symbol_start]) i -= 1 continue # Read backwards until a non-number character is met. if (formula[i].isdigit() or formula[i] == '-'): i -= 1 continue else: # If the number of atoms is omitted then it is 1. if i + 1 == prev_chem_symbol_start: num_atoms = 1 else: try: num_atoms = int(formula[i + 1:prev_chem_symbol_start]) except ValueError: raise ChemicalCompositionError( 'Badly-formed number of atoms: %s' % formula) # Read isotope number if specified, else it is undefined (=0). if formula[i] == ']': brace_pos = formula.rfind('[', 0, i) if brace_pos == -1: raise ChemicalCompositionError( 'Badly-formed isotope number: %s' % formula) try: isotope_num = int(formula[brace_pos + 1:i]) except ValueError: raise ChemicalCompositionError( 'Badly-formed isotope number: %s' % formula) i = brace_pos - 1 else: isotope_num = 0 # Match the element name to the mass_data. element_found = False # Sort the keys from longest to shortest to workaround # the overlapping keys issue for element_name in sorted(mass_data, key=len, reverse=True): if formula.endswith(element_name, 0, i + 1): isotope_string = _make_isotope_string( element_name, isotope_num) self[isotope_string] += num_atoms i -= len(element_name) prev_chem_symbol_start = i + 1 element_found = True break if not element_found: raise ChemicalCompositionError( 'Unknown chemical element in the formula: %s' % formula) else: ch = formula[i] parse_stack += ch i -= 1 if(ch == "("): seek_mode -= 1 if seek_mode == 0: resolve_stack.append(Composition( # Omit the last character, then reverse the parse # stack string. formula=parse_stack[:-1][::-1], mass_data=mass_data) * group_coef) prev_chem_symbol_start = i + 1 seek_mode = False parse_stack = "" elif(formula[i] == ")"): seek_mode += 1 else: # continue to accumulate tokens pass # Unspool the resolve stack, adding together the chunks # at this level. __add__ operates immutably, so must manually # loop through each chunk. for chunk in resolve_stack: for elem, cnt in chunk.items(): self[elem] += cnt def _from_dict(self, comp): # for isotope_string, num_atoms in comp.items(): # element_name, isotope_num = _parse_isotope_string( # isotope_string) # # Remove explicitly undefined isotopes (e.g. X[0]). # self[_make_isotope_string(element_name, isotope_num)] = ( # num_atoms) self.update(comp) def calc_mass(self, average=False, charge=None, mass_data=None): mdid = id(mass_data) if self._mass_args is not None and average is self._mass_args[0]\ and charge == self._mass_args[1] and mdid == self._mass_args[2]: return self._mass else: self._mass_args = (average, charge, mdid) self._mass = pcalculate_mass(composition=self, average=average, charge=charge, mass_data=mass_data) return self._mass @property def mass(self): return self.calc_mass()
[docs] def __init__(self, *args, **kwargs): """ A Composition object stores a chemical composition of a substance. Basically it is a dict object, in which keys are the names of chemical elements and values contain integer numbers of corresponding atoms in a substance. The main improvement over dict is that Composition objects allow addition and subtraction. If ``formula`` is not specified, the constructor will look at the first positional argument and try to build the object from it. Without positional arguments, a Composition will be constructed directly from keyword arguments. Parameters ---------- formula : str, optional A string with a chemical formula. All elements must be present in `mass_data`. mass_data : dict, optional A dict with the masses of chemical elements (the default value is :py:data:`nist_mass`). It is used for formulae parsing only. """ defaultdict.__init__(self, int) mass_data = kwargs.get('mass_data', nist_mass) kw_sources = set( ('formula',)) kw_given = kw_sources.intersection(kwargs) if len(kw_given) > 1: raise ChemicalCompositionError('Only one of {} can be specified!\n\ Given: {}'.format(', '.join(kw_sources), ', '.join(kw_given))) elif kw_given: kwa = kw_given.pop() getattr(self, '_from_' + kwa)(kwargs[kwa], mass_data) # can't build from kwargs elif args: if isinstance(args[0], dict): self._from_dict(args[0]) elif isinstance(args[0], str): try: self._from_formula(args[0], mass_data) except ChemicalCompositionError: raise ChemicalCompositionError( 'Could not create a Composition object from ' 'string: "{}": not a valid sequence or ' 'formula'.format(args[0])) else: self._from_dict(kwargs) self._mass = None self._mass_args = None
[docs]def pcalculate_mass(composition=None, formula=None, average=False, charge=None, mass_data=None): """Calculates the monoisotopic mass of a chemical formula or Composition object. Parameters ---------- composition : Composition, optional A Composition object with the elemental composition of a substance. average : bool, optional If :py:const:`True` then the average mass is calculated. Note that mass is not averaged for elements with specified isotopes. Default is :py:const:`False`. charge : int, optional If not 0 then m/z is calculated: the mass is increased by the corresponding number of proton masses and divided by z. mass_data : dict, optional A dict with the masses of the chemical elements (the default value is :py:data:`nist_mass`). Returns ------- mass : float """ mass_data = mass_data or nist_mass # Make a deep copy of `composition` keyword argument. composition = composition if composition else PComposition(formula) # Get charge. old_charge = composition['H+'] if charge is not None: if old_charge != 0: raise ChemicalCompositionError( 'Charge is specified both by the number of protons and ' '`charge` in kwargs') composition['H+'] = charge # Calculate mass. mass = 0.0 for isotope_string in composition: element_name, isotope_num = _parse_isotope_string(isotope_string) # Calculate average mass if required and the isotope number is # not specified. if (not isotope_num) and average: for isotope in mass_data[element_name]: if isotope != 0: mass += (composition[element_name] * mass_data[element_name][isotope][0] * mass_data[element_name][isotope][1]) else: mass += (composition[isotope_string] * mass_data[element_name][isotope_num][0]) # Calculate m/z if required. if charge: mass /= abs(charge) if old_charge != 0: composition["H+"] = old_charge elif charge: del composition["H+"] return mass
# Checks to see if the Cython versions are available if use_cython: # pragma: no cover Composition = CComposition calculate_mass = ccalculate_mass else: # pragma: no cover Composition = PComposition calculate_mass = pcalculate_mass
[docs]def most_probable_isotopic_composition(*args, **kwargs): """Calculate the most probable isotopic composition of a chemical formula or |Composition| object. For each element, only two most abundant isotopes are considered. Parameters ---------- formula : str, optional A string with a chemical formula. composition : |Composition|, optional A :py:class:`Composition` object with the elemental composition of a substance. elements_with_isotopes : list of str A list of elements to be considered in isotopic distribution (by default, every element has a isotopic distribution). mass_data : dict, optional A dict with the masses of chemical elements (. Returns ------- out: tuple (|Composition|, float) A tuple with the most probable isotopic composition and its relative abundance. """ composition = (dict(kwargs['composition']) if 'composition' in kwargs else Composition(*args, **kwargs)) # Removing isotopes from the composition. for isotope_string in composition: element_name, isotope_num = _parse_isotope_string(isotope_string) if isotope_num: composition[element_name] += composition.pop(isotope_string) mass_data = kwargs.get('mass_data', nist_mass) elements_with_isotopes = kwargs.get('elements_with_isotopes') isotopic_composition = Composition() for element_name in composition: if element_name == 'H+': continue if (not elements_with_isotopes or (element_name in elements_with_isotopes)): # Take the two most abundant isotopes. first_iso, second_iso = sorted( [(i[0], i[1][1]) for i in mass_data[element_name].items() if i[0]], key=lambda x: -x[1])[:2] # Write the number of isotopes of the most abundant type. first_iso_str = _make_isotope_string(element_name, first_iso[0]) isotopic_composition[first_iso_str] = round(int(math.ceil(composition[element_name])) * first_iso[1]) # Write the number of the second isotopes. second_iso_str = _make_isotope_string(element_name, second_iso[0]) isotopic_composition[second_iso_str] = round( (composition[element_name] - isotopic_composition[first_iso_str])) else: isotopic_composition[element_name] = composition[element_name] return (isotopic_composition, isotopic_composition_abundance( composition=isotopic_composition, mass_data=mass_data))
[docs]def isotopic_composition_abundance(*args, **kwargs): """Calculate the relative abundance of a given isotopic composition of a molecule. Parameters ---------- formula : str, optional A string with a chemical formula. composition : Composition, optional A Composition object with the isotopic composition of a substance. mass_data : dict, optional A dict with the masses of chemical elements (the default value is :py:data:`nist_mass`). Returns ------- relative_abundance : float The relative abundance of a given isotopic composition. """ composition = (Composition(kwargs['composition']) if 'composition' in kwargs else Composition(*args, **kwargs)) isotopic_composition = defaultdict(dict) # Check if there are default and non-default isotopes of the same # element and rearrange the elements. for element in composition: if element == 'H+': continue element_name, isotope_num = _parse_isotope_string(element) # If there is already an entry for this element and either it # contains a default isotope or newly added isotope is default # then raise an exception. if ((element_name in isotopic_composition) and (isotope_num == 0 or 0 in isotopic_composition[element_name])): raise ChemicalCompositionError( 'Please specify the isotopic states of all atoms of ' '%s or do not specify them at all.' % element_name) else: isotopic_composition[element_name][isotope_num] = ( composition[element]) # Calculate relative abundance. mass_data = kwargs.get('mass_data', nist_mass) num1, num2, denom = 1., 1., 1. for element_name, isotope_dict in isotopic_composition.items(): num1 *= math.factorial(sum(isotope_dict.values())) for isotope_num, isotope_content in isotope_dict.items(): denom *= math.factorial(isotope_content) if isotope_num: num2 *= (mass_data[element_name][isotope_num][1] ** isotope_content) return num2 * (num1 / denom)
def formula(composition): keys = sorted(composition) return ''.join("%s%d" % (k, composition[k]) for k in keys)