Pyteomics documentation v2.1.5

pyteomics.pepxml

Contents

Source code for pyteomics.pepxml

"""
pepxml - pepXML file reader
===========================

Summary
-------

`pep.XML <http://tools.proteomecenter.org/wiki/index.php?title=Formats:pepXML>`_
was the first widely accepted format for proteomics search engines' output. 
Even though it is to be replaced by a community standard
`mzIdentML <http://www.psidev.info/index.php?q=node/454>`_, it is still used
commonly.

This module provides minimalistic infrastructure for access to data stored in
pep.XML files. The most important function is :py:func:`read`, which
reads peptide-spectrum matches and related information and saves them into
human-readable dicts. The rest of the data can be obtained via the
:py:func:`iterfind` function. This function relies on the terminology of the
underlying `lxml library <http://lxml.de/>`_.

Data access
-----------

  :py:func:`read` - iterate through peptide-spectrum matches in a pep.XML 
  file. Data for a single spectrum are converted to an easy-to-use dict. 

  :py:func:`roc_curve` - get a receiver operating characteristic (ROC) curve
  (min peptideprophet probability in a sample vs. false discovery rate) of
  peptideprophet analysis.

  :py:func:`version_info` - get version information about the pepXML file.

  :py:func:`iterfind` - iterate over elements in a pepXML file.

-------------------------------------------------------------------------------
"""

#   Copyright 2012 Anton Goloborodko, Lev Levitsky
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

from lxml import etree
from . import auxiliary as aux

def _get_info_smart(source, element, **kw):
    """Extract the info in a smart way depending on the element type.

    The element is first converted with :py:func:`_get_info`; the resulting
    mapping is then post-processed in place: selected attributes are
    converted from strings, score / parameter lists are flattened into
    dicts, protein information is grouped under ``'proteins'`` and the
    modifications are collected under ``'modifications'``.

    Parameters
    ----------
    source : object
        Passed through unchanged to :py:func:`_get_info`.
    element : object
        The element to extract. Its local name (as reported by
        ``aux._local_name``) selects the default value of ``recursive``.
    **kw
        Forwarded to :py:func:`_get_info`, except for ``recursive``.
        When ``recursive`` is absent or None it defaults to False for
        ``msms_pipeline_analysis`` and to True for any other element.

    Returns
    -------
    out : dict
        The post-processed info returned by :py:func:`_get_info`.
    """
    element_name = aux._local_name(element)
    kwargs = dict(kw)
    rec = kwargs.pop('recursive', None)
    if rec is None:
        # The root element is not expanded recursively by default.
        rec = element_name != 'msms_pipeline_analysis'
    info = _get_info(source, element, rec, **kwargs)

    # Attributes which contain unconverted values, mapped to the callable
    # that converts them.
    to_bool = lambda x: x.lower() in {'1', 'true'}
    to_float_list = lambda x: list(map(float, x[1:-1].split(',')))
    converters = {
        'calc_neutral_pep_mass': float,
        'massdiff': float,
        'start_scan': int,
        'end_scan': int,
        'index': int,
        'is_rejected': to_bool,
        'all_ntt_prob': to_float_list,
    }
    # Collect the keys first: the mapping is modified inside the loop.
    for key in [k for k in info if k in converters]:
        info[key] = converters[key](info.pop(key))

    # A list of single-item dicts is merged into one {name: value} dict.
    for key in ('search_score', 'parameter'):
        entries = info.get(key)
        if isinstance(entries, list) and all(
                isinstance(x, dict) and len(x) == 1 for x in entries):
            merged = {}
            for entry in entries:
                entry_name, value = entry.popitem()
                try:
                    merged[entry_name] = float(value)
                except ValueError:
                    # Not every value is numeric; keep such values as
                    # they are instead of failing on the whole element.
                    merged[entry_name] = value
            info[key] = merged

    # A single search result is merged into its parent.
    if 'search_result' in info and len(info['search_result']) == 1:
        info.update(info['search_result'][0])
        del info['search_result']

    # Protein-related keys of a hit become the first item of 'proteins';
    # alternative proteins follow it.
    if 'protein' in info and 'peptide' in info:
        primary = {'protein': info.pop('protein'),
                   'protein_descr': info.pop('protein_descr', None)}
        for extra in ('peptide_prev_aa', 'peptide_next_aa', 'protein_mw'):
            if extra in info:
                primary[extra] = info.pop(extra)
        primary['num_tol_term'] = info.pop('num_tol_term', 0)
        info['proteins'] = [primary]
        if 'alternative_protein' in info:
            info['proteins'].extend(info.pop('alternative_protein'))

    if 'peptide' in info and 'modified_peptide' not in info:
        info['modified_peptide'] = info['peptide']

    # Terminal modifications are stored next to the amino acid ones, at
    # position 0 (N-terminus) and len(peptide) + 1 (C-terminus).
    if 'mod_aminoacid_mass' in info:
        info['modifications'] = info.pop('mod_aminoacid_mass')
        if 'mod_nterm_mass' in info:
            info['modifications'].insert(0, {
                'position': 0,
                'mass': float(info.pop('mod_nterm_mass'))})
        if 'mod_cterm_mass' in info:
            info['modifications'].append({
                'position': 1 + len(info['peptide']),
                'mass': float(info.pop('mod_cterm_mass'))})

    # When the modified sequence is just the bare one, rebuild it by
    # inserting '[mass]' labels at the modified positions.
    if ('modified_peptide' in info
            and info['modified_peptide'] == info.get('peptide')):
        if not info.get('modifications'):
            info['modifications'] = []
        else:
            sequence = info['modified_peptide']
            # Insert from the end so that earlier positions stay valid.
            for mod in sorted(info['modifications'],
                              key=lambda m: m['position'],
                              reverse=True):
                pos = mod['position']
                label = '[{}]'.format(int(mod['mass']))
                sequence = sequence[:pos] + label + sequence[pos:]
            info['modified_peptide'] = sequence

    if 'search_hit' in info:
        info['search_hit'].sort(key=lambda hit: hit['hit_rank'])
    return info

@aux._file_reader('rb')
def read(source):
    """Iterate over the peptide-spectrum matches found in ``source``.

    Parameters
    ----------
    source : str or file
        A path to a target pepXML file or the file object itself.

    Returns
    -------
    out : iterator
        An iterator over the dicts with PSM properties.
    """
    # Every PSM is taken from a 'spectrum_query' element.
    psm_iterator = iterfind(source, 'spectrum_query')
    return psm_iterator
def roc_curve(source):
    """Parse source and return a ROC curve for peptideprophet analysis.

    Parameters
    ----------
    source : str or file
        A path to a target pepXML file or the file object itself.

    Returns
    -------
    out : list
        A list of ROC points, sorted by ascending min prob. Each point is
        a dict built from the attributes of a ``roc_data_point`` element,
        with all values converted to float.
    """
    parser = etree.XMLParser(remove_comments=True, ns_clean=True)
    tree = etree.parse(source, parser=parser)

    # Elements are matched by local name, i.e. regardless of namespace.
    # The name literal in the second step must be closed before ``and``;
    # otherwise the expression is not valid XPath.
    roc_path = (
        "/*[local-name()='msms_pipeline_analysis']"
        "/*[local-name()='analysis_summary' and @analysis='peptideprophet']"
        "/*[local-name()='peptideprophet_summary']"
        "/*[local-name()='roc_data_point']")

    points = [
        {key: float(value) for key, value in roc_element.attrib.items()}
        for roc_element in tree.xpath(roc_path)]
    return sorted(points, key=lambda point: point['min_prob'])
# The reader helpers below are produced by factory functions from
# ``auxiliary``; each factory is configured through a small "environment"
# dict.
_version_info_env = dict(format='pepXML', element='msms_pipeline_analysis')
version_info = aux._make_version_info(_version_info_env)

# Default schema information. The 'ints', 'floats' and 'bools' entries look
# like (element name, attribute name) pairs grouped by value type, and
# 'lists' like a set of element names -- NOTE(review): confirm the exact
# meaning against ``aux._make_schema_info``.
_schema_defaults = {
    'ints': {
        ('xpressratio_summary', 'xpress_light'),
        ('distribution_point', 'obs_5_distr'),
        ('distribution_point', 'obs_2_distr'),
        ('enzymatic_search_constraint', 'max_num_internal_cleavages'),
        ('asapratio_lc_heavypeak', 'right_valley'),
        ('libra_summary', 'output_type'),
        ('distribution_point', 'obs_7_distr'),
        ('spectrum_query', 'index'),
        ('data_filter', 'number'),
        ('roc_data_point', 'num_incorr'),
        ('search_hit', 'num_tol_term'),
        ('search_hit', 'num_missed_cleavages'),
        ('asapratio_lc_lightpeak', 'right_valley'),
        ('libra_summary', 'normalization'),
        ('specificity', 'min_spacing'),
        ('database_refresh_timestamp', 'min_num_enz_term'),
        ('enzymatic_search_constraint', 'min_number_termini'),
        ('xpressratio_result', 'light_lastscan'),
        ('distribution_point', 'obs_3_distr'),
        ('spectrum_query', 'end_scan'),
        ('analysis_result', 'id'),
        ('search_database', 'size_in_db_entries'),
        ('search_hit', 'hit_rank'),
        ('alternative_protein', 'num_tol_term'),
        ('search_hit', 'num_tot_proteins'),
        ('asapratio_summary', 'elution'),
        ('search_hit', 'tot_num_ions'),
        ('error_point', 'num_incorr'),
        ('mixture_model', 'precursor_ion_charge'),
        ('roc_data_point', 'num_corr'),
        ('search_hit', 'num_matched_ions'),
        ('dataset_derivation', 'generation_no'),
        ('xpressratio_result', 'heavy_firstscan'),
        ('xpressratio_result', 'heavy_lastscan'),
        ('error_point', 'num_corr'),
        ('spectrum_query', 'assumed_charge'),
        ('analysis_timestamp', 'id'),
        ('xpressratio_result', 'light_firstscan'),
        ('distribution_point', 'obs_4_distr'),
        ('asapratio_lc_heavypeak', 'left_valley'),
        ('fragment_masses', 'channel'),
        ('distribution_point', 'obs_6_distr'),
        ('affected_channel', 'channel'),
        ('search_result', 'search_id'),
        ('contributing_channel', 'channel'),
        ('asapratio_lc_lightpeak', 'left_valley'),
        ('asapratio_peptide_data', 'area_flag'),
        ('search_database', 'size_of_residues'),
        ('asapratio_peptide_data', 'cidIndex'),
        ('mixture_model', 'num_iterations'),
        ('mod_aminoacid_mass', 'position'),
        ('spectrum_query', 'start_scan'),
        ('asapratio_summary', 'area_flag'),
        ('mixture_model', 'tot_num_spectra'),
        ('search_summary', 'search_id'),
        ('xpressratio_timestamp', 'xpress_light'),
        ('distribution_point', 'obs_1_distr'),
        ('intensity', 'channel'),
        ('asapratio_contribution', 'charge'),
        ('libra_summary', 'centroiding_preference'),
    },
    'floats': {
        ('asapratio_contribution', 'error'),
        ('asapratio_lc_heavypeak', 'area_error'),
        ('modification_info', 'mod_nterm_mass'),
        ('distribution_point', 'model_4_neg_distr'),
        ('distribution_point', 'model_5_pos_distr'),
        ('spectrum_query', 'precursor_neutral_mass'),
        ('asapratio_lc_heavypeak', 'time_width'),
        ('xpressratio_summary', 'masstol'),
        ('affected_channel', 'correction'),
        ('distribution_point', 'model_7_neg_distr'),
        ('error_point', 'error'),
        ('intensity', 'target_mass'),
        ('roc_data_point', 'sensitivity'),
        ('distribution_point', 'model_4_pos_distr'),
        ('distribution_point', 'model_2_neg_distr'),
        ('distribution_point', 'model_3_pos_distr'),
        ('mixture_model', 'prior_probability'),
        ('roc_data_point', 'error'),
        ('intensity', 'normalized'),
        ('modification_info', 'mod_cterm_mass'),
        ('asapratio_lc_lightpeak', 'area_error'),
        ('distribution_point', 'fvalue'),
        ('distribution_point', 'model_1_neg_distr'),
        ('peptideprophet_summary', 'min_prob'),
        ('asapratio_result', 'mean'),
        ('point', 'pos_dens'),
        ('fragment_masses', 'mz'),
        ('mod_aminoacid_mass', 'mass'),
        ('distribution_point', 'model_6_neg_distr'),
        ('asapratio_lc_lightpeak', 'time_width'),
        ('asapratio_result', 'heavy2light_error'),
        ('peptideprophet_result', 'probability'),
        ('error_point', 'min_prob'),
        ('peptideprophet_summary', 'est_tot_num_correct'),
        ('roc_data_point', 'min_prob'),
        ('asapratio_result', 'heavy2light_mean'),
        ('distribution_point', 'model_5_neg_distr'),
        ('mixturemodel', 'neg_bandwidth'),
        ('asapratio_result', 'error'),
        ('xpressratio_result', 'light_mass'),
        ('point', 'neg_dens'),
        ('asapratio_lc_lightpeak', 'area'),
        ('distribution_point', 'model_1_pos_distr'),
        ('xpressratio_result', 'mass_tol'),
        ('mixturemodel', 'pos_bandwidth'),
        ('xpressratio_result', 'light_area'),
        ('asapratio_peptide_data', 'heavy_mass'),
        ('distribution_point', 'model_2_pos_distr'),
        ('search_hit', 'calc_neutral_pep_mass'),
        ('intensity', 'absolute'),
        ('asapratio_peptide_data', 'light_mass'),
        ('distribution_point', 'model_3_neg_distr'),
        ('aminoacid_modification', 'mass'),
        ('asapratio_lc_heavypeak', 'time'),
        ('asapratio_lc_lightpeak', 'time'),
        ('asapratio_lc_lightpeak', 'background'),
        ('mixture_model', 'est_tot_correct'),
        ('point', 'value'),
        ('asapratio_lc_heavypeak', 'background'),
        ('terminal_modification', 'mass'),
        ('fragment_masses', 'offset'),
        ('xpressratio_result', 'heavy_mass'),
        ('search_hit', 'protein_mw'),
        ('libra_summary', 'mass_tolerance'),
        ('spectrum_query', 'retention_time_sec'),
        ('distribution_point', 'model_7_pos_distr'),
        ('asapratio_lc_heavypeak', 'area'),
        ('alternative_protein', 'protein_mw'),
        ('asapratio_contribution', 'ratio'),
        ('xpressratio_result', 'heavy_area'),
        ('distribution_point', 'model_6_pos_distr'),
    },
    'bools': {
        ('sample_enzyme', 'independent'),
        ('intensity', 'reject'),
        ('libra_result', 'is_rejected'),
    },
    'intlists': set(),
    'floatlists': set(),
    'charlists': set(),
    'lists': {
        'point',
        'aminoacid_modification',
        'msms_run_summary',
        'mixturemodel',
        'search_hit',
        'mixturemodel_distribution',
        'sequence_search_constraint',
        'specificity',
        'alternative_protein',
        'analysis_result',
        'data_filter',
        'fragment_masses',
        'error_point',
        'parameter',
        'spectrum_query',
        'search_result',
        'affected_channel',
        'analysis_summary',
        'roc_data_point',
        'distribution_point',
        'search_summary',
        'mod_aminoacid_mass',
        'search_score',
        'intensity',
        'analysis_timestamp',
        'mixture_model',
        'terminal_modification',
        'contributing_channel',
        'inputfile',
    },
}

_schema_env = dict(
    format='pepXML',
    version_info=version_info,
    default_version='1.15',
    defaults=_schema_defaults,
)
_schema_info = aux._make_schema_info(_schema_env)

_getinfo_env = dict(
    keys={'search_score_summary', 'modification_info'},
    schema_info=_schema_info,
    get_info_smart=_get_info_smart,
)
_get_info = aux._make_get_info(_getinfo_env)

_iterfind_env = dict(get_info_smart=_get_info_smart)
iterfind = aux._make_iterfind(_iterfind_env)

Contents