Pyteomics documentation v2.1.5

pyteomics.fasta

Contents

Source code for pyteomics.fasta

"""
fasta - manipulations with FASTA databases 
==========================================

FASTA is a simple file format for protein sequence databases. Please refer to
`the NCBI website <http://www.ncbi.nlm.nih.gov/blast/fasta.shtml>`_
for the most detailed information on the format.

Data manipulation
-----------------

  :py:func:`read` - iterate through entries in a FASTA database

  :py:func:`write` - write entries to a FASTA database

  :py:func:`parse` - parse a FASTA header

Decoy database generation
-------------------------

  :py:func:`decoy_sequence` - generate a decoy sequence from a given sequence

  :py:func:`decoy_db` - generate entries for a decoy database from a given FASTA
  database

  :py:func:`write_decoy_db` - generate a decoy database and print it to a file

Auxiliary
----------

:py:data:`std_parsers` - a dictionary with parsers for known FASTA header
formats.

-------------------------------------------------------------------------------
"""

#   Copyright 2012 Anton Goloborodko, Lev Levitsky
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import itertools
import random
from collections import namedtuple
import re
from .auxiliary import PyteomicsError, _file_obj, _file_reader

@_file_reader()
[docs]def read(source=None, ignore_comments=False, parser=None): """Read a FASTA file and return entries iteratively. Parameters ---------- source : str or file or None, optional A file object (or file name) with a FASTA database. Default is :py:const:`None`, which means read standard input. ignore_comments : bool, optional If True then ignore the second and subsequent lines of description. Default is :py:const:`False`. parser : function or None, optional Defines whether the fasta descriptions should be parsed. If it is a function, that function will be given the description string, and the returned value will be yielded together with the sequence. The :py:data:`std_parsers` dict has parsers for several formats. Hint: specify :py:func:`parse` as the parser to apply automatic format guessing. Default is :py:const:`None`, which means return the header "as is". Returns ------- out : iterator of tuples A named 2-tuple with FASTA header (str) and sequence (str). Attributes 'description' and 'sequence' are also provided. """ Protein = namedtuple('Protein', ('description', 'sequence')) f = parser or (lambda x: x) accumulated_strings = [] # Iterate through '>' after the file is over to retrieve the last entry. for string in itertools.chain(source, '>'): stripped_string = string.strip() # Skip empty lines. if not stripped_string: continue is_comment = (stripped_string.startswith('>') or stripped_string.startswith(';')) if is_comment: # If it is a continuing comment if len(accumulated_strings) == 1: if not ignore_comments: accumulated_strings[0] += (' '+stripped_string[1:]) else: continue elif accumulated_strings: description = accumulated_strings[0] sequence = ''.join(accumulated_strings[1:]) # Drop the translation stop sign. if sequence.endswith('*'): sequence = sequence[:-1] yield Protein(f(description), sequence) accumulated_strings = [stripped_string[1:], ] else: # accumulated_strings is empty; we're probably reading # the very first line of the file accumulated_strings.append(stripped_string[1:]) else: accumulated_strings.append(stripped_string)
[docs]def write(entries, output=None): """ Create a FASTA file with ``entries``. Parameters ---------- entries : iterable of (str, str) tuples An iterable of 2-tuples in the form (description, sequence). output : file-like or str, optional A file open for writing or a path to write to. If the file exists, it will be opened for appending. Default is :py:const:`None`, which means write to standard output. Returns ------- output_file : file object The file where the FASTA is written. """ with _file_obj(output, 'a') as foutput: for descr, seq in entries: # write the description foutput.write('>' + descr.replace('\n', '\n;') + '\n') # write the sequence; it should be interrupted with \n every 70 characters foutput.write(''.join([('%s\n' % seq[i:i+70]) for i in range(0, len(seq), 70)]) + '\n') return foutput.file
[docs]def decoy_sequence(sequence, mode): """ Create a decoy sequence out of a given sequence string. Parameters ---------- sequence : str The initial sequence string. mode : {'reverse', 'shuffle'} Type of decoy sequence. Returns ------- modified_sequence : str The modified sequence. """ if mode == 'reverse': return sequence[::-1] if mode == 'shuffle': modified_sequence = list(sequence) random.shuffle(modified_sequence) return ''.join(modified_sequence) raise PyteomicsError( """`fasta.decoy_sequence`: `mode` must be 'reverse' or 'shuffle', not {}""".format(mode))
@_file_reader()
[docs]def decoy_db(source=None, mode='reverse', prefix='DECOY_', decoy_only=False): """Iterate over sequences for a decoy database out of a given ``source``. If `output` is a path, the file will be open for appending, so no information will be lost if the file exists. Although, the user should be careful when providing open file streams as `source` and `output`. The reading and writing will start from the current position in the files, which is where the last I/O operation finished. One can use the :py:func:`file.seek` method to change it. Parameters ---------- source : file-like object or str or None, optional A path to a FASTA database or a file object itself. Default is :py:const:`None`, which means read standard input. mode : {'reverse', 'shuffle'}, optional Algorithm of decoy sequence generation. 'reverse' by default. prefix : str, optional A prefix to the protein descriptions of decoy entries. The default value is 'DECOY_'. decoy_only : bool, optional If set to :py:const:`True`, only the decoy entries will be written to `output`. If :py:const:`False`, the entries from `source` will be written first. :py:const:`False` by default. Returns ------- out : iterator An iterator over entries of the new database. """ # store the initial position pos = source.tell() if not decoy_only: for x in read(source): yield x # return to the initial position the source file to read again source.seek(pos) decoy_entries = ((prefix + descr, decoy_sequence(seq, mode)) for descr, seq in read(source)) for x in decoy_entries: yield x
[docs]def write_decoy_db(source=None, output=None, mode='reverse', prefix='DECOY_', decoy_only=False): """Generate a decoy database out of a given ``source`` and write to file. If `output` is a path, the file will be open for appending, so no information will be lost if the file exists. Although, the user should be careful when providing open file streams as `source` and `output`. The reading and writing will start from the current position in the files, which is where the last I/O operation finished. One can use the :py:func:`file.seek` method to change it. Parameters ---------- source : file-like object or str or None, optional A path to a FASTA database or a file object itself. Default is :py:const:`None`, which means read standard input. output : file-like object or str, optional A path to the output database or a file open for writing. Defaults to :py:const:`None`, the results go to the standard output. mode : {'reverse', 'shuffle'}, optional Algorithm of decoy sequence generation. 'reverse' by default. prefix : str, optional A prefix to the protein descriptions of decoy entries. The default value is "DECOY_" decoy_only : bool, optional If set to :py:const:`True`, only the decoy entries will be written to `output`. If :py:const:`False`, the entries from `source` will be written as well. :py:const:`False` by default. Returns ------- output : file A file object for the created file. """ with _file_obj(output, 'a') as fout, decoy_db( source, mode, prefix, decoy_only) as entries: write(entries, fout) return fout.file # auxiliary functions for parsing of FASTA headers
def _split_pairs(s): return dict(map(lambda x: x.strip(), x.split('=')) for x in re.split(' (?=\w+=)', s.strip())) def _intify(d, keys): for k in keys: if k in d: d[k] = int(d[k]) # definitions for custom parsers def _parse_uniprotkb(header): db, ID, entry, name, pairs, _ = re.match( r'^(\w+)\|([-\w]+)\|(\w+)\s+([^=]*\S)((\s+\w+=[^=]+(?!\w*=))+)\s*$', header).groups() info = {'db': db, 'id': ID, 'entry': entry, 'name': name} info.update(_split_pairs(pairs)) _intify(info, ('PE', 'SV')) return info def _parse_uniref(header): assert 'Tax' in header and 'RepID=' in header ID, cluster, pairs, _ = re.match( r'^(\S+)\s+([^=]*\S)((\s+\w+=[^=]+(?!\w*=))+)\s*$', header).groups() info = {'id': ID, 'cluster': cluster} info.update(_split_pairs(pairs)) _intify(info, ('n',)) return info def _parse_uniparc(header): ID, status = re.match(r'(\S+)\s+status=(\w+)\s*$', header).groups() return {'id': ID, 'status': status} def _parse_unimes(header): assert 'OS=' in header and 'SV=' in header ID, name, pairs, _ = re.match( r'^(\S+)\s+([^=]*\S)((\s+\w+=[^=]+(?!\w*=))+)\s*$', header).groups() info = {'id': ID, 'name': name} info.update(_split_pairs(pairs)) _intify(info, ('SV',)) return info std_parsers = {'uniprotkb': _parse_uniprotkb, 'uniref': _parse_uniref, 'uniparc': _parse_uniparc, 'unimes': _parse_unimes} """A dictionary with parsers for known FASTA header formats. For now, supported formats are those described at `UniProt help page <http://www.uniprot.org/help/fasta-headers>`_."""
[docs]def parse(header, flavour='auto', parsers=None): """Parse the FASTA header and return a nice dictionary. Parameters ---------- header : str FASTA header to parse flavour : str, optional Short name of the header format (case-insensitive). Valid values are :py:const:`'auto'` and keys of the `parsers` dict. Default is :py:const:`'auto'`, which means try all formats in turn and return the first result that can be obtained without an exception. parsers : dict, optional A dict where keys are format names (lowercased) and values are functions that take a header string and return the parsed header. Default is :py:const:`None`, which means use the default dictionary :py:data:`std_parsers`. Returns ------- out : dict A dictionary with the info from the header. The format depends on the flavour.""" # accept strings with and without leading '>' if header.startswith('>'): header = header[1:] # choose the format known = parsers or std_parsers if flavour.lower() == 'auto': for fl, parser in known.items(): try: return parser(header) except: pass raise PyteomicsError('Unknown FASTA header format.') elif flavour.lower() in known: try: return known[flavour.lower()](header) except Exception as e: raise PyteomicsError('Could not parse as {}. ' 'The error message was: {}'.format( flavour, e.message))

Contents