Source code for bioplus.tabfile

'''tools for dealing with files that have tabular data'''
import os
import itertools
import re
import gzip
import bz2

[docs]def merge_files(left, right, output, comments='left'):
    '''
    merge_files merges the tab-delimited files named left and right,
    which may have commmented lines. the output is directed to the
    file named output.

    There are few modes. If comments='left', comments in left are preserved.
    If comments='right', comments in right are preserved. If comments='none', 
    no comments are preserved. If comments='all', all comments in left and right
    are appended to the beginning of output, although they may previously have
    been contained within the data in left or right.

    Use merge_tab_files if you need to pass custom parameters to TabFile.
    '''
    file1 = TabFile(left)
    file2 = TabFile(right)
    return merge_tab_files(file1, file2, output, comments)

[docs]def merge_tab_files(file1, file2, output_filename, comments='left'):
    '''
    merge_tab_files merges the tab-delimited files represented by TabFile
    objects left and right, which may have commmented lines. the output is
    directed to the file named output.

    There are few modes. If comments='left', comments in left are preserved.

    If comments='right', comments in right are preserved.

    If comments='none', no comments are preserved.

    If comments='all', all comments in left and right are appended to the
    beginning of output, although they may previously have been
    contained within the data in left or right.

    See also merge_files
    '''
    if comments == 'all':
        file1.process_table(file1, lambda x: x + file2.read_row())
    elif comments == 'right':
        file2.process_table(file2, lambda x: x + file1.read_row())
    elif comments == 'none':
        outfile = TabFile(output_filename, write=True)
        for x in file1: outfile.write_row(x + file2.read_row())
    elif comments == 'all':
        # write comments first
        for x in file1.comment_line_contents(): outfile.write_row(x)
        for x in file2.comment_line_contents(): outfile.write_row(x)
        for x in file1: outfile.write_row( x+ file2.read_row())
    else:
        raise ValueError("comments must be one of 'left', 'right', 'none', 'all'")

[docs]class TabFileError(Exception):
    def __init__(self, *args):
        self.msg = ' '.join(args)
        return
    
    def __str__(self):
        return repr(self.msg)

[docs]class DetectCommentsError(TabFileError):
    def __init__(self, *args):
        super(DetectCommentsError, self).__init__(*args)
        return
                
[docs]class TabFile(object):
    '''Usage: f = TabFile('filename', convert_spaces=True,
                          comments=[],
                          column_names = False)

       TabFile is a class for handling tab-delimited files.

       Use convert_spaces=False if you're file is tab-delimited and you wish to
       preserve other whitespace.
       
       TabFile suports commented lines. Commented lines are not recognzied as
       part of the table

       By default, only lines beginning with '#' will be recognized as comments
       (not part of the table). You may specify a list of additional keywords
       using comments=['keyword1','keyword2',etc.]. All lines containing that
       keyword will be recognized as a comment. keywords may be regular
       expressions.
['(?i)track','(?i)browser']
       if column_names = True, the first properly formatted row will be treated as
       column names (i.e. ignored as a comment)
    '''

    def __init__(self, filename, mode='r', convert_spaces=True,
                 compression=None, comments=[], column_names=False):
        self._filename = filename
        self._file_extension = os.path.splitext(filename)[1].lstrip(os.extsep)
        self.mode = mode
        self.open(mode)
        self._previous_line = 0
        # _detect_comments sets _comment_line_numbers,
        #                       _comment_line_contents
        if 'r' in mode :
            try:
                self._detect_comments(comments, column_names)
            except:
                raise DetectCommentsError("Failed with comments=", *comments)
            
[docs]    def previous_line(self):
        '''returns the line number of the last line read'''
        return self._previous_line

    def _detect_comments(self, comments, column_names=False):
        self.close()
        self.open()
        if not os.path.isfile(self._filename):
            self._column_names = None
            self._comment_line_numbers = []
            self._comment_line_contents = []
            return

        number_keywords = len(comments)
        if number_keywords is not 0:
            searchables = [re.compile(keyword) for keyword in comments]

        if column_names:
            first_valid_line = True
        else:
            first_valid_line = False
            self._column_names = None

        line_number = 0
        comment_line_numbers = []
        comment_line_contents = []
        for line in self.__rawiter__():
            line_number += 1
            # blank lines are comments
            if line.lstrip() == '':
                comment_line_numbers.append(line_number)
                comment_line_contents.append(line)
            # lines starting with # are comments
            elif line.lstrip()[0] == '#':
                comment_line_numbers.append(line_number)
                comment_line_contents.append(line)
            elif number_keywords is not 0:
                for searchable in searchables:
                    if searchable.search(line) is not None:
                        comment_line_numbers.append(line_number)
                        comment_line_contents.append(line)
                        break
            elif first_valid_line:
                first_valid_line = False
                self._column_names = self._parse_line(line)
                comment_line_numbers.append(line_number)
                comment_line_contents.append(line)

        self._comment_line_numbers = comment_line_numbers
        self._comment_line_contents = comment_line_contents
        self.close()
        self.open()

    def _parse_line(self, input_string, convert_spaces=True):
        '''
        parses tab-separated elements in a string and returns a list.
        if convert_spaces=True, _parse_line will treat contiguous whitespace as
        a tab
        '''
        if convert_spaces:
            return input_string.split()
        else:
            return input_string.split('\t')
    
    def _make_line(self, input_array):
        '''takes a list or array and returns a tab-delimited string'''
        return '\t'.join([str(item) for item in input_array]) + os.linesep 

    def __enter__(self):
        return self
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        self._file_pointer.close()

[docs]    def open(self, mode=None):
        '''
        mode can be overriden here but defaults to TabFile.mode
        
        acts just like the built-in open method in the file class. use
        write=True to write to a file, otherwise it will be opened in read-only
        mode
        '''
        if mode is None: mode = self.mode
        self._file_pointer = open(self._filename, mode)

[docs]    def zap(self):
        '''forces status to not open. use with caution. this may destroy data'''
        self._file_pointer = None
        self._previous_line = 0

[docs]    def close(self):
        '''works just like the built-in close method in the file class'''
        if self._file_pointer is None:
            raise IOError('{!s} not open'.format(self._filename))
        else:
            self._file_pointer.close()
        self._previous_line = 0

[docs]    def read_table(self, override=False):
        '''returns the contents of a file as a list of rows (with each row as a list). will ignore any lines that begin with a "#" symbol and truncate any lines that contain a "#" symbol'''
        if self._file_pointer is None: self.open()
        elif not override:
            raise UserWarning('File already opened and must be closed and re-opened to read whole table.')
        return [ row for row in self ]

[docs]    def read_cols(self, L):
        '''read_cols behaves like read_col but instead of taking a single column number (numbering starts at 0), it takes a list of column numbers and returns a list of partial rows, where each partial row is a list with entries from the appropriate columns IN THE ORDER SPECIFIED.

tip: use range() to create lists of ordered integers. e.g., range(2,6)=[2,3,4,5]
'''
        if self._file_pointer is None: self.open()
        elif type(L)==list:
            raise UserWarning('File already opened and must be closed and \
re-opened to read all rows of the column.')
        else: raise TypeError('read_cols requires a list of column numbers (0-based) as input')

        my_array = []
        for row in self:
            my_array.append([])
            for n in L: my_array[-1].append(row[n])
        return my_array

[docs]    def read_col(self, n):
        '''
        returns a list of items in column n (numbering starts at 0) as items
        rather than lists. Unlike read_cols and read_table, elements of the
        read_col list are not lists, but strings
        '''
        if self._file_pointer is None: self.open()
        elif type(n) is int:
            raise UserWarning('File already opened and must be close and \
re-open to read all rows of the column')
        else:
            raise TypeError('read_col requires an integer \
(column number, 0-based) as input')

        return [ row[n] for row in self ]

[docs]    def read_first_col(self):
        '''return a list of items in the first column'''
        return self.read_col(0)

[docs]    def read_last_col(self):
        '''returns a list of items in the last column'''
        return self.read_col(-1)

[docs]    def readline(self):
        '''
        reads one line and returns it. uses a generator, and will raise
        StopIteration if it reaches the EOF.

        readline is deprecated. use x = self.__rawiter__() and x.next()
        '''
        lines = self.__rawiter__()
        return lines.next()

    def __rawiter__(self):
        '''like iter, but does not process rows'''
        if self._file_pointer is None: self.open()
        while True:
            self._previous_line += 1
            thisline = self._file_pointer.readline()
            if thisline=='':
                self.close()
                raise StopIteration
            else:
                yield thisline

    def __iter__(self):
        '''returns the next (or first) line that is not a comment, parsed'''
        for line in self.__rawiter__():
            if not self.previous_line() in self.comment_line_numbers():
                yield self._parse_line( line )

[docs]    def comment_line_contents(self):
        '''returns the list of lines that are comments'''
        return self._comment_line_contents

[docs]    def comment_line_numbers(self):
        '''returns the list of lines that are comments'''
        return self._comment_line_numbers

[docs]    def read_row(self):
        '''returns the next (or first) line that is not a comment, parsed. uses __iter__ as a generator, and simply returns the next value

        read_row is deprecated. Use x = self.__iter__() and x.next()
        '''
        gen = self.__iter__()
        return gen.next()
    
[docs]    def write(self, s):
        '''
        writes a string directly to a file,
        without modification (user must supply \\n if desired)
        '''
        if self._file_pointer is None:
            raise IOError('File not open for writing.')
        else:
            self._file_pointer.write(s)

[docs]    def write_rows(self, iterable):
        for row in iterable: self.write_row(row)
        return
    
[docs]    def write_row(self, row, separator='\t'):
        '''Writes a list to the file as a line (Tab-delimited). A different separator may also be specified with separator='x'. (Note: uses file writelines method)'''
        if self._file_pointer is None:
            raise IOError('File not open for writing.')
        else:
            self._file_pointer.writelines([self._make_line(row)])

[docs]    def write_table(self, table, separator='\t', override=False,
                    column_names=True):
        '''
        Writes a table to a file (tab-delimited).
        An alternative separator may be specified with separator='x'.
        if column_names = True, column_names will be included as the first line
        unless they do not exist.
        '''
        if self._file_pointer is None:
            raise IOError('File not open for writing.')
        elif not override:
            raise UserWarning("File already opened and must be closed and \
re-opened to write the table. You may override this behavior by passing \
override=True to the method')")
        try:
            # write column_names if applicable
            if column_names and not self._column_names==None:
                self._file_pointer.wiritelines(
                    self._make_line(self._column_names))
            write_list = itertools.imap(self._make_line, table)
            self._file_pointer.writelines(write_list)
        finally:
            self.close()

[docs]    def process_table(self, output_filename, fnc, column_names=None):
        '''
        process_tables2 writes a new file (name is specified with new_file), 
        which applies a user-defined function fnc to each row of data in the 
        original file. fnc should yield a row (i.e. a list, array, or something 
        else finitely iterable).

        process_tables2 preserves all commented lines and also the line which 
        column names, if applicable. The user may specify new column names 
        using column_names (a list or other finite iterable), or we will use 
        the old column_names, which might not preserve the column labels if 
        columns were inserted in the middle of the table
        '''
        if self._file_pointer is None: self.open()
        output_file = TabFile(output_filename, 'w')

        if not column_names == None:
            first_valid_line = True
            output_file.set_column_names(column_names)
        elif not self._column_names==None:
            first_valid_line = True
            output_file.set_column_names(column_names)
        else: first_valid_line = False

        # treat rawiter as a generator
        # we're going to thread it
        lines = self.__rawiter__()

        # make sure we set column_names correctly
        if first_valid_line:
            while True:
                line = lines.next()
                if self.previous_line() in self.comment_line_numbers():
                    output_file.write(line)
                else:
                    first_valid_line = False
                    output_file.write_row(column_names)
                    break

        # proceed with the remainder
        while True:
            line = lines.next()
            if self.previous_line() in self.comment_line_numbers():
                output_file.write(line)
            else:
                output_file.write_row(fnc(self._parse_line(line)))
        output_file.close()

[docs]    def get_column_names(self):
        '''returns the column names'''
        return self._column_names

[docs]    def column_dict(self):
        '''returns a dictionary which gives the index corresponding to a particular column name'''
        d = {}
        for i in range(len(self._column_names)):
            d[self._column_names[i]] = i
        return d

[docs]    def set_column_names(self, L):
        self._column_names = L

[docs]    def write_column_names(self):
        '''writes the stored column names'''
        self.write_row(self._column_names)

[docs]    def mergesort(self,f, n,numerical=False):
        raise NotImplementedError
#        if self._file_pointer is None:
#            raise UserWarning('File already opened and must be closed and re-opened to mergesort the file. This behavior may not be overridden.')
#        fcontents = self.read_table(f)
#        if numerical==True: fcontents = self._numerize(fcontents,n)
##        newcontents = merge.mergesort(fcontents,n)
##        writeFile(newcontents,f)

[docs]class BedRow(list):
    '''BEDrows are list, but you can access their chromStart(), chromEnd(), etc. use help for a full list. Uses the same conventions as http://genome.ucsc.edu/FAQ/FAQformat#format1. Note that only the first three entries (chrom, chromStart, chromEnd) are required, so the others may not be defined.'''
[docs]    def chrom(self):
        '''returns the name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671)'''
        return self[0]

[docs]    def chrom_start(self):
        '''returns the starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0'''
        return self[1]

[docs]    def chromStart(self):
        raise DeprecationWarning()
        return self.chrom_start()

[docs]    def chrom_end(self):
        '''returns the ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99'''
        return self[2]

[docs]    def chromEnd(self):
        raise DeprecationWarning()
        return self.chrom_end()

[docs]    def name(self):
        '''returns the name of the BED line. This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode.'''
        return self[3]

[docs]    def score(self):
        '''returns the score, a number between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray)'''
        return self[4]

[docs]    def strand(self):
        '''returns the strand, either '+' or '-' '''
        return self[5]

[docs]    def thickStart(self):
        '''returns the starting position at which the feature is drawn thickly (for example, the start codon in gene displays)'''
        return self[6]


[docs]    def thickEnd(self):
        '''returns the ending position at which the feature is drawn thickly (for example, the stop codon in gene displays).'''
        return self[7]

[docs]    def itemRgb(self):
        '''returns itemRgb, An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browse'''
        return self[8]


[docs]    def blockCount(self):
        '''return the number of blocks (exons) in the BED line'''
        return self[9]


[docs]    def blockSizes(self):
        '''returns a comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.'''
        return self[10]

[docs]    def blockStart(self):
        '''returns a comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.'''
        return self[11]

[docs]class GzipTabFile(TabFile):
    '''
    For gzip-compressed tab-delimited files
    
    See Tabfile for usage info
    '''
    def __init__(self, *args, **kwargs):
        super(TabFile, self).__init__(*args, **kwargs)
        
    def _open(self, mode=None):
        '''
        mode can be overriden here but defaults to TabFile.mode
        
        acts just like the built-in open method in the file class. use
        write=True to write to a file, otherwise it will be opened in read-only
        mode
        '''
        if mode is None: mode = self.mode
        if not 'b' in mode: mode += 'b'
        self._file_pointer = gzip.open(self._filename, mode)
    
[docs]class Bzip2TabFile(TabFile):
    '''
    For bzip2-compressed tab-delimited files
    
    See Tabfile for usage info
    '''
    def __init__(self, *args, **kwargs):
        super(TabFile, self).__init__(*args, **kwargs)
        
    def _open(self, mode=None):
        '''
        mode can be overriden here but defaults to TabFile.mode
        
        acts just like the built-in open method in the file class. use
        write=True to write to a file, otherwise it will be opened in read-only
        mode
        '''
        if mode is None: mode = self.mode
        if not 'b' in mode: mode += 'b'
        self._file_pointer = bz2.BZ2File(self._filename, mode)
                
[docs]class BedFile(TabFile):
    '''
    A BED file is a type of TabFile, but also defines a method for working with
    rows. rows are given as instances of BedRow, instead of lists. BEDrows 
    inerhit all list methods and therefore are compatible with write_row. 
    BedRow has additional methods for chrom, chromStart, chromEnd, etc. 
    For more info, see BedRow

    track, browser lines are treated as comments

    Assumes track row is a comment. Use getTrackLine to see the track info
    '''
    DEFAULT_BED_COMMENTS = ['(?i)track','(?i)browser']

    def __init__(self, f, additional_comments=[], **kwargs):
        comments = self.DEFAULT_BED_COMMENTS + additional_comments
        TabFile.__init__(self, f, comments=comments,
                         **kwargs)
        self._track_line = None
        # check again for track line, and save it #
        for x in self._comment_line_contents:
            expr = re.compile('(?i)track')
            if expr.search(x) is not None:
                self._track_line = x
                break

[docs]    def get_track_line(self):
        '''returns the current track line, if any'''
        return self._track_line

    def __iter__(self):
        '''returns the next (or first) line that is not a comment, parsed'''
        for line in self.__rawiter__():
            if not self.previous_line() in self.comment_line_numbers():
                yield BedRow(self._parse_line( line ))

[docs]class MacsRow(list):
    '''MACSrows are list, but you can access their features as follows:        

    chr() or chrom() -- chromosome name
    start() or chromStart() -- start position, start() is 1-based, chromStart is 0-based (BED)
    end() or chromEnd() -- end position, equivalent but chromEnd (BED) is defined as 0-based, exclusive
    length() -- length
    summit() -- position of summit
    tags() -- number of unique tags in the peak region
    pvalue() -- returns the -10*log10(pvalue)
    fold_enrichment -- returns the fold enrichment
    FDR -- returns the FDR in %
    '''
[docs]    def chr(self):
        '''returns the name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671)'''
        return self[0]

[docs]    def chrom(self):
        '''returns the name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671)'''
        return self[0]

[docs]    def start(self):
        '''returns the starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 1'''
        return int(self[1])

[docs]    def chrom_start(self):
        '''returns the starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0'''
        return int(self[1]) - 1

[docs]    def chromStart(self):
        raise DeprecationWarning()
        return self.chrom_start()

[docs]    def end(self):
        '''returns the end position, 1-based, inclusive'''
        return int(self[2])

[docs]    def chrom_end(self):
        '''returns the ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99'''
        return int(self[2])

[docs]    def chromEnd(self):
        raise DeprecationWarning()
        return self.chrom_end()

[docs]    def length(self):
        '''returns the length'''
        return int(self[3])
    
[docs]    def summit(self):
        '''returns the position of the summit'''
        return int(self[4]) - 1

[docs]    def tags(self, type_=int):
        '''returns the number of unique tags in the peak region'''
        return type_(self[5])
    
[docs]    def tagsv1(self):
        return self.tags(int)
    
[docs]    def tagsv2(self):
        return self.tags(lambda x: int(float(x)))

[docs]    def pvalue(self,type_=str):
        '''returns the -10*log10(pvalue). preserves the str to eliminate rounding error. use type=float to get a decimal value'''
        return type_(self[6])

[docs]    def fold_enrichment(self,type=str):
        '''returns the fold_enrichment vs control. preserves the str to eliminate rounding error. use type=float to get a decimal value'''
        return type(self[7])

[docs]    def FDR(self,type_=str):
        '''returns the FDR (%). preserves the str to eliminate rounding error. use type=float to get a decimal value'''
        return type_(self[8])    

[docs]class Macs2Row(MacsRow):
[docs]    def FDR():
        raise NotImplementedError
    
[docs]    def qvalue(self):
        return float(self[8])
    
[docs]    def name(self):
        return self[9]

[docs]class MacsFile(TabFile):
    '''A MACS file is a type of TabFile, but also defines a method for working with rows. rows are given as instances of MACSRow, instead of lists. MACSrows inerhit all list methods and therefore are compatible with write_row. MacsRow has additional methods for chrom, chromStart, chromEnd, etc. For more info, see MacsRow'''

    def __init__(self, f, convert_spaces=True, **kwargs):
        super(MacsFile, self).__init__(f, column_names=True, **kwargs)
        if self.column_dict().has_key('name'):
            # MACS2 file
            self.Row = Macs2Row
            self.MACS_version = 2
        else:
            self.Row = MacsRow
            self.MACS_version = 1

    def __iter__(self):
        '''returns the next (or first) line that is not a comment, parsed'''
        for line in self.__rawiter__():
            if not self.previous_line() in self.comment_line_numbers():
                yield self.Row(self._parse_line(line))

[docs]def shift_peaks(f, peak_lengths=2):
    '''
    shift_peaks takes a file f (foo.bed)
    and produces a new file (foo_shifted.bed)
    with all the sequences shifted (left) by peak_lengths times their length.
    If peak_lengths, is negative they are shifted to the right.
    comments are stripped
    '''
    x = BedFile(f)
    # CHANGE TO USE FILENAME CORRECTION SCHEME
    y = BedFile(f[0:-4]+'_shifted.bed')
    y.open(write=True)
    for peak in x:
        peak_start, peak_end = int(peak[1]), int(peak[2])
        new_peak = peak.copy()
        peak_shift = peak_lengths * (peak_end - peak_start)
        # update start, end
        new_peak[1] = peak_start - peak_shift
        new_peak[2] = peak_end - peak_shift
        if int(new_peak[1])<0: new_peak[1] = 0
        y.write_row(new_peak)
    return

def _quote(s):
    return ''.join(["'", s ,"'"])
Navigation

Source code for bioplus.tabfile

Quick search

Navigation