Source code for monty.io

# coding: utf-8
"""
Augments Python's suite of IO functions with useful transparent support for
compressed files.
"""

from __future__ import absolute_import

import os
import bz2
import gzip
import sys
import time
import errno
import mmap

from io import open
from monty.tempfile import ScratchDir as ScrDir
from monty.dev import deprecated

try:
    from pathlib import Path
except ImportError:
    try:
        from pathlib2 import Path
    except ImportError:
        Path = None

__author__ = 'Shyue Ping Ong'
__copyright__ = "Copyright 2014, The Materials Virtual Lab"
__version__ = '0.1'
__maintainer__ = 'Shyue Ping Ong'
__email__ = 'ongsp@ucsd.edu'
__date__ = '1/24/14'


PY_VERSION = sys.version_info


[docs]def zopen(filename, *args, **kwargs):
    """
    This function wraps around the bz2, gzip and standard python's open
    function to deal intelligently with bzipped, gzipped or standard text
    files.

    Args:
        filename (str/Path): filename or pathlib.Path.
        \*args: Standard args for python open(..). E.g., 'r' for read, 'w' for
            write.
        \*\*kwargs: Standard kwargs for python open(..).

    Returns:
        File-like object. Supports with context.
    """
    if Path is not None and isinstance(filename, Path):
        filename = str(filename)

    file_ext = filename.split(".")[-1].upper()
    if file_ext == "BZ2":
        if PY_VERSION[0] >= 3:
            return bz2.open(filename, *args, **kwargs)
        else:
            args = list(args)
            if len(args) > 0:
                args[0] = "".join([c for c in args[0] if c != "t"])
            if "mode" in kwargs:
                kwargs["mode"] = "".join([c for c in kwargs["mode"]
                                          if c != "t"])
            return bz2.BZ2File(filename, *args, **kwargs)
    elif file_ext in ("GZ", "Z"):
        return gzip.open(filename, *args, **kwargs)
    else:
        return open(filename, *args, **kwargs)


[docs]def reverse_readfile(filename):
    """
    A much faster reverse read of file by using Python's mmap to generate a
    memory-mapped file. It is slower for very small files than
    reverse_readline, but at least 2x faster for large files (the primary use
    of such a method).

    Args:
        filename (str):
            Name of file to read.

    Yields:
        Lines from the file in reverse order.
    """
    try:
        with zopen(filename, "rb") as f:
            if isinstance(f, gzip.GzipFile):
                for l in reversed(f.readlines()):
                    yield l.decode("utf-8").rstrip()
            else:
                fm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
                n = len(fm)
                while n > 0:
                    i = fm.rfind(b"\n", 0, n)
                    yield fm[i + 1:n].decode("utf-8").strip("\n")
                    n = i
    except ValueError:
        return


[docs]def reverse_readline(m_file, blk_size=4096, max_mem=4000000):
    """
    Generator method to read a file line-by-line, but backwards. This allows
    one to efficiently get data at the end of a file.

    Based on code by Peter Astrand <astrand@cendio.se>, using modifications by
    Raymond Hettinger and Kevin German.
    http://code.activestate.com/recipes/439045-read-a-text-file-backwards
    -yet-another-implementat/

    Reads file forwards and reverses in memory for files smaller than the
    max_mem parameter, or for gzip files where reverse seeks are not supported.

    Files larger than max_mem are dynamically read backwards.

    Args:
        m_file (File): File stream to read (backwards)
        blk_size (int): The buffer size. Defaults to 4096.
        max_mem (int): The maximum amount of memory to involve in this
            operation. This is used to determine when to reverse a file
            in-memory versus seeking portions of a file. For bz2 files,
            this sets the maximum block size.

    Returns:
        Generator that returns lines from the file. Similar behavior to the
        file.readline() method, except the lines are returned from the back
        of the file.
    """
    try:
        file_size = os.path.getsize(m_file.name)
    except AttributeError:
        # Bz2 files do not have name attribute. Just set file_size to above
        # max_mem for now.
        file_size = max_mem + 1

    # If the file size is within our desired RAM use, just reverse it in memory
    # GZip files must use this method because there is no way to negative seek
    if file_size < max_mem or isinstance(m_file, gzip.GzipFile):
        for line in reversed(m_file.readlines()):
            yield line.rstrip()
    else:
        if isinstance(m_file, bz2.BZ2File):
            # for bz2 files, seeks are expensive. It is therefore in our best
            # interest to maximize the blk_size within limits of desired RAM
            # use.
            blk_size = min(max_mem, file_size)

        buf = ""
        m_file.seek(0, 2)
        lastchar = m_file.read(1).decode("utf-8")
        trailing_newline = (lastchar == "\n")

        while 1:
            newline_pos = buf.rfind("\n")
            pos = m_file.tell()
            if newline_pos != -1:
                # Found a newline
                line = buf[newline_pos + 1:]
                buf = buf[:newline_pos]
                if pos or newline_pos or trailing_newline:
                    line += "\n"
                yield line
            elif pos:
                # Need to fill buffer
                toread = min(blk_size, pos)
                m_file.seek(pos - toread, 0)
                buf = m_file.read(toread).decode("utf-8") + buf
                m_file.seek(pos - toread, 0)
                if pos == toread:
                    buf = "\n" + buf
            else:
                # Start-of-file
                return


@deprecated(ScrDir)
class ScratchDir(ScrDir):
    pass


[docs]class FileLockException(Exception):
    """Exception raised by FileLock."""


[docs]class FileLock(object):
    """
    A file locking mechanism that has context-manager support so you can use
    it in a with statement. This should be relatively cross-compatible as it
    doesn't rely on msvcrt or fcntl for the locking.
    Taken from http://www.evanfosmark.com/2009/01/cross-platform-file-locking
    -support-in-python/
    """
    Error = FileLockException

    def __init__(self, file_name, timeout=10, delay=.05):
        """
        Prepare the file locker. Specify the file to lock and optionally
        the maximum timeout and the delay between each attempt to lock.

        Args:
            file_name: Name of file to lock.
            timeout: Maximum timeout for locking. Defaults to 10.
            delay: Delay between each attempt to lock. Defaults to 0.05.
        """
        self.file_name = os.path.abspath(file_name)
        self.lockfile = os.path.abspath(file_name) + ".lock"
        self.timeout = float(timeout)
        self.delay = float(delay)
        self.is_locked = False

        if self.delay > self.timeout or self.delay <= 0 or self.timeout <= 0:
            raise ValueError("delay and timeout must be positive with delay "
                             "<= timeout")

[docs]    def acquire(self):
        """
        Acquire the lock, if possible. If the lock is in use, it check again
        every `delay` seconds. It does this until it either gets the lock or
        exceeds `timeout` number of seconds, in which case it throws
        an exception.
        """
        start_time = time.time()
        while True:
            try:
                self.fd = os.open(self.lockfile,
                                  os.O_CREAT | os.O_EXCL | os.O_RDWR)
                break
            except (OSError,) as e:
                if e.errno != errno.EEXIST:
                    raise
                if (time.time() - start_time) >= self.timeout:
                    raise FileLockException("%s: Timeout occured." %
                                            self.lockfile)
                time.sleep(self.delay)

        self.is_locked = True

[docs]    def release(self):
        """ Get rid of the lock by deleting the lockfile.
            When working in a `with` statement, this gets automatically
            called at the end.
        """
        if self.is_locked:
            os.close(self.fd)
            os.unlink(self.lockfile)
            self.is_locked = False

    def __enter__(self):
        """
        Activated when used in the with statement. Should automatically
        acquire a lock to be used in the with block.
        """
        if not self.is_locked:
            self.acquire()
        return self

    def __exit__(self, type, value, traceback):
        """
        Activated at the end of the with statement. It automatically releases
        the lock if it isn't locked.
        """
        if self.is_locked:
            self.release()

    def __del__(self):
        """
        Make sure that the FileLock instance doesn't leave a lockfile
        lying around.
        """
        self.release()


[docs]def get_open_fds():
    """
    Return the number of open file descriptors for current process

    .. warning: will only work on UNIX-like OS-es.
    """
    import subprocess
    import os

    pid = os.getpid()
    procs = subprocess.check_output(["lsof", '-w', '-Ff', "-p", str(pid)])

    nprocs = len(filter(lambda s: s and s[0] == 'f' and s[1:].isdigit(), procs.split('\n')))

    return nprocs