Source code for monty.io

# coding: utf-8
"""
Augments Python's suite of IO functions with useful transparent support for
compressed files.
"""

from __future__ import absolute_import

import os
import bz2
import gzip
import sys
import time
import errno
import mmap

from io import open
from monty.tempfile import ScratchDir as ScrDir
from monty.dev import deprecated

try:
    from pathlib import Path
except ImportError:
    try:
        from pathlib2 import Path
    except ImportError:
        Path = None

__author__ = 'Shyue Ping Ong'
__copyright__ = "Copyright 2014, The Materials Virtual Lab"
__version__ = '0.1'
__maintainer__ = 'Shyue Ping Ong'
__email__ = 'ongsp@ucsd.edu'
__date__ = '1/24/14'


PY_VERSION = sys.version_info


[docs]def zopen(filename, *args, **kwargs): """ This function wraps around the bz2, gzip and standard python's open function to deal intelligently with bzipped, gzipped or standard text files. Args: filename (str/Path): filename or pathlib.Path. \*args: Standard args for python open(..). E.g., 'r' for read, 'w' for write. \*\*kwargs: Standard kwargs for python open(..). Returns: File-like object. Supports with context. """ if Path is not None and isinstance(filename, Path): filename = str(filename) file_ext = filename.split(".")[-1].upper() if file_ext == "BZ2": if PY_VERSION[0] >= 3: return bz2.open(filename, *args, **kwargs) else: args = list(args) if len(args) > 0: args[0] = "".join([c for c in args[0] if c != "t"]) if "mode" in kwargs: kwargs["mode"] = "".join([c for c in kwargs["mode"] if c != "t"]) return bz2.BZ2File(filename, *args, **kwargs) elif file_ext in ("GZ", "Z"): return gzip.open(filename, *args, **kwargs) else: return open(filename, *args, **kwargs)
[docs]def reverse_readfile(filename): """ A much faster reverse read of file by using Python's mmap to generate a memory-mapped file. It is slower for very small files than reverse_readline, but at least 2x faster for large files (the primary use of such a method). Args: filename (str): Name of file to read. Yields: Lines from the file in reverse order. """ try: with zopen(filename, "rb") as f: if isinstance(f, gzip.GzipFile): for l in reversed(f.readlines()): yield l.decode("utf-8").rstrip() else: fm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) n = len(fm) while n > 0: i = fm.rfind(b"\n", 0, n) yield fm[i + 1:n].decode("utf-8").strip("\n") n = i except ValueError: return
[docs]def reverse_readline(m_file, blk_size=4096, max_mem=4000000): """ Generator method to read a file line-by-line, but backwards. This allows one to efficiently get data at the end of a file. Based on code by Peter Astrand <astrand@cendio.se>, using modifications by Raymond Hettinger and Kevin German. http://code.activestate.com/recipes/439045-read-a-text-file-backwards -yet-another-implementat/ Reads file forwards and reverses in memory for files smaller than the max_mem parameter, or for gzip files where reverse seeks are not supported. Files larger than max_mem are dynamically read backwards. Args: m_file (File): File stream to read (backwards) blk_size (int): The buffer size. Defaults to 4096. max_mem (int): The maximum amount of memory to involve in this operation. This is used to determine when to reverse a file in-memory versus seeking portions of a file. For bz2 files, this sets the maximum block size. Returns: Generator that returns lines from the file. Similar behavior to the file.readline() method, except the lines are returned from the back of the file. """ try: file_size = os.path.getsize(m_file.name) except AttributeError: # Bz2 files do not have name attribute. Just set file_size to above # max_mem for now. file_size = max_mem + 1 # If the file size is within our desired RAM use, just reverse it in memory # GZip files must use this method because there is no way to negative seek if file_size < max_mem or isinstance(m_file, gzip.GzipFile): for line in reversed(m_file.readlines()): yield line.rstrip() else: if isinstance(m_file, bz2.BZ2File): # for bz2 files, seeks are expensive. It is therefore in our best # interest to maximize the blk_size within limits of desired RAM # use. blk_size = min(max_mem, file_size) buf = "" m_file.seek(0, 2) lastchar = m_file.read(1).decode("utf-8") trailing_newline = (lastchar == "\n") while 1: newline_pos = buf.rfind("\n") pos = m_file.tell() if newline_pos != -1: # Found a newline line = buf[newline_pos + 1:] buf = buf[:newline_pos] if pos or newline_pos or trailing_newline: line += "\n" yield line elif pos: # Need to fill buffer toread = min(blk_size, pos) m_file.seek(pos - toread, 0) buf = m_file.read(toread).decode("utf-8") + buf m_file.seek(pos - toread, 0) if pos == toread: buf = "\n" + buf else: # Start-of-file return
@deprecated(ScrDir) class ScratchDir(ScrDir): pass
[docs]class FileLockException(Exception): """Exception raised by FileLock."""
[docs]class FileLock(object): """ A file locking mechanism that has context-manager support so you can use it in a with statement. This should be relatively cross-compatible as it doesn't rely on msvcrt or fcntl for the locking. Taken from http://www.evanfosmark.com/2009/01/cross-platform-file-locking -support-in-python/ """ Error = FileLockException def __init__(self, file_name, timeout=10, delay=.05): """ Prepare the file locker. Specify the file to lock and optionally the maximum timeout and the delay between each attempt to lock. Args: file_name: Name of file to lock. timeout: Maximum timeout for locking. Defaults to 10. delay: Delay between each attempt to lock. Defaults to 0.05. """ self.file_name = os.path.abspath(file_name) self.lockfile = os.path.abspath(file_name) + ".lock" self.timeout = float(timeout) self.delay = float(delay) self.is_locked = False if self.delay > self.timeout or self.delay <= 0 or self.timeout <= 0: raise ValueError("delay and timeout must be positive with delay " "<= timeout")
[docs] def acquire(self): """ Acquire the lock, if possible. If the lock is in use, it check again every `delay` seconds. It does this until it either gets the lock or exceeds `timeout` number of seconds, in which case it throws an exception. """ start_time = time.time() while True: try: self.fd = os.open(self.lockfile, os.O_CREAT | os.O_EXCL | os.O_RDWR) break except (OSError,) as e: if e.errno != errno.EEXIST: raise if (time.time() - start_time) >= self.timeout: raise FileLockException("%s: Timeout occured." % self.lockfile) time.sleep(self.delay) self.is_locked = True
[docs] def release(self): """ Get rid of the lock by deleting the lockfile. When working in a `with` statement, this gets automatically called at the end. """ if self.is_locked: os.close(self.fd) os.unlink(self.lockfile) self.is_locked = False
def __enter__(self): """ Activated when used in the with statement. Should automatically acquire a lock to be used in the with block. """ if not self.is_locked: self.acquire() return self def __exit__(self, type, value, traceback): """ Activated at the end of the with statement. It automatically releases the lock if it isn't locked. """ if self.is_locked: self.release() def __del__(self): """ Make sure that the FileLock instance doesn't leave a lockfile lying around. """ self.release()
[docs]def get_open_fds(): """ Return the number of open file descriptors for current process .. warning: will only work on UNIX-like OS-es. """ import subprocess import os pid = os.getpid() procs = subprocess.check_output(["lsof", '-w', '-Ff', "-p", str(pid)]) nprocs = len(filter(lambda s: s and s[0] == 'f' and s[1:].isdigit(), procs.split('\n'))) return nprocs