Source code for savReaderWriter.savReaderNp

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function, division
from pprint import pprint as print
import os
import re
import datetime
import struct
from math import ceil
from ctypes import *
from functools import wraps, partial
from itertools import chain, islice
from bisect import bisect

try:
    import numpy as np
except ImportError:
    print("WARNING: numpy not found, cannot use savReaderNp")
    class np: nan = float("nan")

from savReaderWriter import *
from error import *
from helpers import *
from py3k import *

# TODO:
# pytables integration
# numba.jit
# function to easily read mmapped array back in


[docs]class SavReaderNp(SavReader):

    """
    Read SPSS .sav file data into a numpy array (either in-memory or mmap)
    
    Parameters
    ----------
    savFileName : str
        The file name of the spss data file
    recodeSysmisTo : value
        Indicates to which value missing values should be recoded
    rawMode : bool
        Set to ``True`` to get faster processing speeds. ``rawMode=False``
        indicates:
 
        * that trailing blanks will be stripped off of string values
        * that datetime variables (if present) will be converted into
          ``datetime.datetime`` objects, 
        * that SPSS `$sysmis` values will be converted into 
          `recodeSysmisTo` (default ``np.nan``, except for datetimes). 

    ioUtf8 : bool
        Indicates the mode in which text communicated to or from 
        the I/O Module will be. Valid values are True (UTF-8 mode aka 
        Unicode mode) and False (Codepage mode). Cf. `SET UNICODE=ON/OFF`
    ioLocale : locale str
        indicates the locale of the I/O module. Cf. `SET LOCALE`. 
        (default = None, which corresponds to `locale.setlocale(locale.LC_ALL, "")`).
        For example, `en_US.UTF-8`.

    Examples
    --------
    Typical use::

        # memmapped array, omit filename to use in-memory array 
        reader_np = SavReaderNp("Employee data.sav")
        array = reader_np.to_structured_array("/tmp/test.dat") 
        reader_np.close()

    Note. The sav-to-array conversion is MUCH faster when uncompressed .sav 
    files are used. These are created with the SPSS command::

        SAVE OUTFILE = 'some_file.sav' /UNCOMPRESSED.
    This is NOT the default in SPSS. 

    See also
    --------
    savReaderWriter.SavWriter : use `_uncompressed.sav` savFileName 
        suffix to write uncompressed files"""

    def __init__(self, savFileName, recodeSysmisTo=np.nan, rawMode=False, 
                 ioUtf8=False, ioLocale=None):
        """Open `savFileName` through the parent reader and set up the case
        buffer, the record unpacker and the C function wrappers that the
        array conversions rely on"""
        super(SavReaderNp, self).__init__(
            savFileName, ioUtf8=ioUtf8, ioLocale=ioLocale)

        # keep the constructor arguments around
        self.savFileName = savFileName
        self.recodeSysmisTo = recodeSysmisTo
        self.rawMode = rawMode
        self.ioUtf8 = ioUtf8
        self.ioLocale = ioLocale

        # the buffer must exist before _init_funcs, which takes its size
        self.caseBuffer = self.getCaseBuffer()
        record_struct = self.getStruct(self.varTypes, self.varNames)
        self.unpack = record_struct.unpack_from
        self._init_funcs()
        self.gregorianEpoch = datetime.datetime(1582, 10, 14, 0, 0, 0)
        self.do_convert_datetimes = True
        self.nrows, self.ncols = self.shape

        if not self._is_uncompressed:
            return

        # uncompressed files are read straight from disk: swap in the
        # faster implementations on this instance
        self.sav = open(self.savFileName, "rb")
        # NOTE(review): `for ... in reader` and `iter(reader)` look __iter__
        # up on the class, so this binding is only seen by explicit
        # `reader.__iter__()` calls -- confirm whether that is intended
        self.__iter__ = self._uncompressed_iter
        self.to_ndarray = self._uncompressed_to_ndarray
        self.to_structured_array = self._uncompressed_to_structured_array

    def _items(self, start, stop, step):
        """Helper function for __getitem__: read the cases selected by a slice

        Parameters
        ----------
        start, stop, step : int
            Case numbers, e.g. as returned by `slice.indices`

        Yields
        ------
        record : numpy.ndarray
            The case, decoded from the case buffer with `struct_dtype`
        """
        for case in xrange(start, stop, step):
            self.seekNextCase(self.fh, case)
            self.wholeCaseIn(self.fh, byref(self.caseBuffer))
            # frombuffer replaces the deprecated binary mode of fromstring.
            # It returns a view, and caseBuffer is overwritten by the next
            # read, so hand out a copy.
            yield np.frombuffer(self.caseBuffer, self.struct_dtype).copy()

[docs]    def convert_datetimes(func):
        """Decorator to convert all the SPSS datetimes into datetime.datetime
        values. Missing datetimes are converted into the value 
        `datetime.datetime(1, 1, 1, 0, 0, 0)`"""
        @wraps(func)
        def _convert_datetimes(self, *args):
            #print("@convert_datetimes called by: %s" % func.__name__)
            array = func(self, *args)
            if (self.rawMode or not self.datetimevars or not \
                self.do_convert_datetimes):
                return array

            # calculate count so fromiter can pre-allocate
            count = self.nrows if not args else -1
            if len(args) == 1 and isinstance(args[0], slice):
                start, stop, step = args[0].indices(self.nrows)
                count = (stop - start) // step

            # now fill the array with datetimes
            dt_array = array.astype(self.datetime_dtype)            
            for varName in self.uvarNames:
                if not varName in self.datetimevars:
                    continue
                datetimes = (self.spss2datetimeDate(dt) for dt in array[varName])
                dt_array[varName] = np.fromiter(datetimes, "datetime64[us]", count)
            return dt_array
        return _convert_datetimes

[docs]    def convert_missings(func):
        """Decorator to recode numerical missing values into `recodeSysmisTo` 
        (default: `np.nan`), unless they are datetimes"""
        @wraps(func) 
        def _convert_missings(self, *args):
            array = func(self, *args) 
            cutoff = -sys.float_info.max
            sysmis = self.recodeSysmisTo
            is_to_structured_array = func.__name__.endswith('to_structured_array')
            if self.rawMode:
                return array
            elif self.is_homogeneous and not is_to_structured_array:
                array[:] = np.where(array <= cutoff, sysmis, array)
            else:
                for v in self.uvarNames:
                    if v in self.datetimevars or self.uvarTypes[v]:
                        continue
                    array[v] = np.where(array[v] <= cutoff, sysmis, array[v])

            if hasattr(array, "flush"):  # memmapped
                array.flush()

            return array
        return _convert_missings

    @convert_datetimes
[docs]    def __getitem__(self, key):
        """x.__getitem__(y) <==> x[y], where y may be int or slice

        Parameters
        ----------
        key : int, slice

        Returns
        -------
        record : numpy.ndarray 

        Raises
        -------
        IndexError, TypeError
        """
        is_slice = isinstance(key, slice)
        is_index = isinstance(key, int)
        
        if is_slice:
            start, stop, step = key.indices(self.nrows)
            records = (item for item in self._items(start, stop, step))
            count = (stop - start) // step
            record = np.fromiter(iter(records), self.struct_dtype, count)
        elif is_index:
            if abs(key) > self.nrows - 1:
                raise IndexError("index out of bounds")
            key = self.nrows + key if key < 0 else key
            self.seekNextCase(self.fh, key)
            self.wholeCaseIn(self.fh, self.caseBuffer)
            record = np.fromstring(self.caseBuffer, self.struct_dtype)
        else:
            raise TypeError("slice or int required")

        # rewind for possible subsequent call to __iter__
        self.seekNextCase(self.fh, 0)
        return record

[docs]    def __iter__(self):
        """x.__iter__() <==> iter(x). Yields records as a tuple.
        If `rawMode=True`, trailing spaces of strings are not removed
        and SPSS dates are not converted into `datetime` dates

        Returns
        -------
        record : tuple 

        Raises
        -------
        SPSSIOError
        """
        varNames = self.uvarNames
        varTypes = self.uvarTypes
        datetimevars = self.datetimevars
        shortcut = self.rawMode or not self.do_convert_datetimes or \
                   not datetimevars
        for row in xrange(self.nrows):
            self.wholeCaseIn(self.fh, self.caseBuffer)
            record = self.unpack(self.caseBuffer)
            if shortcut:
                yield record
                continue
            yield tuple([self.spss2datetimeDate(value) if v in datetimevars else
                         value.rstrip() if varTypes[v] else value for value, v
                         in izip(record, varNames)])
      
    def _init_funcs(self):
        """Helper that wraps the two SPSS I/O module functions used for 
        random access and whole-case reads, and declares their argtypes.
        Also stores the size of the case buffer as `record_size`."""
        # NOTE(review): ctypes documents the result-checking hook as 
        # `errcheck`; with the `_errcheck` spelling used below the attribute
        # is merely stored on the function object -- confirm whether the 
        # check is meant to stay switched off
        seek_case = self.spssio.spssSeekNextCase
        seek_case.argtypes = [c_int, c_long]
        seek_case._errcheck = self._errcheck
        self.seekNextCase = seek_case

        self.record_size = sizeof(self.caseBuffer)
        case_buffer_type = c_char * self.record_size
        read_case = self.spssio.spssWholeCaseIn
        read_case.argtypes = [c_int, POINTER(case_buffer_type)]
        read_case._errcheck = self._errcheck
        self.wholeCaseIn = read_case

    def _errcheck(self, retcode, func, arguments):
        """Raises SPSSIOError if a C function of the SPSS I/O module 
        returned a code > 0; does nothing for other return codes"""
        if retcode <= 0:
            return
        # fall back to the bare number for codes without a description
        description = retcodes.get(retcode, retcode)
        details = (func.__name__, arguments, description)
        msg = "function %r with arguments %r throws error: %s" % details
        raise SPSSIOError(msg, retcode)

    @memoized_property
    def uvarNames(self):
        """List of variable names as unicode strings. In `ioUtf8` mode 
        `varNames` is returned unchanged, otherwise every name is decoded 
        with the file encoding"""
        names = self.varNames
        if not self.ioUtf8:
            names = [name.decode(self.fileEncoding) for name in names]
        return names

    @memoized_property
    def uvarTypes(self): 
        """Dictionary that maps variable names (unicode strings) to variable
        types (int). Type 0 stands for a numerical variable, any other value
        is the length in bytes of a string variable. In `ioUtf8` mode 
        `varTypes` is returned unchanged"""
        if self.ioUtf8:
            return self.varTypes
        decoded = {}
        for name, varType in self.varTypes.items():
            decoded[name.decode(self.fileEncoding)] = varType
        return decoded

    @memoized_property
    def uformats(self):
        """Dictionary that maps variable names to SPSS formats, both as 
        unicode strings. In `ioUtf8` mode `formats` is returned unchanged"""
        if self.ioUtf8:
            return self.formats
        encoding = self.fileEncoding
        decoded = {}
        for name, fmt in self.formats.items():
            decoded[name.decode(encoding)] = fmt.decode(encoding)
        return decoded

    @memoized_property
    def datetimevars(self):
        """Returns a list of the datetime variable names (as unicode strings)
        in the dataset, if any. A variable counts as a datetime when its 
        SPSS format contains "date" or "time" (case-insensitive)"""
        names = []
        for name in self.uvarNames:
            if re.search("date|time", self.uformats[name], re.I):
                names.append(name)
        return names

    @memoized_property
    def _titles(self):
        """Helper function that uses varLabels to get the titles for a dtype.
        A variable without a label gets the title `col_NNN`, where NNN is 
        its zero-padded column number. All titles are unicode strings"""
        titles = []
        for col, v in enumerate(self.varNames):
            title = self.varLabels[v]
            if not title:
                title = bytez("col_%03d" % col)
            if not isinstance(title, unicode):
                title = title.decode(self.fileEncoding)
            titles.append(title)
        return titles

    @memoized_property
    def is_homogeneous(self):
        """Boolean that tells whether the dataset consists of numerical
        variables only. Datetimes do not count as numerical, unless 
        `rawMode=True`. String variables always make a dataset 
        heterogeneous, even if they all have the same length"""
        # a variable type of 0 means numerical, so any non-zero maximum
        # reveals a string variable
        widest = max(self.varTypes.values())
        if widest:
            return False
        if self.rawMode:
            return True
        return not self.datetimevars

    @memoized_property
    def struct_dtype(self):
        """Get the dtype that is used to unpack the binary record

        Returns
        -------
        struct dtype : numpy.dtype (complex dtype if heterogeneous data, 
            simple dtype otherwise). A complex dtype uses `varNames` as 
            names and `varLabels` (if any) as titles (fields). Numerical
            values are doubles in the byte order given by `byteorder`;
            string values take their length rounded up to a multiple of
            8 bytes."""
        byteorder = u"<" if self.byteorder == u"little" else u">"
        double = byteorder + u"d"
        if self.is_homogeneous:
            return np.dtype(double)

        # round a string length up to the next multiple of 8 bytes
        fmt8 = lambda varType: int(ceil(varType / 8.) * 8)
        varTypes = [self.varTypes[varName] for varName in self.varNames]
        # "S" rather than its alias "a", which NumPy 2.0 deprecates
        formats = [u"S%d" % fmt8(t) if t else double for t in varTypes]
        obj = dict(names=self.uvarNames, formats=formats, titles=self._titles)
        return np.dtype(obj)

    @memoized_property
    def trunc_dtype(self):
        """Returns the numpy dtype using the SPSS display formats

        The following spss-format to numpy-dtype conversions are made, 
        based on the first number in the display format (the width):

        +------------+------------------+
        | spss       | numpy            |
        +============+==================+
        | `F1`       | `float16` (`f2`) |
        +------------+------------------+
        | `F2`-`F4`  | `float32` (`f4`) |
        +------------+------------------+
        | >= `F5`    | `float64` (`f8`) |
        +------------+------------------+
        | (datetime) | `float64` (`f8`)*|
        +------------+------------------+
        | `A<n>`     | `S<n>`           |
        +------------+------------------+

        *) Subsequently converted to `datetime.datetime` unless 
        `rawMode=True`. Examples of SPSS datetime display formats are `SDATE`,
        `EDATE`, `ADATE`, `JDATE` and `TIME`. 

        Note that all numerical values are stored in SPSS files as double
        precision floats. The SPSS display formats are used to create a more
        compact dtype. Datetime formats are never shrunk to a more compact 
        format. In the table above, only F and A formats are displayed, but
        other numerical (e.g. `DOLLAR`) or string (`AHEX`) are treated the 
        same way, e.g. `DOLLAR5.2` will become `float64`.

        Returns
        -------
        truncated dtype : numpy.dtype (complex dtype)

        See also
        --------
        :ref:`formats` : overview of SPSS display formats 
        :ref:`dateformats` : overview of SPSS datetime formats 
        """
        # width < 2 -> f2, 2 <= width < 5 -> f4, everything wider -> f8
        dst_fmts = [u"f2", u"f4", u"f8", u"f8"]
        get_dtype = lambda width: dst_fmts[bisect([2, 5, 8], width)]

        formats = []
        for v in self.uvarNames:
            # the first number in the display format is the width;
            # raw string so that \d is not an invalid string escape
            width = int(re.search(r"\d+", self.uformats[v]).group(0))
            if self.uvarTypes[v]:
                # "S" rather than its alias "a", which NumPy 2.0 deprecates
                formats.append(u"S%s" % width)
            elif v in self.datetimevars:
                formats.append(u"f8")
            else:
                formats.append(get_dtype(width))
        obj = dict(names=self.uvarNames, formats=formats, titles=self._titles)
        return np.dtype(obj)

    @memoized_property
    def datetime_dtype(self):
        """Return `trunc_dtype` with the fields of the datetime variables 
        replaced by `datetime64[us]`, in order to accommodate the 
        `datetime.datetime` values that are stored as floats in the SPSS 
        file. Without datetime variables this is `trunc_dtype` itself.

        Returns
        -------
        datetime dtype :  numpy.dtype (complex dtype) 
        """
        dt_names = self.datetimevars
        if not dt_names:
            return self.trunc_dtype

        formats = []
        for (title, name), fmt in self.trunc_dtype.descr:
            if name in dt_names:
                fmt = "datetime64[us]"
            formats.append(fmt)
        return np.dtype(dict(names=self.uvarNames, formats=formats,
                             titles=self._titles))

    @memoize
    def spss2datetimeDate(self, spssDateValue):
        """Convert an SPSS datetime into a ``datetime.datetime`` object, by
        adding `spssDateValue` seconds to `gregorianEpoch`

        Parameters
        ----------
        spssDateValue : float, int

        Returns
        -------
        datetime : datetime.datetime; errors and missings are returned as
        ``datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)``

        See also
        --------
        savReaderWriter.SavReader.spss2strDate : convert SPSS datetime into
            a datetime string
        :ref:`dateformats` : overview of SPSS datetime formats 

        """
        try:
            elapsed = datetime.timedelta(seconds=spssDateValue)
            return self.gregorianEpoch + elapsed
        except (OverflowError, TypeError, ValueError):
            # values that cannot be expressed as a datetime
            return datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)


    # ---- functions that deal with uncompressed .sav files ----
    @memoized_property    
    def _is_uncompressed(self):
        """True if `fileCompression` reports an uncompressed .sav file, 
        False for any form of compression (standard or zlib)."""
        compression = self.fileCompression
        return compression == b"uncompressed"

    def _uncompressed_iter(self):
        """Faster version of __iter__ that can only be used with 
        uncompressed .sav files: reads `record_size` bytes per case 
        directly from the file and yields the unpacked tuple"""
        data_start = self._offset
        self.sav.seek(data_start)
        cases_left = self.nrows
        while cases_left > 0:
            raw_record = self.sav.read(self.record_size)
            yield self.unpack(raw_record)
            cases_left -= 1

    @property
    def _offset(self):
        """Returns the position where the case data start: just past the 
        type 999 record, which indicates the end of the metadata

        The file is searched from the beginning for the first occurrence 
        of the 4-byte integer 999, at any byte offset. The 4 bytes that
        follow it are skipped as well. On return the file position of 
        `self.sav` is at the returned offset.

        Raises
        ------
        ValueError : if the file does not contain the integer 999
        """
        # decode with the byte order of the file (as struct_dtype does),
        # not that of the machine
        int_format = "<i" if self.byteorder == "little" else ">i"
        marker = struct.pack(int_format, 999)
        overlap = len(marker) - 1
        chunk_size = 2 ** 16

        self.sav.seek(0)
        window = b""
        window_start = 0  # file position of window[0]
        while True:
            chunk = self.sav.read(chunk_size)
            if not chunk:
                # the byte-by-byte scan this replaces never returned here
                msg = "No type 999 record found in %r" % self.savFileName
                raise ValueError(msg)
            window += chunk
            found = window.find(marker)
            if found >= 0:
                # skip the record type itself plus the 4 bytes after it
                data_start = window_start + found + 2 * len(marker)
                self.sav.seek(data_start)
                return data_start
            # keep the tail, the marker may straddle two chunks
            keep = min(overlap, len(window))
            window_start += len(window) - keep
            window = window[len(window) - keep:]

    @convert_datetimes
    @convert_missings
    def _uncompressed_to_structured_array(self, filename=None):
        """Read an uncompressed .sav file and return as a structured array

        Parameters
        ----------
        filename : str, optional 
                   The filename for the memory mapped array. If omitted, 
                   the array will be in-memory

        Returns
        -------
        array : structured array of dtype `trunc_dtype`

        Raises
        ------
        ValueError : if the file is compressed
        """
        if not self._is_uncompressed:
            raise ValueError("Only uncompressed files can be used")

        # The records must be read with their on-disk layout (doubles, and
        # strings padded to a multiple of 8 bytes). trunc_dtype has a 
        # different item size as soon as a column is shrunk, so it can only
        # be the target of a cast.
        disk_dtype = self.struct_dtype
        if not disk_dtype.names:
            # homogeneous data: struct_dtype is a plain double, spell out
            # one double per variable
            formats = [disk_dtype.str] * len(self.uvarNames)
            disk_dtype = np.dtype(dict(names=self.uvarNames, formats=formats))

        self.sav.seek(self._offset)
        records = np.fromfile(self.sav, disk_dtype, self.nrows)
        if filename:
            array = np.memmap(filename, self.trunc_dtype, 'w+', shape=self.nrows)
            array[:] = records.astype(self.trunc_dtype)
        else:
            array = records.astype(self.trunc_dtype)
        return array

    @convert_missings
    def _uncompressed_to_ndarray(self, filename=None):
        """Read an uncompressed .sav file and return as an ndarray

        Parameters
        ----------
        filename : str, optional 
                   The filename for the memory mapped array. If omitted, 
                   the array will be in-memory

        Returns
        -------
        array : float array with the same shape as the dataset

        Raises
        ------
        ValueError : if the file is compressed or not homogeneous
        """
        if not self._is_uncompressed:
            raise ValueError("Only uncompressed files can be used")
        if not self.is_homogeneous:
            raise ValueError("Need only floats and no datetimes in dataset")
        self.sav.seek(self._offset)
        count = int(np.prod(self.shape))
        # for homogeneous data struct_dtype is a double in the byte order of
        # the file, which need not be the byte order of this machine
        values = np.fromfile(self.sav, self.struct_dtype, count)
        if filename:
            array = np.memmap(filename, float, 'w+', shape=count)
            array[:] = values
        else:
            array = values.astype(float, copy=False)
        return array.reshape(self.shape)
    # ------------------------------------------------------------------------ 

    @convert_datetimes
    @convert_missings
    def to_structured_array(self, filename=None):
        """Return the data in <savFileName> as a structured array, optionally
        using <filename> as a memmapped file.

        Parameters
        ----------
        filename : str, optional 
                   The filename for the memory mapped array. If omitted, 
                   the array will be in-memory

        Returns
        -------
        array : numpy.ndarray (if `filename=None`) or numpy.core.memmap.memmap
                The array has a complex dtype, i.e. is a structured array. If
                defined, `varLabels` may also be used to retrieve columns

        Examples
        --------
        For example::

            reader_np = SavReaderNp("./test_data/Employee data.sav")
            array = reader_np.to_structured_array()
            mean_salary = array["salary"].mean().round(2)
            mean_salary == array["Current Salary"].mean().round(2)  # True
            first_record = array[0]
            reader_np.close()
               
        See also
        --------        
        savReaderWriter.SavReaderNp.to_ndarray

        """
        # no date conversion in __iter__: that is left to the 
        # convert_datetimes decorator
        self.do_convert_datetimes = False
        try:
            if filename:
                array = np.memmap(filename, self.trunc_dtype, 'w+',
                                  shape=self.nrows)
                for row, record in enumerate(self):
                    array[row] = record
            else:
                # This used to branch to `_uncompressed_to_array` for 
                # uncompressed files, a method this class does not define. 
                # Reading through __iter__ works for any file; for 
                # uncompressed files __init__ already rebinds this method 
                # to the faster implementation.
                array = np.fromiter(self, self.trunc_dtype, self.nrows)
        finally:
            # restore the flag even if reading fails halfway
            self.do_convert_datetimes = True
        return array

[docs]    def all(self, filename=None):
        """Wrapper for to_structured_array; overrides the SavReader version

        See also
        --------        
        savReaderWriter.SavReaderNp.to_structured_array"""
        return self.to_structured_array(filename)

    @convert_missings
    def to_ndarray(self, filename=None):
        """Converts a homogeneous, all-numeric SPSS dataset into an ndarray,
        unless the numerical variables are actually datetimes

        Parameters
        ----------
        filename : str, optional 
                   The filename for the memory mapped array. If omitted, 
                   the array will be in-memory

        Raises
        ------
        ValueError : if the data are not homogeneous. If `rawMode=False` 
            (default) SPSS datetimes are not considered to be numerical, 
            even though they are stored as such in the .sav file

        Returns
        -------
        array : numpy.ndarray (if `filename=None`) or numpy.core.memmap.memmap
                The array has a simple dtype, i.e. is a regular ndarray

        Examples
        --------
        For example::

            import numpy.ma 
            reader_np = SavReaderNp("./test_data/all_numeric.sav")
            array = reader_np.to_ndarray()
            average = numpy.ma.masked_invalid(array).mean()
            reader_np.close()

        See also
        --------
        savReaderWriter.SavReaderNp.is_homogeneous : determines whether a 
            dataset is considered to be all-numeric
        savReaderWriter.SavReaderNp.to_structured_array
        """
        if not self.is_homogeneous:
            raise ValueError("Need only floats and no datetimes in dataset")

        if filename:
            # memory mapped: fill the file-backed array one case at a time
            array = np.memmap(filename, float, 'w+', shape=self.shape)
            row = 0
            for record in self:
                array[row, :] = record
                row += 1
            return array

        # in-memory: flatten all cases into one stream of values
        flattened = chain.from_iterable(self)
        n_values = np.prod(self.shape)
        return np.fromiter(flattened, float, n_values).reshape(self.shape)

[docs]    def to_array(self, filename=None):
        """Wrapper for to_ndarray and to_structured_array. Returns an ndarray if the
        dataset is all-numeric homogeneous (and no datetimes), a structured
        array otherwise

        See also
        --------
        savReaderWriter.SavReaderNp.to_ndarray
        savReaderWriter.SavReaderNp.to_structured_array"""
        if self.is_homogeneous:
            return self.to_ndarray(filename)
        else:
            return self.to_structured_array(filename)



if __name__ == "__main__":
    # Ad-hoc timing script: the name of the reader class to time is given
    # as the first command line argument, e.g. SavReaderNp
    import sys  # explicit: otherwise only reachable through the star imports
    import time
    from contextlib import closing

    # write a small all-numeric test file with a datetime column, unless
    # it already exists
    savFileName = "./test_data/all_numeric_datetime_uncompressed.sav"
    kwargs = dict(
        savFileName=savFileName,
        varNames=["v1", "v2"],
        varTypes={"v1": 0, "v2": 0},
        formats={"v1": "DOLLAR15.2", "v2": "EDATE40"})
    if not os.path.exists(savFileName):
        with SavWriter(**kwargs) as writer:
            for i in xrange(10 ** 2):
                # leave the datetime of the first case missing
                value = None if not i else 11654150400.
                writer.writerow([i, value])

    klass = globals()[sys.argv[1]]
    start = time.time() 
    filename = "./test_data/all_numeric.sav"
    with closing(klass(filename, rawMode=False, ioUtf8=False)) as sav:
        array = sav.to_ndarray()
    print("%s version: %5.3f" % (sys.argv[1], (time.time() - start)))