Source code for savReaderWriter.savReaderNp

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function, division
from pprint import pprint as print
import os
import re
import sys
import datetime
import struct
from math import ceil
from ctypes import *
from functools import wraps, partial
from itertools import chain, islice
from bisect import bisect

try:
    import numpy as np
except ImportError:
    print("WARNING: numpy not found, cannot use savReaderNp")
    class np: nan = float("nan")

from savReaderWriter import *
from error import *
from helpers import *
from py3k import *

# TODO:
# pytables integration
# numba.jit
# function to easily read mmapped array back in


class SavReaderNp(SavReader):
    """
    Read SPSS .sav file data into a numpy array (either in-memory or mmap)

    Parameters
    ----------
    savFileName : str
        The file name of the spss data file
    recodeSysmisTo : value
        Indicates to which value missing values should be recoded
    rawMode : bool
        Set to ``True`` to get faster processing speeds. ``rawMode=False``
        indicates:

        * that trailing blanks will be stripped off of string values
        * that datetime variables (if present) will be converted into
          ``datetime.datetime`` objects,
        * that SPSS `$sysmis` values will be converted into
          `recodeSysmisTo` (default ``np.nan``, except for datetimes).
    ioUtf8 : bool
        Indicates the mode in which text communicated to or from the
        I/O Module will be. Valid values are True (UTF-8 mode aka
        Unicode mode) and False (Codepage mode). Cf. `SET UNICODE=ON/OFF`
    ioLocale : locale str
        indicates the locale of the I/O module. Cf. `SET LOCALE`.
        (default = None, which corresponds to
        `locale.setlocale(locale.LC_ALL, "")`). For example, `en_US.UTF-8`.

    Examples
    --------
    Typical use::

        # memmapped array, omit filename to use in-memory array
        reader_np = SavReaderNp("Employee data.sav")
        array = reader_np.to_structured_array("/tmp/test.dat")
        reader_np.close()

    Note. The sav-to-array conversion is MUCH faster when uncompressed .sav
    files are used. These are created with the SPSS command::

        SAVE OUTFILE = 'some_file.sav' /UNCOMPRESSED.

    This is NOT the default in SPSS.

    See also
    --------
    savReaderWriter.SavWriter : use `_uncompressed.sav` savFileName
        suffix to write uncompressed files"""

    def __init__(self, savFileName, recodeSysmisTo=np.nan, rawMode=False,
                 ioUtf8=False, ioLocale=None):
        super(SavReaderNp, self).__init__(savFileName,
                                          ioUtf8=ioUtf8, ioLocale=ioLocale)
        self.savFileName = savFileName
        self.recodeSysmisTo = recodeSysmisTo
        self.rawMode = rawMode
        self.ioUtf8 = ioUtf8
        self.ioLocale = ioLocale

        # one reusable buffer that receives each case from the I/O module
        self.caseBuffer = self.getCaseBuffer()
        self.unpack = self.getStruct(self.varTypes, self.varNames).unpack_from
        self._init_funcs()
        self.gregorianEpoch = datetime.datetime(1582, 10, 14, 0, 0, 0)
        self.do_convert_datetimes = True
        self.nrows, self.ncols = self.shape

        if self._is_uncompressed:
            # NOTE(review): this handle is not closed anywhere in this class;
            # confirm whether the inherited close() should take care of it.
            self.sav = open(self.savFileName, "rb")
            # NOTE(review): iter(obj) and for-loops look up __iter__ on the
            # type, not on the instance, so (assuming SavReader is a new-style
            # class) this assignment only affects explicit obj.__iter__()
            # calls. Left as is: the fast iterator does not strip strings or
            # convert datetimes, so wiring it in would change results.
            self.__iter__ = self._uncompressed_iter
            self.to_ndarray = self._uncompressed_to_ndarray
            self.to_structured_array = self._uncompressed_to_structured_array

    def _record_from_buffer(self):
        """Helper that returns the current content of `caseBuffer` as a
        numpy array with dtype `struct_dtype`. The data are copied, because
        `caseBuffer` is overwritten when the next case is read"""
        return np.frombuffer(self.caseBuffer, self.struct_dtype).copy()

    def _items(self, start, stop, step):
        """Helper function for __getitem__: yields the records with case
        numbers `xrange(start, stop, step)`"""
        for case in xrange(start, stop, step):
            self.seekNextCase(self.fh, case)
            self.wholeCaseIn(self.fh, byref(self.caseBuffer))
            yield self._record_from_buffer()

    def convert_datetimes(func):
        """Decorator to convert all the SPSS datetimes into datetime.datetime
        values. Missing datetimes are converted into the value
        `datetime.datetime(1, 1, 1, 0, 0, 0)`"""
        @wraps(func)
        def _convert_datetimes(self, *args, **kwargs):
            array = func(self, *args, **kwargs)
            if (self.rawMode or not self.datetimevars or
                    not self.do_convert_datetimes):
                return array

            # calculate count so fromiter can pre-allocate; -1 means unknown
            count = -1 if (args or kwargs) else self.nrows
            if len(args) == 1 and isinstance(args[0], slice):
                start, stop, step = args[0].indices(self.nrows)
                count = len(xrange(start, stop, step))

            # now fill the array with datetimes
            dt_array = array.astype(self.datetime_dtype)
            for varName in self.uvarNames:
                if varName not in self.datetimevars:
                    continue
                datetimes = (self.spss2datetimeDate(dt)
                             for dt in array[varName])
                dt_array[varName] = np.fromiter(datetimes,
                                                "datetime64[us]", count)
            return dt_array
        return _convert_datetimes

    def convert_missings(func):
        """Decorator to recode numerical missing values into `recodeSysmisTo`
        (default: `np.nan`), unless they are datetimes"""
        @wraps(func)
        def _convert_missings(self, *args, **kwargs):
            array = func(self, *args, **kwargs)
            if self.rawMode:
                return array

            # every value at or below the most negative double is recoded
            cutoff = -sys.float_info.max
            sysmis = self.recodeSysmisTo
            is_to_structured_array = \
                func.__name__.endswith('to_structured_array')
            if self.is_homogeneous and not is_to_structured_array:
                array[:] = np.where(array <= cutoff, sysmis, array)
            else:
                for v in self.uvarNames:
                    # skip datetimes and strings (string varType is nonzero)
                    if v in self.datetimevars or self.uvarTypes[v]:
                        continue
                    array[v] = np.where(array[v] <= cutoff, sysmis, array[v])
            if hasattr(array, "flush"):  # memmapped
                array.flush()
            return array
        return _convert_missings

    @convert_datetimes
    def __getitem__(self, key):
        """x.__getitem__(y) <==> x[y], where y may be int or slice

        Parameters
        ----------
        key : int, slice

        Returns
        -------
        record : numpy.ndarray

        Raises
        -------
        IndexError, TypeError
        """
        if isinstance(key, slice):
            start, stop, step = key.indices(self.nrows)
            # exact number of selected cases (also correct when the slice
            # length is not a multiple of step, or when the slice is empty)
            count = len(xrange(start, stop, step))
            records = self._items(start, stop, step)
            record = np.fromiter(records, self.struct_dtype, count)
        elif isinstance(key, int):
            if not -self.nrows <= key < self.nrows:
                raise IndexError("index out of bounds")
            key = self.nrows + key if key < 0 else key
            self.seekNextCase(self.fh, key)
            self.wholeCaseIn(self.fh, self.caseBuffer)
            record = self._record_from_buffer()
        else:
            raise TypeError("slice or int required")

        # rewind for possible subsequent call to __iter__
        self.seekNextCase(self.fh, 0)
        return record

    def __iter__(self):
        """x.__iter__() <==> iter(x). Yields records as a tuple.
        If `rawMode=True`, trailing spaces of strings are not removed
        and SPSS dates are not converted into `datetime` dates. The same
        unprocessed records are yielded when the dataset has no datetime
        variables or when `do_convert_datetimes` is False.

        Returns
        -------
        record : tuple

        Raises
        -------
        SPSSIOError
        """
        varNames = self.uvarNames
        varTypes = self.uvarTypes
        datetimevars = self.datetimevars
        shortcut = (self.rawMode or not self.do_convert_datetimes or
                    not datetimevars)
        for row in xrange(self.nrows):
            self.wholeCaseIn(self.fh, self.caseBuffer)
            record = self.unpack(self.caseBuffer)
            if shortcut:
                yield record
                continue
            yield tuple([self.spss2datetimeDate(value) if v in datetimevars
                         else value.rstrip() if varTypes[v] else value
                         for value, v in izip(record, varNames)])

    def _init_funcs(self):
        """Helper to initialize C functions of the SPSS I/O module: set their
        argtypes and _errcheck attributes"""
        # NOTE(review): ctypes calls the hook named `errcheck`; the
        # `_errcheck` attribute set below is a plain attribute. Confirm
        # whether return-code checking is meant to be active. Switching to
        # `errcheck` would also require _errcheck to return its result.
        self.seekNextCase = self.spssio.spssSeekNextCase
        self.seekNextCase.argtypes = [c_int, c_long]
        self.seekNextCase._errcheck = self._errcheck

        self.record_size = sizeof(self.caseBuffer)
        self.wholeCaseIn = self.spssio.spssWholeCaseIn
        self.wholeCaseIn.argtypes = [c_int,
                                     POINTER(c_char * self.record_size)]
        self.wholeCaseIn._errcheck = self._errcheck

    def _errcheck(self, retcode, func, arguments):
        """Checks for return codes > 0 when calling C functions of the
        SPSS I/O module; raises SPSSIOError if that is the case"""
        if retcode > 0:
            error = retcodes.get(retcode, retcode)
            msg = "function %r with arguments %r throws error: %s"
            msg = msg % (func.__name__, arguments, error)
            raise SPSSIOError(msg, retcode)

    @memoized_property
    def uvarNames(self):
        """Returns a list of variable names, as unicode strings"""
        if self.ioUtf8:
            return self.varNames
        return [v.decode(self.fileEncoding) for v in self.varNames]

    @memoized_property
    def uvarTypes(self):
        """Returns a dictionary of variable names, as unicode strings (keys)
        and variable types (values, int). Variable type == 0 indicates
        numerical values, other values indicate the string length in bytes"""
        if self.ioUtf8:
            return self.varTypes
        return {v.decode(self.fileEncoding): t
                for v, t in self.varTypes.items()}

    @memoized_property
    def uformats(self):
        """Returns a dictionary of variable names (keys) and SPSS formats
        (values), both as unicode strings"""
        if self.ioUtf8:
            return self.formats
        encoding = self.fileEncoding
        return {v.decode(encoding): fmt.decode(encoding)
                for v, fmt in self.formats.items()}

    @memoized_property
    def datetimevars(self):
        """Returns a list of the datetime variable names (as unicode strings)
        in the dataset, if any. A variable counts as a datetime if its SPSS
        format contains "date" or "time" (case-insensitive)"""
        return [varName for varName in self.uvarNames
                if re.search("date|time", self.uformats[varName], re.I)]

    @memoized_property
    def _titles(self):
        """Helper function that uses varLabels to get the titles for a dtype.
        If a variable has no varLabel, `col_NNN` (NNN = zero-padded column
        index) is used instead"""
        titles = [self.varLabels[v] if self.varLabels[v] else
                  bytez("col_%03d" % col)
                  for col, v in enumerate(self.varNames)]
        return [title.decode(self.fileEncoding)
                if not isinstance(title, unicode) else title
                for title in titles]

    @memoized_property
    def is_homogeneous(self):
        """Returns boolean that indicates whether the dataset contains only
        numerical variables (datetimes excluded). If `rawMode=True`, datetimes
        are also considered numeric. A dataset with string variables of equal
        length is not considered to be homogeneous"""
        # numeric variables have varType 0, strings have their byte length
        is_all_numeric = not any(self.varTypes.values())
        if self.rawMode:
            return is_all_numeric
        return is_all_numeric and not self.datetimevars

    @memoized_property
    def struct_dtype(self):
        """Get the dtype that is used to unpack the binary record

        Returns
        -------
        struct dtype : numpy.dtype (complex dtype if heterogeneous data,
            simple dtype otherwise). A complex dtype uses `varNames` as
            names and `varLabels` (if any) as titles (fields)."""
        byteorder = u"<" if self.byteorder == u"little" else u">"
        if self.is_homogeneous:
            return np.dtype(byteorder + u"d")

        # string widths are rounded up to the next multiple of 8 bytes
        fmt8 = lambda varType: int(ceil(varType / 8.) * 8)
        varTypes = [self.varTypes[varName] for varName in self.varNames]
        formats = [u"a%d" % fmt8(t) if t else u"%sd" % byteorder
                   for t in varTypes]
        obj = dict(names=self.uvarNames, formats=formats,
                   titles=self._titles)
        return np.dtype(obj)

    @memoized_property
    def trunc_dtype(self):
        """Returns the numpy dtype using the SPSS display formats

        The following spss-format to numpy-dtype conversions are made:

        +------------+------------------+
        | spss       | numpy            |
        +============+==================+
        | `F1`       | `float16` (`f2`) |
        +------------+------------------+
        | `F2`-`F4`  | `float32` (`f4`) |
        +------------+------------------+
        | >= `F5`    | `float64` (`f8`) |
        +------------+------------------+
        | (datetime) | `float64` (`f8`)*|
        +------------+------------------+
        | A1 >=      | `S1` >= (`a1`)   |
        +------------+------------------+

        *) Subsequently converted to `datetime.datetime` unless
        `rawMode=True`. Examples of SPSS datetime display formats are `SDATE`,
        `EDATE`, `ADATE`, `JDATE` and `TIME`.

        Note that all numerical values are stored in SPSS files as double
        precision floats. The SPSS display formats are used to create a more
        compact dtype. Datetime formats are never shrunk to a more compact
        format. In the table above, only F and A formats are displayed, but
        other numerical (e.g. `DOLLAR`) or string (`AHEX`) are treated the
        same way, e.g. `DOLLAR5.2` will become `float64`.

        Returns
        -------
        truncated dtype : numpy.dtype (complex dtype)

        See also
        --------
        :ref:`formats` : overview of SPSS display formats
        :ref:`dateformats` : overview of SPSS datetime formats
        """
        dst_fmts = [u"f2", u"f4", u"f8", u"f8"]
        # bisect == bisect_right: width 1 -> f2, 2-4 -> f4, 5 and up -> f8
        get_dtype = lambda src_fmt: dst_fmts[bisect([2, 5, 8], src_fmt)]
        # the display width is the first number in the format, e.g. 15 for
        # DOLLAR15.2
        widths = [int(re.search(r"\d+", self.uformats[v]).group(0))
                  for v in self.uvarNames]
        formats = [u'a%s' % widths[i] if self.uvarTypes[v] else
                   u"f8" if v in self.datetimevars else
                   get_dtype(widths[i])
                   for i, v in enumerate(self.uvarNames)]
        obj = dict(names=self.uvarNames, formats=formats,
                   titles=self._titles)
        return np.dtype(obj)

    @memoized_property
    def datetime_dtype(self):
        """Return the modified dtype in order to accommodate
        `datetime.datetime` values that were originally datetimes, stored
        as floats, in the SPSS file

        Returns
        -------
        datetime dtype : numpy.dtype (complex dtype)
        """
        if not self.datetimevars:
            return self.trunc_dtype
        formats = ["datetime64[us]" if name in self.datetimevars else fmt
                   for (title, name), fmt in self.trunc_dtype.descr]
        obj = dict(names=self.uvarNames, formats=formats,
                   titles=self._titles)
        return np.dtype(obj)

    @memoize
    def spss2datetimeDate(self, spssDateValue):
        """Convert an SPSS datetime into a ``datetime.datetime`` object

        Parameters
        ----------
        spssDateValue : float, int
            Number of seconds since `gregorianEpoch` (1582-10-14)

        Returns
        -------
        datetime : datetime.datetime; errors and missings are returned as
            ``datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)``

        See also
        --------
        savReaderWriter.SavReader.spss2strDate : convert SPSS datetime into
            a datetime string
        :ref:`dateformats` : overview of SPSS datetime formats
        """
        try:
            return self.gregorianEpoch + \
                   datetime.timedelta(seconds=spssDateValue)
        except (OverflowError, TypeError, ValueError):
            return datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)

    # ---- functions that deal with uncompressed .sav files ----

    @memoized_property
    def _is_uncompressed(self):
        """Returns True if the .sav file was not compressed at all, False
        otherwise (i.e., neither standard, nor zlib compression was used)."""
        return self.fileCompression == b"uncompressed"

    def _uncompressed_iter(self):
        """Faster version of __iter__ that can only be used with
        uncompressed .sav files. Yields the unpacked records as they are:
        strings are not stripped and datetimes are not converted"""
        self.sav.seek(self._offset)
        for case in xrange(self.nrows):
            yield self.unpack(self.sav.read(self.record_size))

    @property
    def _offset(self):
        """Returns the position of the first byte after the type 999 record,
        which indicates the end of the metadata and the start of the case
        data. The file position of `self.sav` is left at that offset.

        Raises
        ------
        ValueError : if the file contains no type 999 record
        """
        # same native-order int32 encoding the record code is compared with
        marker = struct.pack("i", 999)
        overlap = len(marker) - 1
        self.sav.seek(0)
        window = b""
        window_start = 0  # file offset of window[0]
        while True:
            chunk = self.sav.read(65536)
            if not chunk:
                raise ValueError("type 999 record not found in %r" %
                                 self.savFileName)
            window += chunk
            found = window.find(marker)
            if found != -1:
                self.sav.seek(window_start + found + len(marker))
                self.sav.read(4)  # skip the 4 bytes that follow the code
                return self.sav.tell()
            # keep a tail so a marker that straddles two chunks is not missed
            tail = window[-overlap:]
            window_start += len(window) - len(tail)
            window = tail

    @convert_datetimes
    @convert_missings
    def _uncompressed_to_structured_array(self, filename=None):
        """Read an uncompressed .sav file and return as a structured array

        Parameters
        ----------
        filename : str, optional
            The filename for the memory mapped array. If omitted, the array
            will be in-memory

        Raises
        ------
        ValueError : if the file is not uncompressed
        """
        if not self._is_uncompressed:
            raise ValueError("Only uncompressed files can be used")
        self.sav.seek(self._offset)
        # NOTE(review): the case data are read with `trunc_dtype` rather than
        # `struct_dtype`; confirm that this matches the on-disk record layout
        # when the two dtypes differ.
        data = np.fromfile(self.sav, self.trunc_dtype, self.nrows)
        if not filename:
            return data
        array = np.memmap(filename, self.trunc_dtype, 'w+', shape=self.nrows)
        array[:] = data
        return array

    @convert_missings
    def _uncompressed_to_ndarray(self, filename=None):
        """Read an uncompressed .sav file and return as an ndarray

        Parameters
        ----------
        filename : str, optional
            The filename for the memory mapped array. If omitted, the array
            will be in-memory

        Raises
        ------
        ValueError : if the file is not uncompressed, or if the data are
            not homogeneous
        """
        if not self._is_uncompressed:
            raise ValueError("Only uncompressed files can be used")
        if not self.is_homogeneous:
            raise ValueError("Need only floats and no datetimes in dataset")
        self.sav.seek(self._offset)
        count = np.prod(self.shape)
        data = np.fromfile(self.sav, float, count)
        if filename:
            array = np.memmap(filename, float, 'w+', shape=count)
            array[:] = data
        else:
            array = data
        return array.reshape(self.shape)

    # ------------------------------------------------------------------------

    @convert_datetimes
    @convert_missings
    def to_structured_array(self, filename=None):
        """Return the data in <savFileName> as a structured array, optionally
        using <filename> as a memmapped file.

        Parameters
        ----------
        filename : str, optional
            The filename for the memory mapped array. If omitted,
            the array will be in-memory

        Returns
        -------
        array : numpy.ndarray (if `filename=None`) or numpy.core.memmap.memmap
            The array has a complex dtype, i.e. is a structured array. If
            defined, `varLabels` may also be used to retrieve columns

        Examples
        --------
        For example::

            reader_np = SavReaderNp("./test_data/Employee data.sav")
            array = reader_np.to_structured_array()
            mean_salary = array["salary"].mean().round(2)
            mean_salary == array["Current Salary"].mean().round(2)  # True
            first_record = array[0]
            reader_np.close()

        See also
        --------
        savReaderWriter.SavReaderNp.to_ndarray
        """
        self.do_convert_datetimes = False  # no date conversion in __iter__
        try:
            if filename:
                array = np.memmap(filename, self.trunc_dtype, 'w+',
                                  shape=self.nrows)
                for row, record in enumerate(self):
                    array[row] = record
            else:
                array = np.fromiter(self, self.trunc_dtype, self.nrows)
        finally:
            # always re-enable conversion, also when reading fails
            self.do_convert_datetimes = True
        return array

    def all(self, filename=None):
        """Wrapper for to_structured_array; overrides the SavReader version

        See also
        --------
        savReaderWriter.SavReaderNp.to_structured_array"""
        return self.to_structured_array(filename)

    @convert_missings
    def to_ndarray(self, filename=None):
        """Converts a homogeneous, all-numeric SPSS dataset into an ndarray,
        unless the numerical variables are actually datetimes

        Parameters
        ----------
        filename : str, optional
            The filename for the memory mapped array. If omitted,
            the array will be in-memory

        Raises
        ------
        ValueError : if the data are not homogeneous. If `rawMode=False`
            (default) SPSS datetimes are not considered to be numerical,
            even though they are stored as such in the .sav file

        Returns
        -------
        array : numpy.ndarray (if `filename=None`) or numpy.core.memmap.memmap
            The array has a simple dtype, i.e. is a regular ndarray

        Examples
        --------
        For example::

            import numpy.ma
            reader_np = SavReaderNp("./test_data/all_numeric.sav")
            array = reader_np.to_ndarray()
            average = numpy.ma.masked_invalid(array).mean()
            reader_np.close()

        See also
        --------
        savReaderWriter.SavReaderNp.is_homogeneous : determines whether a
            dataset is considered to be all-numeric
        savReaderWriter.SavReaderNp.to_structured_array
        """
        if not self.is_homogeneous:
            raise ValueError("Need only floats and no datetimes in dataset")
        if filename:
            array = np.memmap(filename, float, 'w+', shape=self.shape)
            for row, record in enumerate(self):
                array[row, :] = record
            return array
        # flatten all records into one stream of floats, then restore shape
        values = chain.from_iterable(self)
        count = np.prod(self.shape)
        return np.fromiter(values, float, count).reshape(self.shape)

    def to_array(self, filename=None):
        """Wrapper for to_ndarray and to_structured_array. Returns an ndarray
        if the dataset is all-numeric homogeneous (and no datetimes), a
        structured array otherwise

        See also
        --------
        savReaderWriter.SavReaderNp.to_ndarray
        savReaderWriter.SavReaderNp.to_structured_array"""
        if self.is_homogeneous:
            return self.to_ndarray(filename)
        return self.to_structured_array(filename)
if __name__ == "__main__":
    # Small benchmark: time the conversion of a .sav file into an array.
    # Usage: python savReaderNp.py [class name], e.g. SavReaderNp
    import sys
    import time
    from contextlib import closing

    # create a small uncompressed test file with a numeric and a date column
    savFileName = "./test_data/all_numeric_datetime_uncompressed.sav"
    kwargs = dict(
        savFileName=savFileName,
        varNames=["v1", "v2"],
        varTypes={"v1": 0, "v2": 0},
        formats={"v1": "DOLLAR15.2", "v2": "EDATE40"},
    )
    if not os.path.exists(savFileName):
        with SavWriter(**kwargs) as writer:
            for i in xrange(10 ** 2):
                # the first record gets a missing date
                value = None if not i else 11654150400.
                writer.writerow([i, value])

    # the reader class to benchmark is looked up by name; default to
    # SavReaderNp when no command line argument is given
    klass_name = sys.argv[1] if len(sys.argv) > 1 else "SavReaderNp"
    klass = globals()[klass_name]
    start = time.time()
    #filename = "./test_data/Employee data.sav"
    #filename = "./test_data/greetings.sav"
    filename = "./test_data/all_numeric.sav"
    #filename = "/home/albertjan/nfs/Public/somefile_uncompressed.sav"
    #filename = '/home/antonia/Desktop/big.sav'
    #filename = '/home/albertjan/nfs/Public/bigger.sav'
    with closing(klass(filename, rawMode=False, ioUtf8=False)) as sav:
        #print(sav.struct_dtype.descr)
        array = sav.to_ndarray()  #"/tmp/test.dat")
        #array = sav.to_structured_array()
        #print(sav.formats)
        #array = sav.all()  #"/tmp/test.dat")
        #for record in sav:
            #print(record)
            #pass
    print("%s version: %5.3f" % (klass_name, (time.time() - start)))