Source code for savReaderWriter.savReader

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from ctypes import *
import os
import operator
import locale
import datetime
import collections
import functools

from savReaderWriter import *
from header import *
from helpers import *

@rich_comparison
@implements_to_string
[docs]class SavReader(Header): """ Read SPSS system files (.sav, .zsav) Parameters ---------- savFileName : str the file name of the spss data file returnHeader : bool, default False indicates whether the first record should be a list of variable names recodeSysmisTo: (value), default None indicates to which value SPSS missing values (`$sysmis`) should be recoded. Any value below 10 ** -10 is returned as None verbose : bool, default False indicates whether information about the spss data file (e.g., number of cases, variable names, file size) should be printed on the screen. selectVars : list or None, default None indicates which variables in the file should be selected. The variables should be specified as a list of valid variable names. If ``None`` is specified, all the variables in the file are used idVar : str or None, default None indicates which variable in the file should be used for use as id variable for the 'get' method rawMode : bool, default False indicates whether values should get SPSS-style formatting, and whether date variables (if present) should be converted into ISO-dates. If set to ``True`` the program does not format any values, which increases processing speed. In particular ``rawMode=True`` implies that: * SPSS datetimes will not be converted into ISO8601 dates * SPSS `N` formats will not be converted into strings with leading zeroes * SPSS `$sysmis` values will not be converted into ``None`` values * String values will be ceiled multiples of 8 bytes See also :ref:`formats` and :ref:`dateformats` ioUtf8 : bool, int, default False indicates the mode in which text communicated to or from the I/O Module will be. * `codepage mode`: ``ioUtf8=CODEPAGE_MODE``, or ``ioUtf8=0``, or ``ioUtf8=False``. Use the current ioLocale setting to determine the encoding for reading and writing data. Cf. `SET UNICODE=OFF`. * `standard unicode mode`: ``ioUtf8=UNICODE_UMODE``, or ``ioUtf8=1``, or ``ioUtf8=True``. Use Unicode encoding (UTF-8) for reading and writing data. Data are returned as ``unicode`` strings. Cf. `SET UNICODE=ON`. * `bytes unicode mode`: ``ioUtf8=UNICODE_BMODE``, or ``ioUtf8=2``. Like standard unicode mode, but data are returned as ``byte`` strings. See also under :py:meth:`savReaderWriter.Generic.ioUtf8` and under ``ioUtf8`` in :py:class:`savReaderWriter.SavWriter`. .. versionchanged:: 3.4 ``ioUtf8=UNICODE_BMODE`` was added. ioLocale : str or None, default None indicates the locale of the I/O module. Cf. `SET LOCALE` (default = ``None``, which corresponds to ``locale.setlocale(locale.LC_CTYPE)``, for example: ``en_US.UTF-8`` (Unix) or ``english`` (Windows). See also under :py:meth:`savReaderWriter.Generic.ioLocale`. Examples -------- Typical use: .. code-block:: python with SavReader('somefile.sav', returnHeader=True) as reader: header = reader.next() for line in reader: process(line) """
[docs] def __init__(self, savFileName, returnHeader=False, recodeSysmisTo=None, verbose=False, selectVars=None, idVar=None, rawMode=False, ioUtf8=False, ioLocale=None): """ Constructor. Initializes all vars that can be recycled """ super(SavReader, self).__init__(savFileName, b"rb", None, ioUtf8, ioLocale) self.savFileName = savFileName self.returnHeader = returnHeader self.recodeSysmisTo = recodeSysmisTo self.verbose = verbose self.selectVars = selectVars self.idVar = idVar self.rawMode = rawMode self.header = self.getHeader(self.selectVars) self.bareformats, self.varWids = self._splitformats() self.autoRawMode = self._isAutoRawMode() self.ioUtf8_ = ioUtf8 self.sysmis_ = self.sysmis self.numVars = self.numberofVariables self.nCases = self.numberofCases self.myStruct = self.getStruct(self.varTypes, self.varNames) self.unpack_from = self.myStruct.unpack_from self.seekNextCase = self.spssio.spssSeekNextCase self.caseBuffer = self.getCaseBuffer() self.current_case_number = -1
[docs] def __enter__(self): """ This function opens the spss data file (context manager).""" if self.verbose and self.ioUtf8_: print(self.replace(os.linesep, "\n")) elif self.verbose: print(str(self).replace(os.linesep, "\n")) return self
[docs] def __exit__(self, type, value, tb): """ This function closes the spss data file and does some cleaning. .. warning:: Always ensure the the .sav file is properly closed, either by using a context manager (``with`` statement) or by using ``close()``""" if type is not None: pass # Exception occurred self.close()
[docs] def close(self): """This function closes the spss data file and does some cleaning.""" if not segfaults: self.closeSavFile(self.fh, mode=b"rb") del self.spssio try: locale.resetlocale() # fails on Windows except: locale.setlocale(locale.LC_ALL, "")
[docs] def __len__(self): """ This function reports the number of cases (rows) in the spss data file. For example: len(SavReader(savFileName))""" return self.nCases # Python 3: see @rich_comparison class decorator
[docs] def __cmp__(self, other): """ This function implements behavior for all of the comparison operators so comparisons can be made between SavReader instances, or comparisons between SavReader instances and integers.""" if not isinstance(other, (SavReader, int)): raise TypeError other = other if isinstance(other, int) else len(other) if len(self) < other: return -1 elif len(self) == other: return 0 else: return 1
[docs] def __hash__(self): """This function returns a hash value for the object to ensure it is hashable.""" return id(self)
[docs] def __str__(self): """This function returns a conscise file report of the spss data file For example:: data = SavReader(savFileName) print(str(data)) # Python 3: bytes(data) data.close()""" return self.__unicode__().encode(self.fileEncoding)
[docs] def __unicode__(self): """This function returns a conscise file report of the spss data file. For example:: data = SavReader(savFileName) print(unicode(data)) # Python 3: str(data) data.close()""" return self.getFileReport()
[docs] def __next__(self): """reader.next() -> the next value, or raise StopIteration""" return self.next()
[docs] def next(self): """reader.next() -> the next value, or raise StopIteration""" self.current_case_number += 1 nCases = self.nCases + 1 if self.returnHeader else self.nCases if self.current_case_number > nCases: raise StopIteration return next(iter(self))
@memoized_property def shape(self): """This function returns the number of rows (nrows) and columns (ncols) as a namedtuple. For example:: data = SavReader(savFileName) data.shape.nrows == len(data) # True data.close()""" shape = (self.nCases, self.numVars) return collections.namedtuple("Shape", "nrows ncols")(*shape) def _isAutoRawMode(self): """Helper function for formatValues function. Determines whether iterating over each individual value is really needed""" hasDates = bool(set(self.bareformats.values()) & set(supportedDates)) hasNfmt = b"N" in list(self.bareformats.values()) hasStrings = any(self.varTypes.values()) #hasRecodeSysmis = self.recodeSysmisTo is not None return not any([hasDates, hasNfmt, hasStrings, self.ioUtf8_]) # TODO: turn this into a decorator
[docs] def formatValues(self, record): """This function formats date fields to ISO dates (yyyy-mm-dd), plus some other date/time formats. The SPSS N format is formatted to a character value with leading zeroes. System missing values are recoded to <recodeSysmisTo>, which defaults to `None`. If rawMode==True, this function does nothing""" sysmis = self.sysmis if self.rawMode: return record # 6-7 times faster! elif self.autoRawMode: # only recode SPSS $sysmis to Python None return [None if item <= sysmis else item for item in record] else: for i, value in enumerate(record): varName = self.header[i] varType = self.varTypes[varName] bareformat_ = self.bareformats[varName] varWid = self.varWids[varName] if varType == 0: # recode system missing values, if present and desired if value <= sysmis: record[i] = self.recodeSysmisTo # format N-type values (=numerical with leading zeroes) if bareformat_ in (b"N", u"N"): #record[i] = str(value).zfill(varWid) nfmt_value = "%%0%dd" % varWid % value #15 x faster (zfill) nfmt_value = nfmt_value if self.ioUtf8 == 1 else bytez(nfmt_value) record[i] = nfmt_value #15 x faster (zfill) # convert SPSS dates to ISO dates elif bareformat_ in supportedDates: fmt = supportedDates[bareformat_] args = (value, fmt, self.recodeSysmisTo) record[i] = self.spss2strDate(*args) if bareformat_ == b"QYR" and record[i]: # convert month to quarter, e.g. 12 Q 1990 --> 4 Q 1990 # There is no such thing as a %q strftime directive try: record[i] = QUARTERS[record[i][:2]] + record[i][2:] except (KeyError, TypeError): record[i] = self.recodeSysmisTo elif varType > 0: value = value.rstrip() if self.ioUtf8_: record[i] = value.decode("utf-8") else: record[i] = value return record
def _items(self, start=0, stop=None, step=1, returnHeader=False): """ This is a helper function to implement the __getitem__ and the __iter__ special methods. """ #import pdb; pdb.set_trace() if returnHeader and self.current_case_number <= 0: self.current_case_number += 1 yield self.header used_as_iterator = all([start == 0, stop is None, step == 1]) if not used_as_iterator: retcode = self.seekNextCase(c_int(self.fh), c_long(0)) # reset if retcode: checkErrsWarns("Problem seeking first case", retcode) stop = self.nCases if stop is None else stop selection = self.selectVars is not None selectOne = len(self.selectVars) == 1 if self.selectVars else None for case in xrange(start, stop, step): if start or step != 1: # only call this when iterating over part of the records retcode = self.seekNextCase(c_int(self.fh), c_long(case)) if retcode: checkErrsWarns("Problem seeking case %d" % case, retcode) record = self.record if selection: record = self.selector(record) record = [record] if selectOne else list(record) yield self.formatValues(record)
[docs] def __iter__(self): """x.__iter__() <==> iter(x). Yields records as a list. For example:: with SavReader("someFile.sav") as reader: for line in reader: process(line)""" return self._items(0, None, 1, self.returnHeader)
[docs] def __getitem__(self, key): """x.__getitem__(y) <==> x[y], where y may be int or slice. This function reports the record of case number <key>. The <key> argument may also be a slice, for example:: data = SavReader("someFile.sav") print("The first six records look like this: %s" % data[:6]) print("The first record looks like this: %s" % data[0]) print("First column: %s" % data[..., 0]) # requires numpy print("Row 4 & 5, first three cols: %s" % data[4:6, :3]) data.close()""" is_slice = isinstance(key, slice) is_array_slice = key is Ellipsis or isinstance(key, tuple) if is_slice: start, stop, step = key.indices(self.nCases) elif is_array_slice: return self._get_array_slice(key, self.nCases, len(self.header)) else: key = operator.index(key) start = key + self.nCases if key < 0 else key if not 0 <= start < self.nCases: raise IndexError("Index out of bounds") stop = start + 1 step = 1 records = self._items(start, stop, step) if is_slice: return list(records) return next(records)
def _cast_array(self, cstart, cstop, cstep, raw_result): """Helper for _get_array_slice function""" varNames = self.varNames[slice(cstart, cstop, cstep)] numVars = [v for v in varNames if self.varTypes[v] == 0 and not re.search(b"time|date|n\d+", self.formats[v], re.I)] return [[float(item) if v in numVars else item for v, item in zip(varNames, record)]for record in raw_result] def _get_array_slice(self, key, nRows, nCols): """This is a helper function to implement array slicing with numpy""" if not numpyOk: raise ImportError("Array slicing requires the numpy library") is_index = False rstart = cstart = 0 cstop = cstep = None try: row, col = key if isinstance(row, int) and row < 0: row = nRows + row if isinstance(col, int) and col < 0: col = nCols + col ## ... slices if isinstance(row, slice) and col is Ellipsis: # reader[1:2, ...] rstart, rstop, rstep = row.indices(nRows) cstart, cstop, cstep = 0, nRows, 1 elif row is Ellipsis and isinstance(col, slice): # reader[..., 1:2] rstart, rstop, rstep = 0, nRows, 1 cstart, cstop, cstep = col.indices(nCols) elif isinstance(row, slice) and isinstance(col, slice): # reader[1:2, 1:2] rstart, rstop, rstep = row.indices(nRows) cstart, cstop, cstep = col.indices(nCols) elif row is Ellipsis and col is Ellipsis: # reader[..., ...] # DeprecationWarning in recent numpy versions rstart, rstop, rstep = 0, nRows, 1 cstart, cstop, cstep = 0, nCols, 1 ## ... indexes elif isinstance(row, int) and col is Ellipsis: # reader[1, ...] rstart, rstop, rstep = row, row + 1, 1 cstart, cstop, cstep = 0, nCols, 1 is_index = True elif row is Ellipsis and isinstance(col, int): # reader[..., 1] rstart, rstop, rstep = 0, nRows, 1 cstart, cstop, cstep = col, col + 1, 1 is_index = True elif isinstance(row, int) and isinstance(col, int): # reader[1, 1] rstart, rstop, rstep = row, row + 1, 1 cstart, cstop, cstep = col, col + 1, 1 is_index = True # ... slice + index elif isinstance(row, slice) and isinstance(col, int): # reader[1:2, 1] rstart, rstop, rstep = row.indices(nRows) cstart, cstop, cstep = col, col + 1, 1 elif isinstance(row, int) and isinstance(col, slice): # reader[1, 1:2] rstart, rstop, rstep = row, row + 1, 1 cstart, cstop, cstep = col.indices(nCols) try: if not 0 <= abs(rstart) < nRows: raise IndexError("Index out of bounds") if not 0 <= abs(cstart) < nCols: raise IndexError("Index out of bounds") key = (Ellipsis, slice(cstart, cstop, cstep)) except UnboundLocalError: msg = "The array index is either invalid, or not implemented" raise TypeError(msg) except TypeError: # reader[...] rstart, rstop, rstep = 0, nRows, 1 key = (Ellipsis, Ellipsis) # select the rows, cols respectively records = self._items(rstart, rstop, rstep) raw_result = numpy.array(list(records))[key].tolist() # cast the result, so floats become floats again result = self._cast_array(cstart, cstop, cstep, raw_result) # flatten list if it's row or one col if abs(key[1].start - key[1].stop) == 1 or len(result) == 1: return functools.reduce(list.__add__, result) if is_index: return result[0] return result
[docs] def head(self, n=5): """ This convenience function returns the first <n> records. Example:: data = SavReader("someFile.sav") print("The first five records look like this: %s" % data.head()) data.close()""" return self[:abs(n)]
[docs] def tail(self, n=5): """ This convenience function returns the last <n> records. Example:: data = SavReader("someFile.sav") print("The last four records look like this: %s" % data.tail(4)) data.close()""" return self[-abs(n):]
[docs] def all(self): """ This convenience function returns all the records. Example:: data = SavReader("someFile.sav") list_of_lists = data.all() data.close()""" return [record for record in iter(self)]
[docs] def __contains__(self, item): """ This function implements membership testing and returns True if <idVar> contains <item>. Thus, it requires the 'idVar' parameter to be set. Example:: reader = SavReader(savFileName, idVar="ssn") "987654321" in reader # returns True or False """ return bool(self.get(item))
[docs] def get(self, key, default=None, full=False): """ This function returns the records for which <idVar> == <key> if <key> in <savFileName>, else <default>. Thus, the function mimics dict.get, but note that dict[key] is NOT implemented. NB: Even though this uses a binary search, this is not very fast on large data (esp. the first call, and with full=True) Parameters ---------- key : str, int, float key for which the corresponding record should be returned default : (value) value that should be returned if <key> is not found full : bool value that indicates whether *all* records for which <idVar> == <key> should be returned Examples -------- For example:: data = SavReader(savFileName, idVar="ssn") data.get("987654321", "social security number not found!") data.close()""" if not self.idVar in self.varNames: msg = ("SavReader object must be instantiated with an existing " + "variable as an idVar argument") raise NameError(msg) #two slightly modified functions from the bisect module def bisect_right(a, x, lo=0, hi=None): if hi is None: hi = len(a) while lo < hi: mid = (lo + hi) // 2 if x < a[mid][0]: hi = mid # a[mid][0], not a[mid] else: lo = mid + 1 return lo def bisect_left(a, x, lo=0, hi=None): if hi is None: hi = len(a) while lo < hi: mid = (lo + hi) // 2 if a[mid][0] < x: lo = mid + 1 # a[mid][0], not a[mid] else: hi = mid return lo idPos = self.varNames.index(self.idVar) if not hasattr(self, "isSorted"): self.isSorted = True if self.varTypes[self.idVar] == 0: if not isinstance(key, (int, float)): return default self.recordz = ((record[idPos], i) for i, record in enumerate(iter(self))) else: if not isinstance(key, basestring): return default self.recordz = ((record[idPos].rstrip(), i) for i, record in enumerate(iter(self))) self.recordz = sorted(self.recordz) insertLPos = bisect_left(self.recordz, key) insertRPos = bisect_right(self.recordz, key) if full: result = [self[record[1]] for record in self.recordz[insertLPos: insertRPos]] else: if insertLPos == insertRPos: return default result = self[self.recordz[insertLPos][1]] if result: return result return default
[docs] def getSavFileInfo(self): """ This function reads and returns some basic information of the open spss data file.""" return (self.numVars, self.nCases, self.varNames, self.varTypes, self.formats, self.varLabels, self.valueLabels)
[docs] def decode(func): """Decorator to decode datestrings for ioUtf8""" @functools.wraps(func) def wrapper(*args): value = func(*args) self = args[0] if not self.ioUtf8 or self.ioUtf8 == 2: return value # unchanged try: return value.decode("utf-8") except AttributeError: return value return wrapper
@memoize @decode
[docs] def spss2strDate(self, spssDateValue, fmt, recodeSysmisTo): """This function converts internal SPSS dates (number of seconds since midnight, Oct 14, 1582 (the beginning of the Gregorian calendar)) to a human-readable format (ISO-8601 where possible) Parameters ---------- spssDateValue : int, float fmt : strptime format recodeSysmisTo : what SPSS $sysmis values will be replaced with Examples -------- For example:: data = SavReader(savFileName) iso_date = data.spss2strDate(11654150400.0, "%Y-%m-%d", None) data.close() See also -------- savReaderWriter.SavReaderNp.spss2datetimeDate : returns ``datetime.datetime`` object strptime-formats-settings :download:`__init__.py <../__init__.py>` to change the strptime formats from ISO into something else. Note that dates before 1900 are *not* affected by format changes in `__init__.py`. :ref:`dateformats` : overview of SPSS datetime formats""" try: MIDNIGHT_OCT_14_1582 = 86400 time_only = spssDateValue < MIDNIGHT_OCT_14_1582 is_time_fmt = fmt.startswith("%H:%M:%S") and time_only is_dtime_fmt = fmt == "%d %H:%M:%S" is_normal_fmt = not is_time_fmt and not is_dtime_fmt delta = datetime.timedelta(seconds=spssDateValue) gregorianEpoch = datetime.datetime(1582, 10, 14, 0, 0, 0) theDate = (gregorianEpoch + delta) if theDate.year >= 1900 and is_normal_fmt: return bytez(datetime.datetime.strftime(theDate, fmt)) elif is_normal_fmt: #import mx.DateTime # Python 2 only (2015) #return mx.DateTime.DateTimeFrom(theDate).strftime(fmt) if "%H" in fmt: return bytez(theDate.isoformat(" ")) return bytez(theDate.isoformat().split("T")[0]) elif is_time_fmt: return bytez(str(delta).zfill(8)) elif is_dtime_fmt: time_part = bytez(theDate.isoformat().split("T")[1]) day_part = bytez(str(delta.days).zfill(2)) return day_part + b" " + time_part else: raise RuntimeError except (OverflowError, TypeError, ValueError): return recodeSysmisTo
[docs] def getFileReport(self): """ This function prints a report about basic file characteristics """ filesize = os.path.getsize(self.savFileName) kb = float(filesize) / 2**10 mb = float(filesize) / 2**20 (fileSize, label) = (mb, "MB") if mb > 1 else (kb, "kB") systemString = self.systemString.decode(self.fileEncoding) spssVersion = ".".join(map(str, self.spssVersion)) lang, cp = locale.getlocale() intEnc = "Utf-8/Unicode" if self.ioUtf8 else "Codepage (%s)" % cp varlist = [] line = " %%0%sd. %%s (%%s - %%s)" % len(str(len(self.varNames) + 1)) for cnt, varName in enumerate(self.varNames): lbl = "string" if self.varTypes[varName] > 0 else "numerical" format_ = self.formats[varName].decode(self.fileEncoding) varName = varName.decode(self.fileEncoding) varlist.append(line % (cnt + 1, varName, format_, lbl)) info = {"savFileName": self.savFileName, "fileSize": fileSize, "label": label, "nCases": self.nCases, "nCols": len(self.varNames), "nValues": self.nCases * len(self.varNames), "spssVersion": "%s (%s)" % (systemString, spssVersion), "ioLocale": self.ioLocale.decode(self.fileEncoding), "ioUtf8": intEnc, "fileEncoding": self.fileEncoding, "fileCodePage": self.fileCodePage, "isCompatible": "Yes" if self.isCompatibleEncoding() else "No", "local_language": lang, "local_encoding": cp, "varlist": os.linesep.join(varlist), "sep": os.linesep, "asterisks": 70 * "*"} report = ("%(asterisks)s%(sep)s" + "*File '%(savFileName)s' (%(fileSize)3.2f %(label)s) has " + "%(nCols)s columns (variables) and %(nCases)s rows " + "(%(nValues)s values)%(sep)s" + "*The file was created with SPSS version: %(spssVersion)s%" + "(sep)s" + "*The interface locale is: '%(ioLocale)s'%(sep)s" + "*The interface mode is: %(ioUtf8)s%(sep)s" + "*The file encoding is: '%(fileEncoding)s' (Code page: " + "%(fileCodePage)s)%(sep)s" + "*File encoding and the interface encoding are compatible:" + " %(isCompatible)s%(sep)s" + "*Your computer's locale is: '%(local_language)s' (Code " + "page: %(local_encoding)s)%(sep)s" + "*The file contains the following variables:%(sep)s" + "%(varlist)s%(sep)s%(asterisks)s%(sep)s") % info if hasattr(report, "decode"): report = report.decode(self.fileEncoding) return report
[docs] def getHeader(self, selectVars): """This function returns the variable names, or a selection thereof (as specified as a list using the selectVars parameter), as a list.""" if selectVars is None: header = self.varNames elif isinstance(selectVars, (list, tuple)): diff = set(selectVars).difference(set(self.varNames)) if diff: msg = "Variable names misspecified (%r)" % ", ".join(diff) raise NameError(msg) varPos = [self.varNames.index(v) for v in self.varNames if v in selectVars] self.selector = operator.itemgetter(*varPos) header = self.selector(self.varNames) header = [header] if not isinstance(header, tuple) else list(header) else: msg = ("Variable names list misspecified. Must be 'None' or a " "list or tuple of existing variables") raise TypeError(msg) return header