#!/usr/bin/env python
# -*- coding: utf-8 -*-
from ctypes import *
import os
import operator
import locale
import datetime
import collections
import functools
from savReaderWriter import *
from header import *
from helpers import *
@rich_comparison
@implements_to_string
[docs]class SavReader(Header):
""" Read SPSS system files (.sav, .zsav)
Parameters
----------
savFileName : str
the file name of the spss data file
returnHeader : bool, default False
indicates whether the first record should be a list of variable names
recodeSysmisTo: (value), default None
indicates to which value SPSS missing values (`$sysmis`) should be
recoded. Any value below 10 ** -10 is returned as None
verbose : bool, default False
indicates whether information about the spss data file (e.g., number
of cases, variable names, file size) should be printed on the screen.
selectVars : list or None, default None
indicates which variables in the file should be selected.
The variables should be specified as a list of valid variable names.
If ``None`` is specified, all the variables in the file are used
idVar : str or None, default None
indicates which variable in the file should be used as the id
variable for the 'get' method
rawMode : bool, default False
indicates whether values should get SPSS-style formatting, and whether
date variables (if present) should be converted into ISO-dates. If set
to ``True`` the program does not format any values, which increases
processing speed. In particular ``rawMode=True`` implies that:
* SPSS datetimes will not be converted into ISO8601 dates
* SPSS `N` formats will not be converted into strings with leading zeroes
* SPSS `$sysmis` values will not be converted into ``None`` values
* String values will be ceiled multiples of 8 bytes
See also :ref:`formats` and :ref:`dateformats`
ioUtf8 : bool, int, default False
indicates the mode in which text communicated to or from the I/O
Module will be.
* `codepage mode`: ``ioUtf8=CODEPAGE_MODE``, or ``ioUtf8=0``, or
``ioUtf8=False``. Use the current ioLocale setting to determine the
encoding for reading and writing data. Cf. `SET UNICODE=OFF`.
* `standard unicode mode`: ``ioUtf8=UNICODE_UMODE``, or ``ioUtf8=1``,
or ``ioUtf8=True``. Use Unicode encoding (UTF-8) for reading and
writing data. Data are returned as ``unicode`` strings.
Cf. `SET UNICODE=ON`.
* `bytes unicode mode`: ``ioUtf8=UNICODE_BMODE``, or ``ioUtf8=2``.
Like standard unicode mode, but data are returned as ``byte``
strings.
See also under :py:meth:`savReaderWriter.Generic.ioUtf8` and under
``ioUtf8`` in :py:class:`savReaderWriter.SavWriter`.
.. versionchanged:: 3.4
``ioUtf8=UNICODE_BMODE`` was added.
ioLocale : str or None, default None
indicates the locale of the I/O module. Cf. `SET LOCALE`. The default
(``None``) corresponds to ``locale.setlocale(locale.LC_CTYPE)``,
for example ``en_US.UTF-8`` (Unix) or ``english`` (Windows).
See also under :py:meth:`savReaderWriter.Generic.ioLocale`.
Examples
--------
Typical use:
.. code-block:: python
with SavReader('somefile.sav', returnHeader=True) as reader:
header = reader.next()
for line in reader:
process(line)
"""
def __init__(self, savFileName, returnHeader=False, recodeSysmisTo=None,
             verbose=False, selectVars=None, idVar=None, rawMode=False,
             ioUtf8=False, ioLocale=None):
    """Set up a reader for ``savFileName`` and cache the reusable state.

    The parent initializer is called with mode ``b"rb"``; afterwards the
    constructor arguments are stored on the instance and a number of
    per-file values (header, formats, counts, struct, case buffer) are
    looked up once so they can be reused while reading.
    """
    super(SavReader, self).__init__(savFileName, b"rb", None,
                                    ioUtf8, ioLocale)

    # --- constructor arguments, stored as given
    self.savFileName = savFileName
    self.returnHeader = returnHeader
    self.recodeSysmisTo = recodeSysmisTo
    self.verbose = verbose
    self.selectVars = selectVars
    self.idVar = idVar
    self.rawMode = rawMode

    # --- values derived from the file's dictionary
    self.header = self.getHeader(self.selectVars)
    self.bareformats, self.varWids = self._splitformats()
    # NOTE(review): _isAutoRawMode() reads self.ioUtf8_, which this
    # constructor only assigns on the next line -- presumably the parent
    # initializer already set it; confirm before reordering.
    self.autoRawMode = self._isAutoRawMode()
    self.ioUtf8_ = ioUtf8

    # --- cached lookups used while iterating over cases
    self.sysmis_ = self.sysmis
    self.numVars = self.numberofVariables
    self.nCases = self.numberofCases
    self.myStruct = self.getStruct(self.varTypes, self.varNames)
    self.unpack_from = self.myStruct.unpack_from
    self.seekNextCase = self.spssio.spssSeekNextCase
    self.caseBuffer = self.getCaseBuffer()

    # -1 means that next() has not been called yet
    self.current_case_number = -1
def __enter__(self):
    """Enter the runtime context (``with`` statement).

    When ``verbose`` is set, the file report is printed first, with
    ``os.linesep`` replaced by a plain newline character.

    Returns
    -------
    SavReader
        the reader itself
    """
    if self.verbose:
        if self.ioUtf8_:
            # The original called self.replace(...) on the reader itself;
            # no replace() method is defined on this class, so take the
            # text of the report instead.
            report = self.__unicode__()
        else:
            report = str(self)
        print(report.replace(os.linesep, "\n"))
    return self
def __exit__(self, type, value, tb):
    """Leave the runtime context by closing the reader.

    The exception details handed in by the ``with`` statement are not
    inspected and no truthy value is returned, so an exception raised
    inside the ``with`` block propagates once ``close()`` has run.

    .. warning::

       Always ensure that the .sav file is properly closed, either by
       using a context manager (``with`` statement) or by calling
       ``close()``."""
    self.close()
def close(self):
    """Close the spss data file and restore the process locale.

    The file handle is closed through ``closeSavFile`` (skipped when the
    module-level ``segfaults`` flag is set), the reference to the I/O
    library is dropped, and the locale is reset.
    """
    if not segfaults:
        self.closeSavFile(self.fh, mode=b"rb")
    del self.spssio
    try:
        locale.resetlocale()  # fails on Windows
    except Exception:
        # Best-effort fallback. Exception (rather than a bare except)
        # still covers locale.Error and a missing resetlocale(), but no
        # longer swallows KeyboardInterrupt or SystemExit.
        locale.setlocale(locale.LC_ALL, "")
def __len__(self):
    """Return the number of cases (rows) in the spss data file.

    For example::

        len(SavReader(savFileName))"""
    return self.nCases
# Python 3: see @rich_comparison class decorator
def __cmp__(self, other):
    """Three-way comparison on the number of cases.

    ``other`` may be another SavReader instance (its length is used) or
    an integer (used as is).

    Returns
    -------
    int
        -1, 0 or 1 when ``len(self)`` is smaller than, equal to or larger
        than the other length

    Raises
    ------
    TypeError
        if ``other`` is neither a SavReader nor an int
    """
    if isinstance(other, int):
        otherLength = other
    elif isinstance(other, SavReader):
        otherLength = len(other)
    else:
        raise TypeError
    ownLength = len(self)
    return (ownLength > otherLength) - (ownLength < otherLength)
def __hash__(self):
    """Return ``id(self)`` so that instances are hashable.

    NOTE: ``__cmp__`` compares readers by their number of cases, so two
    readers that compare equal will generally not hash equal."""
    return id(self)
def __str__(self):
    """Return the concise file report, encoded with the file encoding.

    For example::

        data = SavReader(savFileName)
        print(str(data))  # Python 3: bytes(data)
        data.close()"""
    report = self.__unicode__()
    return report.encode(self.fileEncoding)
def __unicode__(self):
    """Return the concise file report of the spss data file as text.

    For example::

        data = SavReader(savFileName)
        print(unicode(data))  # Python 3: str(data)
        data.close()"""
    return self.getFileReport()
def __next__(self):
    """Python 3 spelling of ``next()``: return the next value, or raise
    StopIteration."""
    return self.next()
def next(self):
    """reader.next() -> the next value, or raise StopIteration

    ``current_case_number`` starts at -1 and is incremented on every call.
    With ``returnHeader`` set, ``_items`` increments it once more when it
    yields the header, so record *k* is returned while the counter equals
    ``k + 1``; without a header record *k* is returned while it equals
    ``k - 1``.

    Raises
    ------
    StopIteration
        once the header (if requested) and all ``nCases`` records have
        been returned
    """
    self.current_case_number += 1
    if self.returnHeader:
        lastValid = self.nCases + 1
    else:
        # Previously the limit was nCases here as well, which allowed one
        # extra read past the last record.
        lastValid = self.nCases - 1
    if self.current_case_number > lastValid:
        raise StopIteration
    return next(iter(self))
@memoized_property
def shape(self):
    """Return the dimensions of the data as a ``Shape`` namedtuple with
    the fields ``nrows`` and ``ncols``. For example::

        data = SavReader(savFileName)
        data.shape.nrows == len(data)  # True
        data.close()"""
    Shape = collections.namedtuple("Shape", "nrows ncols")
    return Shape(self.nCases, self.numVars)
def _isAutoRawMode(self):
    """Helper for the formatValues function: report whether iterating
    over each individual value can be skipped.

    Returns ``True`` only when the file has no supported date formats, no
    ``N`` formats, no variables with a truthy type code and ``ioUtf8_``
    is falsy."""
    bareFormats = list(self.bareformats.values())
    containsDates = bool(set(bareFormats) & set(supportedDates))
    containsNfmt = b"N" in bareFormats
    containsStrings = any(self.varTypes.values())
    return not (containsDates or containsNfmt or containsStrings
                or self.ioUtf8_)
# TODO: turn this into a decorator
def _items(self, start=0, stop=None, step=1, returnHeader=False):
    """Generator behind ``__iter__`` and ``__getitem__``.

    Yields the header first when ``returnHeader`` is set and
    ``current_case_number`` is not yet positive, then one formatted
    record for every case number in ``xrange(start, stop, step)``.
    """
    if returnHeader and self.current_case_number <= 0:
        self.current_case_number += 1
        yield self.header

    plainIteration = start == 0 and stop is None and step == 1
    if not plainIteration:
        # rewind to the first case before reading a selection
        status = self.seekNextCase(c_int(self.fh), c_long(0))
        if status:
            checkErrsWarns("Problem seeking first case", status)

    if stop is None:
        stop = self.nCases
    hasSelection = self.selectVars is not None
    singleVar = len(self.selectVars) == 1 if self.selectVars else None

    # an explicit seek is only needed when part of the records is read
    mustSeek = bool(start) or step != 1
    for caseNumber in xrange(start, stop, step):
        if mustSeek:
            status = self.seekNextCase(c_int(self.fh), c_long(caseNumber))
            if status:
                checkErrsWarns("Problem seeking case %d" % caseNumber,
                               status)
        record = self.record
        if hasSelection:
            record = self.selector(record)
            record = [record] if singleVar else list(record)
        yield self.formatValues(record)
def __iter__(self):
    """x.__iter__() <==> iter(x). Yields records as a list; the header
    comes first when the reader was created with ``returnHeader=True``.

    For example::

        with SavReader("someFile.sav") as reader:
            for line in reader:
                process(line)"""
    return self._items(start=0, stop=None, step=1,
                       returnHeader=self.returnHeader)
def __getitem__(self, key):
    """x.__getitem__(y) <==> x[y], where y may be int or slice.

    An integer key returns the record of that case number, a slice
    returns a list of records, and ``Ellipsis`` or a tuple is handed to
    the array-slicing helper. For example::

        data = SavReader("someFile.sav")
        print("The first six records look like this: %s" % data[:6])
        print("The first record looks like this: %s" % data[0])
        print("First column: %s" % data[..., 0])  # requires numpy
        print("Row 4 & 5, first three cols: %s" % data[4:6, :3])
        data.close()

    Raises
    ------
    IndexError
        if an integer key falls outside the available cases
    """
    if isinstance(key, slice):
        start, stop, step = key.indices(self.nCases)
        return list(self._items(start, stop, step))

    if key is Ellipsis or isinstance(key, tuple):
        return self._get_array_slice(key, self.nCases, len(self.header))

    position = operator.index(key)
    if position < 0:
        # count from the end
        position += self.nCases
    if not 0 <= position < self.nCases:
        raise IndexError("Index out of bounds")
    return next(self._items(position, position + 1, 1))
def _cast_array(self, cstart, cstop, cstep, raw_result):
    """Helper for _get_array_slice: turn plain numeric columns back into
    floats.

    A column is converted when its variable type is 0 and its format does
    not match ``time``, ``date`` or ``n<digits>`` (case-insensitive).

    Parameters
    ----------
    cstart, cstop, cstep : int or None
        column slice that was applied to obtain ``raw_result``
    raw_result : list of lists
        the selected records

    Returns
    -------
    list of lists
    """
    varNames = self.varNames[slice(cstart, cstop, cstep)]
    # raw bytes literal: "\d" in a non-raw literal is an invalid escape
    isSpecialFormat = re.compile(br"time|date|n\d+", re.I).search
    numVars = set(v for v in varNames if self.varTypes[v] == 0 and
                  not isSpecialFormat(self.formats[v]))
    return [[float(item) if v in numVars else item
             for v, item in zip(varNames, record)]
            for record in raw_result]
def _get_array_slice(self, key, nRows, nCols):
    """Helper that implements array slicing with numpy.

    Parameters
    ----------
    key : Ellipsis or 2-tuple
        ``reader[...]`` or ``reader[row, col]``, where ``row`` and ``col``
        may each be an int, a slice or ``Ellipsis``
    nRows, nCols : int
        number of rows and columns the key is resolved against

    Returns
    -------
    list
        a list of records, flattened into a single list when the
        selection is one row or one column

    Raises
    ------
    ImportError
        if numpy is not available
    TypeError
        if the combination of row and column index is not supported
    IndexError
        if the first selected row or column is out of bounds
    """
    if not numpyOk:
        raise ImportError("Array slicing requires the numpy library")

    is_index = False
    if key is Ellipsis:
        # reader[...]: everything. This used to build the key
        # (Ellipsis, Ellipsis), which has no .start/.stop to inspect.
        rstart, rstop, rstep = 0, nRows, 1
        cstart, cstop, cstep = 0, nCols, 1
    else:
        row, col = key
        if isinstance(row, int) and row < 0:
            row = nRows + row
        if isinstance(col, int) and col < 0:
            col = nCols + col

        ## ... slices
        if isinstance(row, slice) and col is Ellipsis:
            # reader[1:2, ...]
            rstart, rstop, rstep = row.indices(nRows)
            cstart, cstop, cstep = 0, nCols, 1  # was nRows
        elif row is Ellipsis and isinstance(col, slice):
            # reader[..., 1:2]
            rstart, rstop, rstep = 0, nRows, 1
            cstart, cstop, cstep = col.indices(nCols)
        elif isinstance(row, slice) and isinstance(col, slice):
            # reader[1:2, 1:2]
            rstart, rstop, rstep = row.indices(nRows)
            cstart, cstop, cstep = col.indices(nCols)
        elif row is Ellipsis and col is Ellipsis:
            # reader[..., ...]
            rstart, rstop, rstep = 0, nRows, 1
            cstart, cstop, cstep = 0, nCols, 1

        ## ... indexes
        elif isinstance(row, int) and col is Ellipsis:
            # reader[1, ...]
            rstart, rstop, rstep = row, row + 1, 1
            cstart, cstop, cstep = 0, nCols, 1
            is_index = True
        elif row is Ellipsis and isinstance(col, int):
            # reader[..., 1]
            rstart, rstop, rstep = 0, nRows, 1
            cstart, cstop, cstep = col, col + 1, 1
            is_index = True
        elif isinstance(row, int) and isinstance(col, int):
            # reader[1, 1]
            rstart, rstop, rstep = row, row + 1, 1
            cstart, cstop, cstep = col, col + 1, 1
            is_index = True

        ## ... slice + index
        elif isinstance(row, slice) and isinstance(col, int):
            # reader[1:2, 1]
            rstart, rstop, rstep = row.indices(nRows)
            cstart, cstop, cstep = col, col + 1, 1
        elif isinstance(row, int) and isinstance(col, slice):
            # reader[1, 1:2]
            rstart, rstop, rstep = row, row + 1, 1
            cstart, cstop, cstep = col.indices(nCols)
        else:
            # Raised outside any try block, so it can no longer be
            # mistaken for the reader[...] case.
            msg = "The array index is either invalid, or not implemented"
            raise TypeError(msg)

        # No abs(): an index that is still negative after normalisation
        # lies before the first row/column.
        if not 0 <= rstart < nRows:
            raise IndexError("Index out of bounds")
        if not 0 <= cstart < nCols:
            raise IndexError("Index out of bounds")

    key = (Ellipsis, slice(cstart, cstop, cstep))

    # select the rows, cols respectively
    records = self._items(rstart, rstop, rstep)
    raw_result = numpy.array(list(records))[key].tolist()

    # cast the result, so floats become floats again
    result = self._cast_array(cstart, cstop, cstep, raw_result)

    # flatten list if it's one row or one col ([] keeps reduce from
    # failing on an empty selection)
    if abs(cstart - cstop) == 1 or len(result) == 1:
        return functools.reduce(list.__add__, result, [])
    if is_index:
        return result[0]
    return result
def head(self, n=5):
    """Convenience function that returns the first <n> records; the sign
    of <n> is ignored.

    Example::

        data = SavReader("someFile.sav")
        print("The first five records look like this: %s" % data.head())
        data.close()"""
    count = abs(n)
    return self[:count]
def tail(self, n=5):
    """Convenience function that returns the last <n> records; the sign
    of <n> is ignored.

    Example::

        data = SavReader("someFile.sav")
        print("The last four records look like this: %s" % data.tail(4))
        data.close()"""
    count = abs(n)
    if not count:
        # self[-0:] is the same as self[0:], i.e. *every* record, so ask
        # for an empty slice explicitly
        return self[:0]
    return self[-count:]
def all(self):
    """Convenience function that returns all the records as a list.

    Example::

        data = SavReader("someFile.sav")
        list_of_lists = data.all()
        data.close()"""
    records = []
    for record in iter(self):
        records.append(record)
    return records
def __contains__(self, item):
    """Membership test: ``item in reader`` is the truth value of
    ``reader.get(item)``. It therefore requires the 'idVar' parameter to
    be set.

    Example::

        reader = SavReader(savFileName, idVar="ssn")
        "987654321" in reader  # returns True or False
    """
    found = self.get(item)
    return bool(found)
def get(self, key, default=None, full=False):
    """Return the record(s) for which <idVar> == <key>, else <default>.

    The function mimics dict.get, but note that dict[key] is NOT
    implemented. On the first valid call all records are read once and a
    sorted list of ``(idValue, caseNumber)`` pairs is kept on the
    instance; lookups are binary searches in that list. This is not very
    fast on large data (esp. the first call, and with full=True).

    Parameters
    ----------
    key : str, int, float
        key for which the corresponding record should be returned. A
        number is expected when the id variable has type 0, a string
        otherwise; a key of the other kind yields <default>
    default : (value)
        value that should be returned if <key> is not found
    full : bool
        value that indicates whether *all* records for which
        <idVar> == <key> should be returned

    Raises
    ------
    NameError
        if the reader was not created with an existing variable as idVar

    Examples
    --------
    For example::

        data = SavReader(savFileName, idVar="ssn")
        data.get("987654321", "social security number not found!")
        data.close()"""
    if self.idVar not in self.varNames:
        msg = ("SavReader object must be instantiated with an existing " +
               "variable as an idVar argument")
        raise NameError(msg)

    # two slightly modified functions from the bisect module; they
    # compare against a[mid][0], not a[mid]
    def bisect_right(a, x, lo=0, hi=None):
        if hi is None:
            hi = len(a)
        while lo < hi:
            mid = (lo + hi) // 2
            if x < a[mid][0]:
                hi = mid
            else:
                lo = mid + 1
        return lo

    def bisect_left(a, x, lo=0, hi=None):
        if hi is None:
            hi = len(a)
        while lo < hi:
            mid = (lo + hi) // 2
            if a[mid][0] < x:
                lo = mid + 1
            else:
                hi = mid
        return lo

    # Validate the key on *every* call and before touching any state.
    # Previously a wrong-type key on the first call set isSorted without
    # building recordz, which broke all subsequent calls.
    isNumericId = self.varTypes[self.idVar] == 0
    if isNumericId:
        if not isinstance(key, (int, float)):
            return default
    elif not isinstance(key, basestring):
        return default

    if not hasattr(self, "isSorted"):
        idPos = self.varNames.index(self.idVar)
        if isNumericId:
            pairs = ((record[idPos], i) for i, record in
                     enumerate(iter(self)))
        else:
            pairs = ((record[idPos].rstrip(), i) for i, record in
                     enumerate(iter(self)))
        self.recordz = sorted(pairs)
        self.isSorted = True  # only flagged once the index exists

    insertLPos = bisect_left(self.recordz, key)
    insertRPos = bisect_right(self.recordz, key)
    if insertLPos == insertRPos:
        return default  # key does not occur
    if full:
        result = [self[record[1]] for record in
                  self.recordz[insertLPos: insertRPos]]
    else:
        result = self[self.recordz[insertLPos][1]]
    return result if result else default
def getSavFileInfo(self):
    """Return some basic information of the open spss data file.

    Returns
    -------
    tuple
        ``(numVars, nCases, varNames, varTypes, formats, varLabels,
        valueLabels)``
    """
    info = (
        self.numVars,
        self.nCases,
        self.varNames,
        self.varTypes,
        self.formats,
        self.varLabels,
        self.valueLabels,
    )
    return info
def decode(func):
    """Decorator to decode datestrings for ioUtf8.

    The wrapped function's result is decoded as UTF-8 when the ``ioUtf8``
    attribute of the first positional argument is truthy and not equal to
    2; results without a ``decode`` method are passed through unchanged."""
    @functools.wraps(func)
    def wrapper(*args):
        result = func(*args)
        reader = args[0]
        if reader.ioUtf8 and reader.ioUtf8 != 2:
            try:
                return result.decode("utf-8")
            except AttributeError:
                pass  # not a byte string
        return result
    return wrapper
@memoize
@decode
def spss2strDate(self, spssDateValue, fmt, recodeSysmisTo):
    """Convert an internal SPSS date (number of seconds since midnight,
    Oct 14, 1582, the beginning of the Gregorian calendar) to a
    human-readable format (ISO-8601 where possible).

    Parameters
    ----------
    spssDateValue : int, float
    fmt : strptime format
    recodeSysmisTo : what SPSS $sysmis values will be replaced with

    Returns
    -------
    the formatted date, or ``recodeSysmisTo`` when the value cannot be
    converted (OverflowError, TypeError or ValueError)

    Examples
    --------
    For example::

        data = SavReader(savFileName)
        iso_date = data.spss2strDate(11654150400.0, "%Y-%m-%d", None)
        data.close()

    See also
    --------
    savReaderWriter.SavReaderNp.spss2datetimeDate : returns
        ``datetime.datetime`` object
    strptime-formats-settings
        :download:`__init__.py <../__init__.py>` to change the
        strptime formats from ISO into something else. Note that dates
        before 1900 are *not* affected by format changes in `__init__.py`.
    :ref:`dateformats` : overview of SPSS datetime formats"""
    try:
        secondsPerDay = 86400
        withinFirstDay = spssDateValue < secondsPerDay
        isTimeFmt = fmt.startswith("%H:%M:%S") and withinFirstDay
        isDayTimeFmt = fmt == "%d %H:%M:%S"

        elapsed = datetime.timedelta(seconds=spssDateValue)
        moment = datetime.datetime(1582, 10, 14, 0, 0, 0) + elapsed

        if not isTimeFmt and not isDayTimeFmt:
            # ordinary date(time) format
            if moment.year >= 1900:
                return bytez(datetime.datetime.strftime(moment, fmt))
            # years before 1900 are always rendered in ISO form
            if "%H" in fmt:
                return bytez(moment.isoformat(" "))
            return bytez(moment.isoformat().split("T")[0])

        if isTimeFmt:
            return bytez(str(elapsed).zfill(8))

        # what remains is the "<days> <time>" format
        timePart = bytez(moment.isoformat().split("T")[1])
        dayPart = bytez(str(elapsed.days).zfill(2))
        return dayPart + b" " + timePart
    except (OverflowError, TypeError, ValueError):
        return recodeSysmisTo
def getFileReport(self):
    """Build and return a report about basic file characteristics: file
    size, dimensions, SPSS version, locale and encoding settings, and the
    list of variables with their formats."""
    nBytes = os.path.getsize(self.savFileName)
    inKb = float(nBytes) / 2**10
    inMb = float(nBytes) / 2**20
    if inMb > 1:
        fileSize, label = inMb, "MB"
    else:
        fileSize, label = inKb, "kB"

    systemString = self.systemString.decode(self.fileEncoding)
    spssVersion = ".".join(map(str, self.spssVersion))
    lang, cp = locale.getlocale()
    intEnc = "Utf-8/Unicode" if self.ioUtf8 else "Codepage (%s)" % cp

    # one zero-padded, numbered line per variable
    padding = len(str(len(self.varNames) + 1))
    line = " %%0%sd. %%s (%%s - %%s)" % padding
    varlist = []
    for number, varName in enumerate(self.varNames, 1):
        lbl = "string" if self.varTypes[varName] > 0 else "numerical"
        format_ = self.formats[varName].decode(self.fileEncoding)
        decodedName = varName.decode(self.fileEncoding)
        varlist.append(line % (number, decodedName, format_, lbl))

    info = {"savFileName": self.savFileName,
            "fileSize": fileSize,
            "label": label,
            "nCases": self.nCases,
            "nCols": len(self.varNames),
            "nValues": self.nCases * len(self.varNames),
            "spssVersion": "%s (%s)" % (systemString, spssVersion),
            "ioLocale": self.ioLocale.decode(self.fileEncoding),
            "ioUtf8": intEnc,
            "fileEncoding": self.fileEncoding,
            "fileCodePage": self.fileCodePage,
            "isCompatible": "Yes" if self.isCompatibleEncoding() else "No",
            "local_language": lang,
            "local_encoding": cp,
            "varlist": os.linesep.join(varlist),
            "sep": os.linesep,
            "asterisks": 70 * "*"}
    template = ("%(asterisks)s%(sep)s"
                "*File '%(savFileName)s' (%(fileSize)3.2f %(label)s) has "
                "%(nCols)s columns (variables) and %(nCases)s rows "
                "(%(nValues)s values)%(sep)s"
                "*The file was created with SPSS version: %(spssVersion)s%"
                "(sep)s"
                "*The interface locale is: '%(ioLocale)s'%(sep)s"
                "*The interface mode is: %(ioUtf8)s%(sep)s"
                "*The file encoding is: '%(fileEncoding)s' (Code page: "
                "%(fileCodePage)s)%(sep)s"
                "*File encoding and the interface encoding are compatible:"
                " %(isCompatible)s%(sep)s"
                "*Your computer's locale is: '%(local_language)s' (Code "
                "page: %(local_encoding)s)%(sep)s"
                "*The file contains the following variables:%(sep)s"
                "%(varlist)s%(sep)s%(asterisks)s%(sep)s")
    report = template % info
    if hasattr(report, "decode"):
        # byte string (Python 2): hand back text
        report = report.decode(self.fileEncoding)
    return report