#!/usr/bin/env python
# -*- coding: utf-8 -*-
from ctypes import *
import os
import time
import locale

# The ABC aliases in ``collections`` were removed in Python 3.10; prefer
# ``collections.abc`` and fall back to the old location on Python 2.
try:
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable

# pandas and numpy are optional: the *OK flags record whether each one
# could be imported, so that code using them can be guarded.
try:
    pandasOK = True
    import pandas as pd
except ImportError:
    pandasOK = False
try:
    numpyOK = True
    import numpy as np
except ImportError:
    numpyOK = False

from savReaderWriter import *
from py3k import *
from header import *

# When the compiled row writer is flagged as available and we are not on
# Python 3, rebind the name ``cWriterow`` from the module to the function of
# the same name inside it, so it can be called directly.
if cWriterowOK and not isPy3k:
    cWriterow = cWriterow.cWriterow
[docs]class SavWriter(Header):
""" Write SPSS system files (.sav, .zsav)
Below, the associated SPSS commands are given in `CAPS`.
Parameters
----------
savFileName : str
The file name of the spss data file.
* File names that end with '.sav' are compressed using the 'old'
compression scheme
* File names that end with '_uncompressed.sav' are, well, not
compressed. This is useful when you intend to read the files with
the faster :py:class:`savReaderWriter.SavReaderNp` class
* File names that end with '.zsav' are compressed using the ZLIB
(ZSAV) compression scheme (requires v21 SPSS I/O files)
varNames : list
list of strings of the variable names in the order in which they
should appear in the spss data file. See also under
:py:meth:`savReaderWriter.Header.varNamesTypes`.
varTypes : dict
varTypes dictionary `{varName: varType}`
* varType == 0 --> numeric
* varType > 0 --> character of that length (in bytes!)
See also under :py:meth:`savReaderWriter.Header.varNamesTypes`.
valueLabels : dict, optional
value label dictionary ``{varName: {value: label}}`` Cf.
`VALUE LABELS`. See also under
:py:meth:`savReaderWriter.Header.valueLabels`.
varLabels : dict, optional
variable label dictionary ``{varName: varLabel}``. Cf.
`VARIABLE LABELS`. See also under
:py:meth:`savReaderWriter.Header.varLabels`.
formats : dict, optional
format dictionary ``{varName: printFmt}``. Cf. `FORMATS`.
See also under :py:meth:`savReaderWriter.Header.formats`, under
:ref:`formats` and under :ref:`dateformats`.
missingValues : dict, optional
missing values dictionary ``{varName: {missing value spec}}``.
Cf. `MISSING VALUES`. See also under
:py:meth:`savReaderWriter.Header.missingValues`
measureLevels : dict, optional
measurement level dictionary ``{varName: <level>}``.
Valid levels are: "unknown", "nominal", "ordinal", "scale",
"ratio", "flag", "typeless". Cf. `VARIABLE LEVEL`
See also under :py:meth:`savReaderWriter.Header.measureLevels`.
.. warning::
`measureLevels`, `columnWidths` and `alignments` must all three
be set, if used
columnWidths : dict, optional
column display width dictionary ``{varName: <int>}``.
Cf. `VARIABLE WIDTH`. (default: None --> >= 10 [stringVars] or
automatic [numVars])
See also under :py:meth:`savReaderWriter.Header.columnWidths`.
alignments : dict, optional
variable alignment dictionary ``{varName: <left/center/right>}``.
Cf. `VARIABLE ALIGNMENT` (default: None --> left)
See also under :py:meth:`savReaderWriter.Header.alignments`.
varSets : dict, optional
sets dictionary ``{setName: list_of_valid_varNames}``.
Cf. `SETSMR` command.
See also under :py:meth:`savReaderWriter.Header.varSets`
varRoles : dict, optional
variable roles dictionary ``{varName: varRole}``, where varRole
may be any of the following: 'both', 'frequency', 'input', 'none',
'partition', 'record ID', 'split', 'target'. Cf. `VARIABLE ROLE`
See also under :py:meth:`savReaderWriter.Header.varRoles`.
varAttributes : dict, optional
variable attributes dictionary ``{varName: {attribName:
attribValue}}``. Cf. `VARIABLE ATTRIBUTES`.
See also under :py:meth:`savReaderWriter.Header.varAttributes`.
fileAttributes : dict, optional
file attributes dictionary ``{attribName: attribValue}``.
Cf. FILE ATTRIBUTES. See also under
:py:meth:`savReaderWriter.Header.fileAttributes`.
fileLabel : str, optional
file label string, which defaults to "File created by user
<username> at <datetime>" if file label is None. Cf. `FILE LABEL`
See also under :py:meth:`savReaderWriter.Header.fileLabel`.
multRespDefs : dict, optional
multiple response sets definitions (dichotomy groups or
category groups) dictionary ``{setName: <set definition>}``. In SPSS
syntax, 'setName' has a dollar prefix ('$someSet'). Cf. `MRSETS`.
See also under :py:meth:`savReaderWriter.Header.multRespDefs`.
caseWeightVar : str, optional
valid varName that is set as case weight (cf. `WEIGHT BY`). See also
under :py:meth:`savReaderWriter.Header.caseWeightVar`.
overwrite : bool, optional
indicates whether an existing SPSS file should be overwritten
ioUtf8 : bool, optional
indicates the mode in which text communicated to or from the
I/O Module will be. This refers to unicode mode (`SET UNICODE=ON`)
and codepage mode in SPSS (`SET UNICODE=OFF`).
See also under :py:meth:`savReaderWriter.Generic.ioUtf8` and
under ``ioUtf8`` in :py:class:`savReaderWriter.SavReader`.
* `ioUtf8=False`. Use the current ioLocale setting to determine the
encoding for writing data.
* `ioUtf8=True`. Use Unicode encoding (UTF-8) for writing data.
Note: Data files saved in Unicode encoding cannot be read by versions
of IBM SPSS Statistics prior to 16. Unicode mode is the default since
IBM SPSS Statistics version 21. When opening code page IBM SPSS
Statistics data files in Unicode mode or saving data files as Unicode
in codepage mode, defined string widths are automatically *tripled*.
.. seealso::
`<http://www-01.ibm.com/support/knowledgecenter/SSLVMB_21.0.0/com.ibm.spss.statistics.help/faq_unicode.htm>`_
ioLocale : str, optional
indicates the locale of the I/O module, cf. `SET LOCALE` (default:
``None``, which is the same as ``locale.setlocale(locale.LC_CTYPE)``).
See also under :py:meth:`savReaderWriter.Generic.ioLocale`
mode : str, optional
indicates the mode in which ``savFileName`` should be opened. Possible
values are:
* "wb" --> write
* "ab" --> append
* "cp" --> copy: initialize header using ``refSavFileName`` as a reference
file, cf. `APPLY DICTIONARY`.
refSavFileName : str, optional
reference file that should be used to initialize the header (aka the
SPSS data dictionary) containing variable label, value label, missing
value, etc. etc. definitions. Only relevant in conjunction with
``mode="cp"``.
See also
--------
savReaderWriter.Header : for details about how to define individual
metadata items
Examples
--------
Typical use::
records = [[b'Test1', 1, 1], [b'Test2', 2, 1]]
varNames = [b'var1', b'v2', b'v3']
varTypes = {b'var1': 5, b'v2': 0, b'v3': 0}
savFileName = 'someFile.sav'
with SavWriter(savFileName, varNames, varTypes) as writer:
for record in records:
writer.writerow(record)
"""
def __init__(self, savFileName, varNames, varTypes, valueLabels=None,
             varLabels=None, formats=None, missingValues=None,
             measureLevels=None, columnWidths=None, alignments=None,
             varSets=None, varRoles=None, varAttributes=None,
             fileAttributes=None, fileLabel=None, multRespDefs=None,
             caseWeightVar=None, overwrite=True, ioUtf8=False,
             ioLocale=None, mode=b"wb", refSavFileName=None):
    """Open ``savFileName`` and prepare everything needed to write rows.

    The individual parameters are described in the class docstring.
    In ``b"wb"`` mode the header items are assigned from the arguments;
    in ``b"wb"`` and ``b"cp"`` mode the header is then committed.
    """
    # NOTE: ``super(Header, self)`` starts the lookup *after* ``Header`` in
    # the MRO, so any ``__init__``/``openSavFile`` defined on ``Header``
    # itself is bypassed here.
    super(Header, self).__init__(savFileName, ioUtf8, ioLocale)

    self.savFileName = savFileName
    self.varNames = self.encode(varNames)
    self.varTypes = self.encode(varTypes)
    self.overwrite = overwrite
    self.mode = mode
    self.refSavFileName = refSavFileName
    self.fh = super(Header, self).openSavFile(
        self.savFileName, self.mode, self.refSavFileName)

    # Values that are looked up once here and reused for every row written.
    self.myStruct = self.getStruct(self.varTypes, self.varNames, self.mode)
    self.pack_into = self.myStruct.pack_into
    self.sysmis_ = self.sysmis
    self.ioUtf8_ = ioUtf8
    self.pad_8_lookup = self._getPaddingLookupTable(self.varTypes)
    self.pad_string = self._pyWriterow_pad_string(isPy3k)
    self.bytify = bytify(self.fileEncoding)  # from py3k module
    self.encoding = self.fileEncoding

    if self.mode == b"wb":
        self._openWrite(self.savFileName, self.overwrite)
        self.varNamesTypes = self.varNames, self.varTypes

        # Assigned strictly in this order, after varNamesTypes.
        header_items = (
            ("valueLabels", valueLabels),
            ("varLabels", varLabels),
            ("formats", formats),
            ("missingValues", missingValues),
            ("measureLevels", measureLevels),
            ("columnWidths", columnWidths),
            ("alignments", alignments),
            ("varSets", varSets),
            ("varRoles", varRoles),
            ("varAttributes", varAttributes),
            ("fileAttributes", fileAttributes),
            ("fileLabel", fileLabel),
            ("multRespDefs", multRespDefs),
            ("caseWeightVar", caseWeightVar),
        )
        for attribute, setting in header_items:
            setattr(self, attribute, setting)

        # Only when none of the three display settings was supplied.
        if (measureLevels is None and columnWidths is None
                and alignments is None):
            self._setColWidth10()
        self.textInfo = self.savFileName

    if self.mode in (b"wb", b"cp"):
        self._commitHeader()
    self.caseBuffer = self.getCaseBuffer()
def __enter__(self):
    """Enter the ``with`` block and hand back this writer.

    Returning the object itself is what makes ``writerow`` and
    ``writerows`` usable on the name bound by ``with ... as writer``.
    """
    return self
def __exit__(self, type, value, tb):
    """Leave the ``with`` block by closing the spss data file.

    The file is closed whether or not the block raised. Nothing is
    returned, so an exception raised inside the block is not suppressed.

    .. warning::

       Always ensure that the .sav file is properly closed, either by
       using a context manager (``with`` statement) or by calling
       ``close()``.
    """
    self.close()
def close(self):
    """Close the spss data file and reset the process locale.

    ``closeSavFile`` is called with the file handle and mode. The locale
    is reset afterwards even when closing raises, so a failed close does
    not leave the locale in whatever state it was in while writing.

    Raises
    ------
    Exception
        Whatever ``closeSavFile`` raises is propagated unchanged.
    """
    try:
        self.closeSavFile(self.fh, self.mode)
    finally:
        try:
            # Fails on Windows, and the function no longer exists on
            # recent Python versions (AttributeError).
            locale.resetlocale()
        except Exception:
            locale.setlocale(locale.LC_ALL, "")
def _openWrite(self, savFileName, overwrite):
    """Check that ``savFileName`` may be written and pick its compression.

    Sets ``self.fileCompression`` from the (case-insensitive) file name:
    ``b"zlib"`` for ``.zsav``, ``b"uncompressed"`` for
    ``_uncompressed.sav`` and ``b"standard"`` otherwise.

    Raises
    ------
    IOError
        If the file exists and is not writable, or if it exists and
        ``overwrite`` is false.
    """
    already_there = os.path.exists(savFileName)
    if already_there and not os.access(savFileName, os.W_OK):
        raise IOError("No write access for file %r" % savFileName)

    # Suffixes must be of the same string type as the file name, hence the
    # separate bytes/unicode checks below.
    is_bytes = isinstance(savFileName, bytes)
    is_text = isinstance(savFileName, unicode)
    name_endswith = savFileName.lower().endswith

    if already_there and not overwrite:
        raise IOError("File %r already exists!" % savFileName)

    if ((is_bytes and name_endswith(b".zsav")) or
            (is_text and name_endswith(u".zsav"))):
        compression = b"zlib"  # only with v21 libraries!
    elif ((is_bytes and name_endswith(b"_uncompressed.sav")) or
            (is_text and name_endswith(u"_uncompressed.sav"))):
        compression = b"uncompressed"
    else:
        compression = b"standard"
    self.fileCompression = compression
def convertDate(self, day, month, year):
    """Convert a Gregorian day-month-year date to the internal SPSS format.

    The conversion is done by ``spssConvertDate`` of the I/O library.
    The time portion of the date is set to 0:00; use ``convertTime`` to
    obtain a value for another time of day.

    Returns
    -------
    float
        The value the library stored in its output argument.
    """
    converter = self.spssio.spssConvertDate
    converter.argtypes = [c_int, c_int, c_int, POINTER(c_double)]
    converted = c_double()
    status = converter(day, month, year, converted)
    if status:
        # Non-zero return codes are handed to checkErrsWarns.
        problem = "Problem converting date value '%s-%s-%s'" % (
            day, month, year)
        checkErrsWarns(problem, status)
    return converted.value
def convertTime(self, day, hour, minute, second):
    """Convert days, hours, minutes and seconds to the SPSS time format.

    The conversion is done by ``spssConvertTime`` of the I/O library;
    ``second`` is passed on as a float.

    Returns
    -------
    float
        The value the library stored in its output argument.
    """
    converter = self.spssio.spssConvertTime
    converter.argtypes = [c_int, c_int, c_int, c_double, POINTER(c_double)]
    converted = c_double()
    status = converter(day, hour, minute, float(second), converted)
    if status:
        # Non-zero return codes are handed to checkErrsWarns.
        template = "Problem converting time value '%s %s:%s:%s'"
        checkErrsWarns(template % (day, hour, minute, second), status)
    return converted.value
def spssDateTime(self, datetimeStr=b"2001-12-08", strptimeFmt="%Y-%m-%d"):
    """Convert a date/time string into an SPSS date using a strptime format.

    See also :ref:`dateformats`.

    Parameters
    ----------
    datetimeStr : bytes or str
        The date/time to convert. Bytes are decoded as UTF-8; text is
        used as is.
    strptimeFmt : str or bytes
        ``time.strptime`` format describing ``datetimeStr``.

    Returns
    -------
    float
        ``convertDate(...) + convertTime(...)`` for the parsed value, or
        ``self.sysmis`` when the input cannot be parsed (wrong format,
        undecodable bytes, ``None`` or another non-string value).
    """
    try:
        # Only bytes need decoding. Text has no ``decode`` on Python 3, so
        # decoding unconditionally turned every valid text date into sysmis.
        if isinstance(datetimeStr, bytes):
            datetimeStr = datetimeStr.decode("utf-8")
        if isinstance(strptimeFmt, bytes):
            strptimeFmt = strptimeFmt.decode("utf-8")
        parsed = time.strptime(datetimeStr, strptimeFmt)
    except (ValueError, TypeError, AttributeError):
        return self.sysmis
    datePart = self.convertDate(parsed.tm_mday, parsed.tm_mon, parsed.tm_year)
    # Day count 0: only the time of day is added to the date.
    timePart = self.convertTime(
        0, parsed.tm_hour, parsed.tm_min, parsed.tm_sec)
    return datePart + timePart
def _commitHeader(self):
    """Write the data dictionary to the file associated with ``self.fh``.

    The dictionary must be committed before any case data can be written;
    once it has been committed, no further changes can be made to it.
    """
    status = self.spssio.spssCommitHeader(c_int(self.fh))
    if not status:
        return
    # Non-zero return codes are handed to checkErrsWarns.
    checkErrsWarns("Problem committing header", status)
def _getPaddingLookupTable(self, varTypes):
    """Build a lookup table for padding string values.

    Every value in ``varTypes`` (a length) is mapped to that length rounded
    up to the nearest multiple of 8. When ``isPy3k`` is true the table
    holds the rounded length itself, e.g. ``{1: 8, 9: 16}``; otherwise it
    holds a left-justifying format string, e.g. ``{1: "%-8s", 9: "%-16s"}``.
    Purpose: get rid of trailing null bytes.
    """
    table = {}
    for length in varTypes.values():
        # Floor division by -8, negated again, rounds up to a multiple of 8.
        rounded = -8 * (length // -8)
        if isPy3k:
            table[length] = rounded
        else:
            table[length] = "%%-%ds" % rounded
    return table
def _pyWriterow_pad_string(self, isPy3k):
    """Return the function used to pad one string value.

    The returned function takes ``(value, varType)`` and looks the padding
    up in ``self.pad_8_lookup`` at call time. Padding is done differently
    for Python 2 and 3 (probably the latter is slower).
    """
    if not isPy3k:
        def _padWithFormat(value, varType):
            # Get rid of trailing null bytes --> 7 x faster than 'ljust'
            return self.pad_8_lookup[varType] % value
        return _padWithFormat

    def _padWithLjust(value, varType):
        # % replacement is not possible with bytes
        return value.ljust(self.pad_8_lookup[varType])
    return _padWithLjust
def _pyWriterow(self, record):
    """Prepare one record (a Python list) and assign it to ``self.record``.

    This is the pure-Python counterpart of the Cython ``cWriterow``.
    ``record`` is modified in place: numeric cells (varType 0) become
    floats, or ``self.sysmis_`` when they cannot be converted; string
    cells are padded and, in UTF-8 mode, unicode values are encoded.
    """
    pad = self.pad_string
    for position, cell in enumerate(record):
        width = self.varTypes[self.varNames[position]]
        if width != 0:
            cell = pad(cell, width)
            if self.ioUtf8_ and isinstance(cell, unicode):
                cell = cell.encode("utf-8")
        else:
            try:
                cell = float(cell)
            except (ValueError, TypeError):
                cell = self.sysmis_
        record[position] = cell
    # NOTE(review): presumably assigning to ``record`` is what writes the
    # case (setter not visible here) -- confirm against the base class.
    self.record = record
def writerow(self, record):
    """Write one record, which is a Python list.

    The compiled ``cWriterow`` is used when ``cWriterowOK`` is true;
    otherwise the pure-Python ``_pyWriterow`` is used.
    """
    if not cWriterowOK:
        self._pyWriterow(record)
    else:
        cWriterow(self, record)
def writerows(self, records):
    """Write all records, one ``writerow`` call per record.

    Parameters
    ----------
    records : list, tuple, numpy.ndarray, pandas.DataFrame, or similar
        The records to be written to the .sav file. Any iterable of row
        iterables is accepted, including generators. Missing values
        (NaN/None) in arrays and DataFrames are written as ``self.sysmis``.
        The caller's DataFrame is not modified.

    Raises
    ------
    ValueError
        If ``bool(records) == False``, or if the array/DataFrame is empty.
    TypeError
        If ``records`` is a string, a dict, or not iterable at all.
    """
    def is_empty(candidate):
        if hasattr(candidate, "empty"):  # pandas
            return candidate.empty
        if hasattr(candidate, "size"):  # numpy
            return not candidate.size
        return not candidate

    if is_empty(records):
        raise ValueError("No data")

    if numpyOK and isinstance(records, np.ndarray):  # issue #25
        try:
            records = np.where(np.isnan(records), self.sysmis, records)
        except TypeError:
            # np.isnan is not defined for object/string arrays; such rows
            # are passed on unchanged.
            pass
        for row in records:
            self.writerow(row.tolist())
        return

    if pandasOK and isinstance(records, pd.DataFrame):
        # ``where`` returns a new frame, so the caller's data is untouched.
        filled = records.where(records.notnull(), self.sysmis)
        for row in filled.itertuples(index=False):
            self.writerow(list(row))
        return

    msg = ('records instance type must be one of list, tuple, '
           'numpy.array, pandas.DataFrame but got %s')
    # Strings and dicts are iterable, but never a collection of records.
    if isinstance(records, (dict, bytes, type(u""))):
        raise TypeError(msg % (type(records), ))
    try:
        rows = iter(records)
    except TypeError:
        raise TypeError(msg % (type(records), ))
    for record in rows:  # list, (named)tuple, generator item, ...
        self.writerow(list(record))  # need item assignment