Source code for savReaderWriter.savWriter

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from ctypes import *
import os
import time
import locale
try:                                   # Python 3.3+
    from collections.abc import Iterable
except ImportError:                    # Python 2
    from collections import Iterable

try:
    pandasOK = True
    import pandas as pd
except ImportError:
    pandasOK = False
try:
    numpyOK = True
    import numpy as np
except ImportError:
    numpyOK = False

from savReaderWriter import *
from py3k import *
from header import *

if cWriterowOK and not isPy3k:
    cWriterow = cWriterow.cWriterow

class SavWriter(Header):
    """ Write SPSS system files (.sav, .zsav)

    Below, the associated SPSS commands are given in `CAPS`.

    Parameters
    ----------
    savFileName : str
        The file name of the spss data file.

        * File names that end with '.sav' are compressed using the 'old'
          compression scheme
        * File names that end with '_uncompressed.sav' are, well, not
          compressed. This is useful when you intend to read the files with
          the faster :py:class:`savReaderWriter.SavReaderNp` class
        * File names that end with '.zsav' are compressed using the ZLIB
          (ZSAV) compression scheme (requires v21 SPSS I/O files)
    varNames : list
        list of strings of the variable names in the order in which they
        should appear in the spss data file. See also under
        :py:meth:`savReaderWriter.Header.varNamesTypes`.
    varTypes : dict
        varTypes dictionary `{varName: varType}`

        * varType == 0 --> numeric
        * varType > 0 --> character of that length (in bytes!)

        See also under :py:meth:`savReaderWriter.Header.varNamesTypes`.
    valueLabels : dict, optional
        value label dictionary ``{varName: {value: label}}``. Cf. `VALUE
        LABELS`. See also under
        :py:meth:`savReaderWriter.Header.valueLabels`.
    varLabels : dict, optional
        variable label dictionary ``{varName: varLabel}``. Cf. `VARIABLE
        LABELS`. See also under :py:meth:`savReaderWriter.Header.varLabels`.
    formats : dict, optional
        format dictionary ``{varName: printFmt}``. Cf. `FORMATS`. See also
        under :py:meth:`savReaderWriter.Header.formats`, under
        :ref:`formats` and under :ref:`dateformats`.
    missingValues : dict, optional
        missing values dictionary ``{varName: {missing value spec}}``.
        Cf. `MISSING VALUES`. See also under
        :py:meth:`savReaderWriter.Header.missingValues`
    measureLevels : dict, optional
        measurement level dictionary ``{varName: <level>}``. Valid levels
        are: "unknown", "nominal", "ordinal", "scale", "ratio", "flag",
        "typeless". Cf. `VARIABLE LEVEL`. See also under
        :py:meth:`savReaderWriter.Header.measureLevels`.

        .. warning:: `measureLevels`, `columnWidths` and `alignments` must
           all three be set, if used
    columnWidths : dict, optional
        column display width dictionary ``{varName: <int>}``. Cf. `VARIABLE
        WIDTH` (default: None --> >= 10 [stringVars] or automatic
        [numVars]). See also under
        :py:meth:`savReaderWriter.Header.columnWidths`.
    alignments : dict, optional
        variable alignment dictionary ``{varName: <left/center/right>}``.
        Cf. `VARIABLE ALIGNMENT` (default: None --> left). See also under
        :py:meth:`savReaderWriter.Header.alignments`.
    varSets : dict, optional
        sets dictionary ``{setName: list_of_valid_varNames}``. Cf. `SETSMR`
        command. See also under :py:meth:`savReaderWriter.Header.varSets`
    varRoles : dict, optional
        variable roles dictionary ``{varName: varRole}``, where varRole may
        be any of the following: 'both', 'frequency', 'input', 'none',
        'partition', 'record ID', 'split', 'target'. Cf. `VARIABLE ROLE`.
        See also under :py:meth:`savReaderWriter.Header.varRoles`.
    varAttributes : dict, optional
        variable attributes dictionary ``{varName: {attribName:
        attribValue}}``. Cf. `VARIABLE ATTRIBUTES`. See also under
        :py:meth:`savReaderWriter.Header.varAttributes`.
    fileAttributes : dict, optional
        file attributes dictionary ``{attribName: attribValue}``. Cf. `FILE
        ATTRIBUTES`. See also under
        :py:meth:`savReaderWriter.Header.fileAttributes`.
    fileLabel : str, optional
        file label string, which defaults to "File created by user
        <username> at <datetime>" if fileLabel is None. Cf. `FILE LABEL`.
        See also under :py:meth:`savReaderWriter.Header.fileLabel`.
    multRespDefs : dict, optional
        multiple response sets definitions (dichotomy groups or category
        groups) dictionary ``{setName: <set definition>}``. In SPSS syntax,
        'setName' has a dollar prefix ('$someSet'). Cf. `MRSETS`. See also
        under :py:meth:`savReaderWriter.Header.multRespDefs`.
    caseWeightVar : str, optional
        valid varName that is set as case weight (cf. `WEIGHT BY`). See
        also under :py:meth:`savReaderWriter.Header.caseWeightVar`.
    overwrite : bool, optional
        indicates whether an existing SPSS file should be overwritten
    ioUtf8 : bool, optional
        indicates the mode in which text is communicated to or from the
        I/O Module. This refers to unicode mode (`SET UNICODE=ON`) and
        codepage mode in SPSS (`SET UNICODE=OFF`). See also under
        :py:meth:`savReaderWriter.Generic.ioUtf8` and under ``ioUtf8`` in
        :py:class:`savReaderWriter.SavReader`.

        * `ioUtf8=False`. Use the current ioLocale setting to determine the
          encoding for writing data.
        * `ioUtf8=True`. Use Unicode encoding (UTF-8) for writing data.

        Note: Data files saved in Unicode encoding cannot be read by
        versions of IBM SPSS Statistics prior to 16. Unicode mode is the
        default since IBM SPSS Statistics version 21. When opening code
        page IBM SPSS Statistics data files in Unicode mode or saving data
        files as Unicode in codepage mode, defined string widths are
        automatically *tripled*.

        .. seealso::

            `<http://www-01.ibm.com/support/knowledgecenter/SSLVMB_21.0.0/com.ibm.spss.statistics.help/faq_unicode.htm>`_
    ioLocale : str, optional
        indicates the locale of the I/O module, cf. `SET LOCALE` (default:
        ``None``, which is the same as
        ``locale.setlocale(locale.LC_CTYPE)``). See also under
        :py:meth:`savReaderWriter.Generic.ioLocale`
    mode : str, optional
        indicates the mode in which ``savFileName`` should be opened.
        Possible values are:

        * "wb" --> write
        * "ab" --> append
        * "cp" --> copy: initialize header using ``refSavFileName`` as a
          reference file, cf. `APPLY DICTIONARY`.
    refSavFileName : str, optional
        reference file that should be used to initialize the header (aka
        the SPSS data dictionary) containing variable label, value label,
        missing value, etc. definitions. Only relevant in conjunction with
        ``mode="cp"``.

    See also
    --------
    savReaderWriter.Header : for details about how to define individual
        metadata items

    Examples
    --------
    Typical use::

        records = [[b'Test1', 1, 1], [b'Test2', 2, 1]]
        varNames = [b'var1', b'v2', b'v3']
        varTypes = {b'var1': 5, b'v2': 0, b'v3': 0}
        savFileName = 'someFile.sav'
        with SavWriter(savFileName, varNames, varTypes) as writer:
            for record in records:
                writer.writerow(record)
    """

    def __init__(self, savFileName, varNames, varTypes, valueLabels=None,
                 varLabels=None, formats=None, missingValues=None,
                 measureLevels=None, columnWidths=None, alignments=None,
                 varSets=None, varRoles=None, varAttributes=None,
                 fileAttributes=None, fileLabel=None, multRespDefs=None,
                 caseWeightVar=None, overwrite=True, ioUtf8=False,
                 ioLocale=None, mode=b"wb", refSavFileName=None):
        """ Constructor. Initializes all vars that can be recycled """
        super(Header, self).__init__(savFileName, ioUtf8, ioLocale)
        self.savFileName = savFileName
        self.varNames = self.encode(varNames)
        self.varTypes = self.encode(varTypes)
        self.overwrite = overwrite
        self.mode = mode
        self.refSavFileName = refSavFileName

        self.fh = super(Header, self).openSavFile(self.savFileName,
                                                  self.mode,
                                                  self.refSavFileName)
        self.myStruct = self.getStruct(self.varTypes, self.varNames,
                                       self.mode)
        self.pack_into = self.myStruct.pack_into

        self.sysmis_ = self.sysmis
        self.ioUtf8_ = ioUtf8
        self.pad_8_lookup = self._getPaddingLookupTable(self.varTypes)
        self.pad_string = self._pyWriterow_pad_string(isPy3k)
        self.bytify = bytify(self.fileEncoding)  # from py3k module
        self.encoding = self.fileEncoding

        if self.mode == b"wb":
            self._openWrite(self.savFileName, self.overwrite)
            self.varNamesTypes = self.varNames, self.varTypes
            self.valueLabels = valueLabels
            self.varLabels = varLabels
            self.formats = formats
            self.missingValues = missingValues
            self.measureLevels = measureLevels
            self.columnWidths = columnWidths
            self.alignments = alignments
            self.varSets = varSets
            self.varRoles = varRoles
            self.varAttributes = varAttributes
            self.fileAttributes = fileAttributes
            self.fileLabel = fileLabel
            self.multRespDefs = multRespDefs
            self.caseWeightVar = caseWeightVar
            #self.dateVariables = dateVariables
            triplet = [measureLevels, columnWidths, alignments]
            if all([item is None for item in triplet]):
                self._setColWidth10()
            self.textInfo = self.savFileName
        if self.mode in (b"wb", b"cp"):
            self._commitHeader()
        self.caseBuffer = self.getCaseBuffer()

    def __enter__(self):
        """This function returns the writer object itself so the writerow
        and writerows methods become available for use with 'with'
        statements"""
        return self

    def __exit__(self, type, value, tb):
        """ This function closes the spss data file.

        .. warning:: Always ensure the .sav file is properly closed, either
           by using a context manager (``with`` statement) or by using
           ``close()``"""
        if type is not None:
            pass  # Exception occurred
        self.close()

    def close(self):
        """ This function closes the spss data file."""
        self.closeSavFile(self.fh, self.mode)
        try:
            locale.resetlocale()  # fails on Windows
        except:
            locale.setlocale(locale.LC_ALL, "")

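    # Usage sketch without a context manager (hypothetical names; the
    # try/finally mirrors what the ``with`` statement does automatically):
    #
    #     writer = SavWriter(savFileName, varNames, varTypes)
    #     try:
    #         writer.writerows(records)
    #     finally:
    #         writer.close()
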
    def _openWrite(self, savFileName, overwrite):
        """ This function opens a file in preparation for creating a new
        IBM SPSS Statistics data file"""
        if os.path.exists(savFileName) and not os.access(savFileName, os.W_OK):
            raise IOError("No write access for file %r" % savFileName)

        b = isinstance(savFileName, bytes)
        u = isinstance(savFileName, unicode)
        fn_endswith = savFileName.lower().endswith
        if overwrite or not os.path.exists(savFileName):
            if b and fn_endswith(b".zsav") or u and fn_endswith(u".zsav"):
                self.fileCompression = b"zlib"  # only with v21 libraries!
            elif (b and fn_endswith(b"_uncompressed.sav") or
                  u and fn_endswith(u"_uncompressed.sav")):
                self.fileCompression = b"uncompressed"
            else:
                self.fileCompression = b"standard"
        elif not overwrite and os.path.exists(savFileName):
            raise IOError("File %r already exists!" % savFileName)

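    # Illustration of the branches above: the compression scheme that
    # _openWrite selects is driven purely by the file name suffix, e.g.
    #
    #     'data.sav'              --> b"standard"      (old-style compression)
    #     'data_uncompressed.sav' --> b"uncompressed"
    #     'data.zsav'             --> b"zlib"          (needs v21 I/O files)
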
    def convertDate(self, day, month, year):
        """This function converts a Gregorian date expressed as
        day-month-year to the internal SPSS date format. The time portion
        of the date variable is set to 0:00. To set the time portion of the
        date variable to another value, use convertTime."""
        func = self.spssio.spssConvertDate
        func.argtypes = [c_int, c_int, c_int, POINTER(c_double)]
        spssDate = c_double()
        retcode = func(day, month, year, spssDate)
        if retcode:
            msg = ("Problem converting date value '%s-%s-%s'" %
                   (day, month, year))
            checkErrsWarns(msg, retcode)
        return spssDate.value

    def convertTime(self, day, hour, minute, second):
        """This function converts a time given as day, hours, minutes, and
        seconds to the internal SPSS time format."""
        func = self.spssio.spssConvertTime
        func.argtypes = [c_int, c_int, c_int, c_double, POINTER(c_double)]
        spssTime = c_double()
        retcode = func(day, hour, minute, float(second), spssTime)
        if retcode:
            msg = "Problem converting time value '%s %s:%s:%s'"
            checkErrsWarns(msg % (day, hour, minute, second), retcode)
        return spssTime.value

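    # Sketch of how convertDate and convertTime combine into a single SPSS
    # date/time value (hypothetical values; 'writer' is an open SavWriter):
    #
    #     noon_31_dec_2012 = (writer.convertDate(31, 12, 2012) +
    #                         writer.convertTime(0, 12, 0, 0))
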
    def spssDateTime(self, datetimeStr=b"2001-12-08", strptimeFmt="%Y-%m-%d"):
        """ This function converts a date/time string into an SPSS date,
        using a strptime format. See also :ref:`dateformats`"""
        try:
            datetimeStr = datetimeStr.decode("utf-8")
            dt = time.strptime(datetimeStr, strptimeFmt)
        except (ValueError, TypeError, AttributeError):
            return self.sysmis
        day, month, year = dt.tm_mday, dt.tm_mon, dt.tm_year
        hour, minute, second = dt.tm_hour, dt.tm_min, dt.tm_sec
        return (self.convertDate(day, month, year) +
                self.convertTime(0, hour, minute, second))

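    # Minimal usage sketch (hypothetical values; 'writer' is an open
    # SavWriter). The return value is a float in the internal SPSS date
    # representation, or sysmis when the string cannot be parsed:
    #
    #     spss_date = writer.spssDateTime(b"2010-10-25", "%Y-%m-%d")
    #     writer.writerow([spss_date])
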
    def _commitHeader(self):
        """This function writes the data dictionary to the data file
        associated with file handle 'fh'. Before any case data can be
        written, the dictionary must be committed; once the dictionary has
        been committed, no further changes can be made to it."""
        retcode = self.spssio.spssCommitHeader(c_int(self.fh))
        if retcode:
            checkErrsWarns("Problem committing header", retcode)

    def _getPaddingLookupTable(self, varTypes):
        """Helper function that returns a lookup table that maps string
        lengths to the nearest ceiled multiple of 8, for example
        {1: '%-8s', 7: '%-8s', 9: '%-16s', 24: '%-24s'} in Python 2
        (in Python 3 the values are the padded widths themselves).
        Purpose: get rid of trailing null bytes"""
        strLengths = varTypes.values()
        if isPy3k:
            return dict([(i, (-8 * (i // -8))) for i in strLengths])
        return dict([(i, "%%-%ds" % (-8 * (i // -8))) for i in strLengths])

    def _pyWriterow_pad_string(self, isPy3k):
        """Helper that returns a function to pad string values using
        _getPaddingLookupTable. Padding is done differently for Python 2
        and 3 (probably the latter is slower)"""
        if isPy3k:
            def _padStringValue(value, varType):
                # % replacement is not possible with bytes
                return value.ljust(self.pad_8_lookup[varType])
        else:
            def _padStringValue(value, varType):
                # Get rid of trailing null bytes --> 7 x faster than 'ljust'
                return self.pad_8_lookup[varType] % value
        return _padStringValue

    def _pyWriterow(self, record):
        """ This function writes one record, which is a Python list;
        compare this Python version with the Cython version cWriterow."""
        float_ = float
        encoding = self.encoding
        pad_string = self.pad_string
        for i, value in enumerate(record):
            varName = self.varNames[i]
            varType = self.varTypes[varName]
            if varType == 0:
                try:
                    value = float_(value)
                except (ValueError, TypeError):
                    value = self.sysmis_
            else:
                value = pad_string(value, varType)
                if self.ioUtf8_ and isinstance(value, unicode):
                    value = value.encode("utf-8")
            record[i] = value
        self.record = record

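    # Illustration of the padding helpers above: with Python 3 and
    # varTypes {b'v1': 5, b'v2': 9}, pad_8_lookup becomes {5: 8, 9: 16},
    # so a 5-byte value is left-justified to 8 bytes before it is packed
    # into the case buffer.
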
    def writerow(self, record):
        """This function writes one record, which is a Python list."""
        if cWriterowOK:
            cWriterow(self, record)
            return
        self._pyWriterow(record)

    def writerows(self, records):
        """This function writes all records.

        Parameters
        ----------
        records : list, tuple, numpy.ndarray, pandas.DataFrame, or similar
            the records to be written to the .sav file

        Raises
        ------
        TypeError : if the records instance is not of a suitable type
        ValueError : if bool(records) == False, or if the array/DataFrame
            is empty
        """
        def is_empty(records):
            if hasattr(records, "empty"):   # pandas
                return records.empty
            elif hasattr(records, "size"):  # numpy
                return not records.size
            else:
                return not records

        if is_empty(records):
            raise ValueError("No data")
        elif numpyOK and isinstance(records, np.ndarray):  # issue #25
            records = np.where(np.isnan(records), self.sysmis, records)
            for i in range(len(records)):
                self.writerow(records[i].tolist())
        elif pandasOK and isinstance(records, pd.DataFrame):
            records[records.isnull()] = self.sysmis
            for record in records.itertuples(index=False):
                self.writerow(list(record))
        elif isinstance(records, Iterable) and hasattr(records[0], "__iter__"):
            for record in records:           # (named)tuple
                self.writerow(list(record))  # need item assignment
        else:
            try:
                for record in records:
                    self.writerow(record)
            except:
                types = (tuple, list)
                if numpyOK:
                    types += (np.ndarray,)
                if pandasOK:
                    types += (pd.DataFrame,)
                if not isinstance(records, types):
                    msg = ('records instance type must be one of list, '
                           'tuple, numpy.ndarray, pandas.DataFrame '
                           'but got %s')
                    raise TypeError(msg % (type(records),))
                raise
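
if __name__ == "__main__":
    # Minimal, self-contained usage sketch (assumption: the current working
    # directory is writable; 'example.sav' is a hypothetical file name).
    varNames = [b'name', b'age']
    varTypes = {b'name': 10, b'age': 0}
    records = [[b'Alice', 30], [b'Bob', 40]]
    with SavWriter('example.sav', varNames, varTypes) as writer:
        writer.writerows(records)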