Source code for savReaderWriter.generic

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from ctypes import *
import struct
import sys
import platform
import os
import re
import math
import locale
import encodings
import collections

from savReaderWriter import *
from py3k import *

[docs]class Generic(object):
    """
    Class for methods and data used in reading as well as writing
    IBM SPSS Statistics data files
    """

    def __init__(self, savFileName, ioUtf8=False, ioLocale=None):
        """Constructor. Note that interface locale and encoding can only
        be set once"""
        locale.setlocale(locale.LC_ALL, "" if ioLocale is None else ioLocale)
        self.savFileName = savFileName
        self.spssio = self.loadLibrary()

        self.wholeCaseIn = self.spssio.spssWholeCaseIn
        self.wholeCaseOut = self.spssio.spssWholeCaseOut

        self.ioLocale = ioLocale
        self.ioUtf8 = bool(ioUtf8)  # bool() --> needed for UNICODE_BMODE

    def _encodeFileName(self, fn):
        """Helper function to encode unicode file names into bytestring file
        names encoded in the file system's encoding. Needed for C functions
        that have a c_char_p filename argument.

        See also
        --------
        Effbot `http://effbot.org/pyref/sys.getfilesystemencoding.htm`_
        Python docs `http://docs.python.org/2/howto/unicode.html_ (under 'unicode filenames)"""
        #import pdb; pdb.set_trace()
        if not isinstance(fn, unicode):
            return fn
        elif sys.platform.startswith("win"):  # pragma: no cover
            return self.wideCharToMultiByte(fn)
        else:
            encoding = sys.getfilesystemencoding()
            encoding = "utf-8" if not encoding else encoding  # actually, ascii
        try:
            return fn.encode(encoding)
        except UnicodeEncodeError:
            msg = ("File system encoding %r can not be used to "
                   "encode file name %r [%s]")
            raise ValueError(msg % (encoding, fn, sys.exc_info()[1]))

    def _loadLibs(self, folder):
        """Helper function that loads I/O libraries in the correct order"""
        # Get a list of all the files in the spssio dir for a given OS
        # Sort the list in the order in which the libs need to be loaded
        # Using regex patterns ought to be more resilient to updates of the
        # I/O modules, compared to hardcoding the names
        debug = False
        if getattr(sys, 'frozen', False):  # pragma: no cover
            # The application is frozen by cx_freeze
            path = os.path.dirname(sys.executable)
            path = os.path.join(path, "savReaderWriter", "spssio", folder)
        else:
            path = os.path.join(os.path.dirname(__file__), "spssio", folder)
        libs = sorted(os.listdir(path))

        pats = ['(lib)?icuda?t', '(lib)?icuuc', '(lib)?icui',
                '(lib)?zlib', '(lib)?spssd?io']
        libs = [lib for pat in pats for lib in libs if re.match(pat, lib)]
        isLib = r"""\w+(\.s[ol](?:\.\d+)*| # linux/hp/solaris
                    \.\d+\.a|              # aix
                    \.dll|                 # windows
                    (\.\d+)*\.dylib)$      # mac""" # filter out non-libs
        libs = [lib for lib in libs if re.match(isLib, lib, re.I | re.X)]
        load = WinDLL if sys.platform.lower().startswith("win") else CDLL
        if libs and debug:  # pragma: no cover
            print(os.path.basename(path).upper().center(79, "-"))
            print("\n".join(libs))
        # PermissionError: are the DLLs on a network share (e.g NAS)?
        return [load(os.path.join(path, lib)) for lib in libs][-1]
             
    def loadLibrary(self):  # pragma: no cover
        """This function loads and returns the SPSSIO libraries,
        depending on the platform."""

        arch = platform.architecture()[0]
        is_32bit, is_64bit = arch == "32bit", arch == "64bit"
        pf = sys.platform.lower()

        # windows
        if pf.startswith("win") and is_32bit:
            spssio = self._loadLibs("win32")
        elif pf.startswith("win"):
            spssio = self._loadLibs("win64")

        # linux
        elif pf.startswith("lin") and is_32bit:
            spssio = self._loadLibs("lin32")
        elif pf.startswith("lin") and is_64bit and os.uname()[-1] == "s390x":
            # zLinux64: Thanks Anderson P. from System z Linux LinkedIn Group!
            spssio = self._loadLibs("zlinux")
        elif pf.startswith("lin") and is_64bit:
            spssio = self._loadLibs("lin64")

        # other
        elif pf.startswith("darwin") or pf.startswith("mac"):
            # Mac: Thanks Rich Sadowsky!
            spssio = self._loadLibs("macos")
        elif pf.startswith("aix") and is_64bit:
            spssio = self._loadLibs("aix64")
        elif pf.startswith("hp-ux"):
            spssio = self._loadLibs("hpux_it")
        elif pf.startswith("sunos") and is_64bit:
            spssio = self._loadLibs("sol64")
        else:
            msg = "Your platform (%r, %s) is not supported" % (pf, arch)
            raise EnvironmentError(msg)

        return spssio

    def wideCharToMultiByte(self, fn):  # pragma: no cover
        """Maps a wide character string to a new character filename string. 
        The new character string is not necessarily from a multibyte character set. 

        See also
        --------
        MSDN `http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130`_
        """

        from ctypes import wintypes

        _CP_UTF8 = 65001
        _CP_ACP = 0  # ANSI
        _LPBOOL = POINTER(c_long)

        _wideCharToMultiByte = windll.kernel32.WideCharToMultiByte
        _wideCharToMultiByte.restype = c_int
        _wideCharToMultiByte.argtypes = [wintypes.UINT, wintypes.DWORD,
                                         wintypes.LPCWSTR, c_int,
                                         wintypes.LPSTR, c_int,
                                         wintypes.LPCSTR, _LPBOOL]
        codePage = _CP_ACP
        dwFlags = 0
        lpWideCharStr = fn
        cchWideChar = len(fn)
        lpMultiByteStr = None
        cbMultiByte = 0  # zero requests size
        lpDefaultChar = None
        lpUsedDefaultChar = None

        # get size
        mbcssize = _wideCharToMultiByte(
        codePage, dwFlags, lpWideCharStr, cchWideChar, lpMultiByteStr,
        cbMultiByte, lpDefaultChar, lpUsedDefaultChar)
        if mbcssize <= 0:
            raise WinError(mbcssize)
        lpMultiByteStr = create_string_buffer(mbcssize)

        # convert
        retcode = _wideCharToMultiByte(
        codePage, dwFlags, lpWideCharStr, cchWideChar, lpMultiByteStr,
        mbcssize, lpDefaultChar, lpUsedDefaultChar)
        if retcode <= 0:
            raise WinError(retcode)
        return lpMultiByteStr.value

    def openSavFile(self, savFileName, mode=b"rb", refSavFileName=None):
        """This function opens IBM SPSS Statistics data files in mode <mode>
        and returns a handle that  should be used for subsequent operations on
        the file. If <savFileName> is opened in mode "cp", meta data
        information aka the spss dictionary is copied from <refSavFileName>"""
        # determine which spssOpen* function should be used
        mode = mode.decode("utf-8") if hasattr(mode, "decode") else mode
        spssOpen = {"rb": self.spssio.spssOpenRead,
                    "wb": self.spssio.spssOpenWrite,
                    "cp": self.spssio.spssOpenWriteCopy,
                    "ab": self.spssio.spssOpenAppend}.get(mode)
        if not spssOpen:
            raise ValueError("Invalid mode argument: %r" % mode)

        # get a file descriptor/handle for the file
        expanduser, abspath = os.path.expanduser, os.path.abspath
        expandfn = lambda fn: self._encodeFileName(expanduser(abspath(fn)))
        savFileName = expandfn(savFileName)
        with open(savFileName, mode) as f:
            fd = f.fileno()

        # open the .sav file
        savFileName = c_char_py3k(savFileName)
        fh = c_int(fd) 
        if mode == b"cp":
            if not refSavFileName:
                raise ValueError("You must specify a reference (=donor) file")
            refSavFileName = c_char_py3k(expandfn(refSavFileName))
            spssOpen.argtypes = [c_char_p, c_char_p, POINTER(c_int)]
            retcode = spssOpen(savFileName, refSavFileName, byref(fh))
        else:
            spssOpen.argtypes = [c_char_p, POINTER(c_int)]
            retcode = spssOpen(savFileName, byref(fh))

        msg = "Problem opening file %r in mode %r" % (savFileName.value, mode)
        checkErrsWarns(msg, retcode)
        return fh.value

    def closeSavFile(self, fh, mode=b"rb"):
        """This function closes the .sav file associated with <fh> that was
        open in mode <mode>."""
        mode = mode.encode("utf-8") if hasattr(mode, "encode") else mode
        spssClose = {b"rb": self.spssio.spssCloseRead,
                     b"wb": self.spssio.spssCloseWrite,
                     b"cp": self.spssio.spssCloseWrite,
                     b"ab": self.spssio.spssCloseAppend}.get(mode)
        spssClose.argtypes = [c_int]
        retcode = spssClose(fh) if spssClose else 9999
        msg = "Problem closing file in mode %r" % mode
        checkErrsWarns(msg, retcode)

    @property
    def releaseInfo(self):
        """This function reports release- and machine-specific information
        about the open file."""
        relInfo = ["release number", "release subnumber", "fixpack number",
                   "machine code", "floating-point representation code",
                   "compression scheme code", "big/little-endian code",
                   "character representation code"]
        relInfoArr = (c_int * len(relInfo))()
        func = self.spssio.spssGetReleaseInfo
        func.argtypes = [c_int, (c_int * len(relInfo))]
        retcode = func(self.fh, relInfoArr)
        checkErrsWarns("Problem getting ReleaseInfo", retcode)
        info = dict([(item, relInfoArr[i]) for i, item in enumerate(relInfo)])
        return info

    @property
    def byteorder(self):
        """This function returns the byte order of the open file as a string.
        It returns either 'little' or 'big'.""" 
        endianness = self.releaseInfo["big/little-endian code"]
        return "big" if endianness else "little"

    @property
    def spssVersion(self):
        """Return the SPSS version that was used to create the opened file
        as a three-tuple indicating major, minor, and fixpack version asunde
        ints. NB: in the transition from SPSS to IBM, a new four-digit
        versioning nomenclature is used. This function returns the old
        three-digit nomenclature. Therefore, no patch version information
        is available."""
        info = self.releaseInfo
        major = info["release number"]
        minor = info["release subnumber"]
        fixpack = info["fixpack number"]
        ver_info = (major, minor, fixpack)
        fields = "major minor fixpack"
        return collections.namedtuple("SpssVersion", fields)(*ver_info)

    @property
    def spssioVersion(self):
        """This function returns the version of the IBM SPSS I/O libraries
        as a named tuple with the fields major, minor, patch, fixpack.
        May also be inspected by passing an empty savFileName, as in:
        savReaderWriter.Generic("").spssioVersion"""
        if not hasattr(self, "spssio"):
            self.spssio = self.loadLibrary()
        version_pattern = re.compile(br"\d+\.\d+\.\d+\.\d+")
        for line in open(self.spssio._name, "rb"):
            m = version_pattern.search(line)
            if m:
                ver_info = map(int, m.group(0).split(b"."))
                fields = "major minor patch fixpack"
                version = collections.namedtuple("SpssioVersion", fields)
                return version(*ver_info)

    @property
    def fileCompression(self):
        """Get/Set the file compression.
        Returns/Takes a compression switch which may be any of the following:
        'uncompressed', 'standard', or 'zlib'. Zlib comression requires SPSS
        v21 I/O files."""
        compression = {0: b"uncompressed", 1: b"standard", 2: b"zlib"}
        compSwitch = c_int()
        func = self.spssio.spssGetCompression
        func.argtypes = [c_int, POINTER(c_int)]
        retcode = func(self.fh, byref(compSwitch))
        checkErrsWarns("Problem getting file compression", retcode)
        return compression.get(compSwitch.value)

    @fileCompression.setter
    def fileCompression(self, compSwitch):
        compression = {b"uncompressed": 0, b"standard": 1, b"zlib": 2}
        compSwitch = compression.get(compSwitch)
        func = self.spssio.spssSetCompression
        func.argtypes = [c_int, c_int]
        retcode = func(self.fh, compSwitch)
        invalidSwitch = retcodes.get(retcode) == 'SPSS_INVALID_COMPSW'
        if invalidSwitch and self.spssVersion[0] < 21:
            msg = "Writing zcompressed files requires >=v21 SPSS I/O libraries"
            raise ValueError(msg)
        checkErrsWarns("Problem setting file compression", retcode)

    @property
    def systemString(self):
        """This function returns the name of the system under which the file
        was created aa a string."""
        sysName = create_string_buffer(42)
        func = self.spssio.spssGetSystemString
        retcode = func(c_int(self.fh), byref(sysName))
        checkErrsWarns("Problem getting SystemString", retcode)
        return sysName.value

    def getStruct(self, varTypes, varNames, mode=b"rb"):
        """This function returns a compiled struct object. The required
        struct format string for the conversion between C and Python
        is created on the basis of varType and byte order.
        --varTypes: SPSS data files have either 8-byte doubles/floats or n-byte
          chars[]/ strings, where n is always 8 bytes or a multiple thereof.
        --byte order: files are written in the byte order of the host system
          (mode="wb") and read/appended using the byte order information
          contained in the SPSS data file (mode is "ab" or "rb" or "cp")"""
        if mode in (b"ab", b"rb", b"cp"):   # derive endianness from file
            endianness = "<" if self.byteorder == "little" else ">"
        elif mode == b"wb":                 # derive endianness from host
            if sys.byteorder == "little":
                endianness = "<"
            elif sys.byteorder == "big":  # pragma: no cover
                endianness = ">"
            else:                         # pragma: no cover
                endianness = "@"
        structFmt = [endianness]
        ceil = math.ceil
        for varName in varNames:
            varType = varTypes[varName]
            if varType == 0:
                structFmt.append("d")
            else:
                fmt = str(int(ceil(int(varType) / 8.0) * 8))
                structFmt.append(fmt + "s")
        return struct.Struct("".join(structFmt))

    def getCaseBuffer(self):
        """This function returns a buffer and a pointer to that buffer. A whole
        case will be read into this buffer."""
        caseSize = c_long()
        func = self.spssio.spssGetCaseSize
        func.argtypes = [c_int, POINTER(c_long)]
        retcode = func(self.fh, byref(caseSize))
        caseBuffer = create_string_buffer(caseSize.value)
        checkErrsWarns("Problem getting case buffer", retcode)
        return caseBuffer

    @property
    def sysmis(self):
        """This function returns the IBM SPSS Statistics system-missing
        value (`$SYSMIS`) for the host system (also called 'NA' in other
        systems)."""
        if hasattr(self, "_sysmis"):
            return self._sysmis
        try:
            self._sysmis = -1 * sys.float_info[0]  # Python 2.6 and higher.
        except AttributeError:
            self.spssio.spssSysmisVal.restype = c_float
            self._sysmis = self.spssio.spssSysmisVal()
        return self._sysmis

    @property
    def missingValuesLowHigh(self):
        """This function returns the 'lowest' and 'highest' values used for
        numeric missing value ranges on the host system. This can be used in
        a similar way as the LO and HI keywords in missing values
        specifications (cf. `MISSING VALUES foo (LO THRU 0)`. It may be called
        at any time."""
        Range = collections.namedtuple("MissingValueRange", "lo hi")
        try:
            lowest, highest = c_double(), c_double()
            func = self.spssio.spssLowHighVal
            func.argtypes = [POINTER(c_double), POINTER(c_double)] 
            retcode = func(byref(lowest), byref(highest))
            checkErrsWarns("Problem getting min/max missing values", retcode)
            return Range(lowest.value, highest.value)
        except SPSSIOError:  # Windows, maybe more
            return Range(-sys.float_info.max, sys.float_info.max)

    @property
    def ioLocale(self):
        """This function gets/sets the I/O Module's locale.
        This corresponds with the SPSS command `SET LOCALE`. The I/O Module's
        locale is separate from that of the client application. The
        <localeName> parameter and the return value are identical to those
        for the C run-time function setlocale. The exact locale name
        specification depends on the OS of the host sytem, but has the
        following form::

            <lang>_<territory>.<codeset>[@<modifiers>]

        The 'codeset' and 'modifier' components are optional and in Windows,
        aliases (e.g. 'english') may be used. When the I/O Module is first
        loaded, its locale is set to the system default. 
        
        See also
        --------
        linux : `<https://wiki.archlinux.org/index.php/Locale>`_ 
        windows : `<http://msdn.microsoft.com/en-us/library/39cwe7zf(v=vs.80).aspx>`_"""
        if hasattr(self, "setLocale"):
            return self.setLocale
        else:
            localeName = locale.setlocale(locale.LC_CTYPE)
            msg = "NOTE. Locale not set; getting current locale: %s"
            print(msg % localeName)
            return localeName

    @ioLocale.setter
    def ioLocale(self, localeName=""):
        if not localeName:
            localeName = locale.setlocale(locale.LC_CTYPE)  # see also issue #26
        func = self.spssio.spssSetLocale
        func.argtypes = [c_int, c_char_p]
        func.restype = c_char_p
        self.setLocale = func(locale.LC_ALL, c_char_py3k(localeName))
        if self.setLocale is None:
            raise ValueError("Invalid ioLocale: %r" % localeName)

    @property
    def fileCodePage(self):
        """This function provides the Windows code page number of the encoding
        applicable to a file."""
        nCodePage = c_int()
        func = self.spssio.spssGetFileCodePage
        func.argtypes = [c_int, POINTER(c_int)] 
        retcode = func(self.fh, byref(nCodePage))
        checkErrsWarns("Problem getting file codepage", retcode)
        return nCodePage.value

[docs]    def isCompatibleEncoding(self):
        """This function determines whether the file and interface encoding
        are compatible."""
        try:
            # Windows, note typo 'Endoding'!
            func = self.spssio.spssIsCompatibleEndoding
        except AttributeError:
            func = self.spssio.spssIsCompatibleEncoding
        func.argtypes = [c_int, POINTER(c_int)] 
        func.restype = c_bool
        isCompatible = c_int()
        retcode = func(self.fh, byref(isCompatible))
        msg = "Error testing encoding compatibility: %r" % isCompatible.value
        checkErrsWarns(msg, retcode)
        if not isCompatible.value and not self.ioUtf8:
            msg = ("NOTE. SPSS Statistics data file %r is written in a "
                   "character encoding (%s) incompatible with the current "
                   "ioLocale setting (%s). It may not be readable. Consider "
                   "changing ioLocale or setting ioUtf8=True.")
            print(msg % (self.savFileName, self.fileEncoding, self.ioLocale))
        return bool(isCompatible.value)

    @property
    def ioUtf8(self):
        """This function returns/sets the current interface encoding

        * ``ioUtf8 = False`` --> `CODEPAGE` mode,
        * ``ioUtf8 = True`` --> UTF-8 mode, aka. `UNICODE` mode
        This corresponds with the SPSS command `SHOW UNICODE` (getter)
        and `SET UNICODE=ON/OFF` (setter).

        See also
        --------
        SPSS-unicode-mode : `<http://www-01.ibm.com/support/knowledgecenter/SSLVMB_21.0.0/com.ibm.spss.statistics.help/faq_unicode.htm>`_
        """
        if hasattr(self, "ioUtf8_"):
            return self.ioUtf8_
        self.ioUtf8_ = self.spssio.spssGetInterfaceEncoding()
        return bool(self.ioUtf8_)

    @ioUtf8.setter
    def ioUtf8(self, ioUtf8):
        try:
            func = self.spssio.spssSetInterfaceEncoding
            func.argtypes = [c_int]
            retcode = func(int(ioUtf8))
            label = retcodes.get(retcode)
            # files_open: ignore error with nested context managers
            if retcode > 0 and label != "SPSS_FILES_OPEN":
                msg = "Error setting IO interface [%s]"
                raise SPSSIOError(msg % label)
        except TypeError:
            raise Exception("Invalid interface encoding: %r (must be bool)")
        if retcode < 0:
            checkErrsWarns("Problem setting ioUtf8", retcode)

    @property
    def fileEncoding(self):
        """This function obtains the encoding applicable to a file.
        The encoding is returned as an IANA encoding name, such as
        ISO-8859-1, which is then converted to the corresponding Python
        codec name. If the file contains no file encoding, the locale's
        preferred encoding is returned"""
        try:
            pszEncoding = create_string_buffer(20)  # is 20 enough??
            func = self.spssio.spssGetFileEncoding
            retcode = func(c_int(self.fh), byref(pszEncoding))
            checkErrsWarns("Problem getting file encoding", retcode)
            iana_codes = encodings.aliases.aliases
            rawEncoding = pszEncoding.value.lower().decode("utf-8")
            if rawEncoding.replace("-", "") in iana_codes:
                iana_code = rawEncoding.replace("-", "")
            else:
                iana_code = rawEncoding.replace("-", "_")
            fileEncoding = iana_codes[iana_code]
            return fileEncoding
        except KeyError:
            print ("NOTE. IANA coding lookup error. Code %r " % iana_code +
                   "does not map to any Python codec.")
            return locale.getpreferredencoding()

    @property
    def record(self):
        """Get/Set a whole record from/to a pre-allocated buffer"""
        args = c_int(self.fh), byref(self.caseBuffer)
        retcode = self.wholeCaseIn(*args)
        if retcode:
            checkErrsWarns("Problem reading row", retcode)
        record = list(self.unpack_from(self.caseBuffer))
        return record

    @record.setter
    def record(self, record):
        try:
            self.pack_into(self.caseBuffer, 0, *record)
        except struct.error:
            msg = sys.exc_info()[1]
            if any([isinstance(value, unicode) for value in record]):
                msg +=  ". Use ioUtf8=True to write unicode strings"
            raise TypeError(msg)
        self.wholeCaseOut.argtypes = [c_int, c_char_p]
        retcode = self.wholeCaseOut(self.fh, c_char_py3k(self.caseBuffer.raw))
        if retcode:
            checkErrsWarns("Problem writing row\n" + record, retcode)