from __future__ import print_function

import copy
from functools import wraps

import numpy as np
from pycuda import gpuarray
from skcuda import cublas, misc

import hessianfree as hf

def debug_wrapper(cpu_func, debug=False):
    """Decorator used to specify an equivalent CPU function that can be used
    to verify the output of a GPU function (for debugging)."""

    def debug_func_parametrized(gpu_func):
        @wraps(gpu_func)
        def debug_func(*args, **kwargs):
            if debug:
                # copy all the GPUArray arguments back to the host so the
                # CPU reference function can operate on them
                cpu_args = list(args)
                for i, a in enumerate(cpu_args):
                    if isinstance(a, gpuarray.GPUArray):
                        cpu_args[i] = a.get()
                cpu_kwargs = copy.copy(kwargs)
                for k in cpu_kwargs:
                    if isinstance(cpu_kwargs[k], gpuarray.GPUArray):
                        cpu_kwargs[k] = cpu_kwargs[k].get()
                # the CPU reference functions don't take a stream argument
                cpu_kwargs.pop("stream", None)
                out_cpu = cpu_func(*cpu_args, **cpu_kwargs)

            out_gpu = gpu_func(*args, **kwargs)

            if debug:
                try:
                    tmp = out_gpu.get()
                    assert np.allclose(tmp.ravel(), out_cpu.ravel(),
                                       rtol=1e-3)
                except AssertionError:
                    print(np.max(np.abs(out_cpu - tmp)))
                    print("gpu")
                    print(tmp)
                    print("cpu")
                    print(out_cpu)
                    print("gpu/cpu")
                    print(tmp / out_cpu)
                    raise

            return out_gpu

        return debug_func

    return debug_func_parametrized
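

# Example usage of `debug_wrapper` (a sketch; the decorated function here is
# hypothetical). With debug=True, every call runs both the GPU function and
# the CPU reference and raises an AssertionError if the outputs differ
# (rtol=1e-3); with the default debug=False the wrapper just calls the GPU
# function. The commented-out decorators above the GPU functions below show
# the intended pairings with the CPU reference functions.
#
#     @debug_wrapper(cpu_dot, debug=True)
#     def my_gpu_dot(a, b, out=None, transpose_a=False, transpose_b=False,
#                    increment=False, stream=None):
#         ...
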
def cpu_dot(a, b, out=0, transpose_a=False, transpose_b=False,
            increment=False):
    # CPU reference implementation for `cublas_dot`/`shared_dot`
    result = np.dot(a.T if transpose_a else a, b.T if transpose_b else b)
    return result + out * increment


def cpu_sum_cols(a, out=0, increment=False):
    # CPU reference implementation for `sum_cols`
    result = np.sum(a, axis=0)
    return result + out * increment


def cpu_J_dot(J, v, out=0, transpose_J=False, increment=False):
    # CPU reference implementation for `J_dot`
    if J.ndim == 2:
        # a 2D J holds the diagonals of the Jacobians, so this reduces to an
        # element-wise product
        result = J * v
    else:
        # batched matrix-vector product, one Jacobian per batch item
        if transpose_J:
            J = np.transpose(J, (0, 2, 1))
        result = np.einsum("ijk,ik->ij", J, v)
    return result + out * increment


def cpu_multiply(a, b, out=0, increment=False):
    # CPU reference implementation for `multiply`
    return a * b + out * increment
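

# Note on the `out`/`increment` convention shared with the GPU functions
# below: with increment=True the result is added to the existing contents of
# `out`, otherwise `out` just receives the result (in the CPU references
# above, `out * increment` evaluates to zero when increment is False). A
# quick CPU illustration:
#
#     a, b = np.random.randn(4, 3), np.random.randn(3, 5)
#     out = np.ones((4, 5))
#     np.allclose(cpu_dot(a, b, out=out, increment=True),
#                 np.dot(a, b) + out)  # -> True
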
# @debug_wrapper(cpu_dot, debug=True)
def cublas_dot(a, b, out=None, transpose_a=False, transpose_b=False,
               increment=False, stream=None):
    """Matrix multiplication using CUBLAS."""

    assert not increment or out is not None

    dtype = a.dtype
    if transpose_a:
        a1, a0 = a.shape
    else:
        a0, a1 = a.shape
    if transpose_b:
        b1, b0 = b.shape
    else:
        b0, b1 = b.shape
    assert a1 == b0

    if out is None:
        out = gpuarray.zeros((a0, b1), dtype=dtype)

    assert a.dtype == b.dtype == out.dtype

    # note: we swap the order of a and b and swap the transposes because
    # CUBLAS assumes column-major ordering; a row-major buffer read in
    # column-major order is its transpose, so computing b.T * a.T
    # column-major produces (a * b).T, whose buffer read back row-major is
    # exactly a * b
    transa = "t" if transpose_a else "n"
    transb = "t" if transpose_b else "n"
    beta = dtype.type(1.0) if increment else dtype.type(0.0)
    lda = a0 if transpose_a else a1
    ldb = b0 if transpose_b else b1
    ldout = b1

    if stream is not None:
        # note: this assumes that the stream is set to the default stream to
        # start
        cublas.cublasSetStream(misc._global_cublas_handle, stream.handle)

    if dtype == np.float32:
        gemm = cublas.cublasSgemm
    else:
        gemm = cublas.cublasDgemm

    gemm(misc._global_cublas_handle, transb, transa, b1, a0, a1,
         dtype.type(1.0), b.gpudata, ldb, a.gpudata, lda, beta, out.gpudata,
         ldout)

    if stream is not None:
        # restore the default stream
        cublas.cublasSetStream(misc._global_cublas_handle, 0)

    return out
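

# Example usage (a sketch; assumes a CUDA context and the scikit-cuda CUBLAS
# handle have already been initialized, e.g. via `import pycuda.autoinit`
# and `misc.init()`):
#
#     a = gpuarray.to_gpu(np.random.randn(4, 3).astype(np.float32))
#     b = gpuarray.to_gpu(np.random.randn(5, 3).astype(np.float32))
#     c = cublas_dot(a, b, transpose_b=True)
#     np.allclose(c.get(), np.dot(a.get(), b.get().T), rtol=1e-4)  # -> True
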
# @debug_wrapper(cpu_J_dot, True)
def J_dot(J, v, out=None, transpose_J=False, increment=False, stream=None):
    """Equivalent to :meth:`~.FFNet.J_dot`, on the GPU."""

    if J.ndim == 2:
        # a 2D J holds the diagonals of the Jacobians, so this is just an
        # element-wise product
        return multiply(J, v, out=out, increment=increment, stream=stream)

    dtype = J.dtype

    if transpose_J:
        a1, a0 = J.shape[1:]
    else:
        a0, a1 = J.shape[1:]

    if out is None:
        out = gpuarray.zeros((J.shape[0], a1), dtype=dtype)
    elif out is v:
        # note: we allow out to be v in this function because it can be
        # handled efficiently in the element-wise case
        v = v.copy()

    assert J.dtype == v.dtype == out.dtype

    # mv kernel approach
    block = (32, 32, 1)
    grid = (a0 // 32 + (a0 % 32 != 0), J.shape[0])
    hf.gpu.mv_batched_kernel[dtype ==
                             np.float32][transpose_J].prepared_async_call(
        grid, block, stream, J.gpudata, v.gpudata, out.gpudata,
        np.int32(a0), np.int32(a1), np.int32(increment),
        shared_size=1088 * dtype.itemsize)

    return out
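

# Example usage (a sketch; same initialization assumptions as above). For a
# batch of square Jacobians `J` with shape (batch, n, n) and vectors `v`
# with shape (batch, n), this computes the batched matrix-vector product,
# matching `cpu_J_dot`, i.e. np.einsum("ijk,ik->ij", J, v):
#
#     J = gpuarray.to_gpu(np.random.randn(8, 4, 4).astype(np.float32))
#     v = gpuarray.to_gpu(np.random.randn(8, 4).astype(np.float32))
#     Jv = J_dot(J, v)  # shape (8, 4)
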
# @debug_wrapper(cpu_sum_cols, debug=True)
def sum_cols(a, out=None, increment=False, stream=None):
    """Sum `a` along columns."""

    dtype = a.dtype
    if out is None:
        out = gpuarray.empty(int(a.shape[1]), dtype=dtype)

    assert a.dtype == out.dtype

    # launch 32 threads per column (block_y), with as many columns per block
    # as fit within pycuda's default block size for this array (a._block[0])
    block_x = min(a._block[0] // 32, a.shape[1])
    block_y = 32
    grid = (a.shape[1] // block_x + (a.shape[1] % block_x != 0), 1)
    hf.gpu.sum_cols_kernel[dtype == np.float32].prepared_async_call(
        grid, (block_x, block_y, 1), stream,
        a.gpudata, out.gpudata, np.int32(increment), np.int32(a.shape[0]),
        np.int32(a.shape[1]),
        shared_size=block_x * block_y * dtype.itemsize)

    return out
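

# Example usage (a sketch; same initialization assumptions as above):
#
#     a = gpuarray.to_gpu(np.random.randn(6, 3).astype(np.float32))
#     s = sum_cols(a)  # shape (3,); compare np.sum(a.get(), axis=0)
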
# @debug_wrapper(lambda a, b: a + b, True)
def iadd(a, b, stream=None):
    """In-place addition of `a` and `b`, broadcasting `b` along columns."""

    dtype = a.dtype
    assert a.dtype == b.dtype

    block_x = min(32, a.shape[0])
    block_y = min(a._block[0] // block_x, a.shape[1])
    grid = (a.shape[1] // block_x + (a.shape[1] % block_x != 0),
            a.shape[0] // block_y + (a.shape[0] % block_y != 0))
    hf.gpu.iadd_kernel[dtype == np.float32].prepared_async_call(
        grid, (block_x, block_y, 1), stream,
        a.gpudata, b.gpudata, np.int32(a.shape[0]), np.int32(a.shape[1]))

    return a
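

# Example usage (a sketch; same initialization assumptions as above). Per
# the CPU reference in the commented-out debug wrapper (lambda a, b: a + b),
# this matches numpy's broadcasting of a row vector `b` across the rows of
# `a`:
#
#     a = gpuarray.to_gpu(np.random.randn(6, 3).astype(np.float32))
#     b = gpuarray.to_gpu(np.random.randn(1, 3).astype(np.float32))
#     iadd(a, b)  # a is modified in place
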
# @debug_wrapper(cpu_multiply, True)
def multiply(a, b, out=None, increment=False, stream=None):
    """Element-wise product of `a` and `b`."""

    dtype = a.dtype
    if out is None:
        out = gpuarray.zeros(a.shape, dtype=dtype)

    assert a.size == b.size
    assert a.dtype == b.dtype == out.dtype

    # flat 1D launch over all the elements
    block = (min(a._block[0], a.size), 1, 1)
    grid = (a.size // block[0] + (a.size % block[0] != 0), 1, 1)
    hf.gpu.multiply_kernel[dtype == np.float32].prepared_async_call(
        grid, block, stream,
        a.gpudata, b.gpudata, out.gpudata, np.int32(a.size),
        np.int32(increment))

    return out
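

# Example usage (a sketch; same initialization assumptions as above):
#
#     a = gpuarray.to_gpu(np.random.randn(4, 4).astype(np.float32))
#     b = gpuarray.to_gpu(np.random.randn(4, 4).astype(np.float32))
#     c = multiply(a, b)                     # c = a * b
#     multiply(a, b, out=c, increment=True)  # c += a * b
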
# @debug_wrapper(cpu_dot, debug=True)
def shared_dot(a, b, out=None, transpose_a=False, transpose_b=False,
               increment=False, stream=None):
    """Matrix multiplication that doesn't rely on CUBLAS (could be swapped
    in if scikit-cuda were not available for some reason)."""

    dtype = a.dtype

    # note: these transposes don't actually rearrange anything in memory;
    # they just change the shape (the kernel variant selected below handles
    # the transposed indexing)
    if transpose_a:
        a = a.T
    if transpose_b:
        b = b.T

    assert a.shape[1] == b.shape[0]
    assert out is None or (out is not a and out is not b)

    if out is None:
        out = gpuarray.zeros((a.shape[0], b.shape[1]), dtype=dtype)

    assert a.dtype == b.dtype == out.dtype

    # note: the block is transposed from what you might think, so it's
    # (b_shape[1], a_shape[0]), because we want the x threads aligned with
    # rows to support memory coalescing
    block_x = block_y = 32
    grid = (b.shape[1] // block_x + (b.shape[1] % block_x != 0),
            a.shape[0] // block_y + (a.shape[0] % block_y != 0))
    hf.gpu.m_dot_kernel[dtype == np.float32
                        ][transpose_a][transpose_b].prepared_async_call(
        grid, (block_x, block_y, 1), stream,
        a.gpudata, b.gpudata, out.gpudata, np.int32(a.shape[0]),
        np.int32(a.shape[1]), np.int32(b.shape[1]), np.int32(increment),
        shared_size=(block_x + 1) * block_y * 2 * dtype.itemsize)

    return out
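

# Example usage (a sketch; assumes a CUDA context has been initialized, e.g.
# via `import pycuda.autoinit`; no CUBLAS handle is needed here). Since
# `shared_dot` has the same signature as `cublas_dot`, it can be used as a
# drop-in replacement:
#
#     a = gpuarray.to_gpu(np.random.randn(4, 3).astype(np.float32))
#     b = gpuarray.to_gpu(np.random.randn(3, 5).astype(np.float32))
#     c = shared_dot(a, b)  # compare np.dot(a.get(), b.get())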