"""Implementation of recurrent network, including Gauss-Newton approximation
for use in Hessian-free optimization.
.. codeauthor:: Daniel Rasmussen <daniel.rasmussen@appliedbrainresearch.com>
Based on
Martens, J., & Sutskever, I. (2011). Learning recurrent neural networks with
Hessian-free optimization. Proceedings of the 28th International Conference on
Machine Learning.
"""
from __future__ import print_function
import numpy as np
import hessianfree as hf
class RNNet(hf.FFNet):
"""Implementation of recurrent deep network (including gradient/curvature
computation).
:param list rec_layers: indices of layers with recurrent connections
(default is to make all except first and last layers recurrent)
:param dict W_rec_params: parameters used to initialize recurrent
weights (passed to :meth:`~.init_weights`)
:param tuple truncation: a tuple `(n,k)` where backpropagation through
time will be executed every `n` timesteps and run backwards for `k`
steps (defaults to full backprop if None)
See :class:`.FFNet` for the remaining parameters."""
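# Example usage (an illustrative sketch only; the shape and hyperparameters
# below are invented, not taken from the original source):
#
#     net = RNNet([1, 10, 1], truncation=(10, 5))
#
# makes every layer except the first and last recurrent, and runs
# backpropagation through time every 10 timesteps, going 5 steps back each
# time.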
def __init__(self, shape, rec_layers=None, W_rec_params=None,
truncation=None, **kwargs):
# define recurrence for each layer (needs to be done before super
# constructor because this is used in compute_offsets)
if rec_layers is None:
# assume all recurrent except first/last layer
rec_layers = np.arange(1, len(shape) - 1)
self.rec_layers = rec_layers
# super constructor
super(RNNet, self).__init__(shape, **kwargs)
self.truncation = truncation
# add on recurrent weights
if kwargs.get("load_weights", None) is None and len(rec_layers) > 0:
if W_rec_params is None:
W_rec_params = dict()
self.W = np.concatenate(
(self.W, self.init_weights([(self.shape[l], self.shape[l])
for l in range(self.n_layers)
if l in rec_layers],
**W_rec_params)))
def forward(self, inputs, params=None, deriv=False, init_activations=None,
init_state=None):
"""Compute layer activations for given input and parameters.
:param inputs: input vectors (passed to first layer)
:type inputs: :class:`~numpy:numpy.ndarray`
:param params: parameter vector (weights) for the network (defaults to
``self.W``)
:type params: :class:`~numpy:numpy.ndarray`
:param bool deriv: if True then also compute the derivative of the
activations
:param list init_activations: initial values for the activations in
each layer
:param list init_state: initial values for the internal state of any
stateful nonlinearities
"""
# input shape = [minibatch_size, seq_len, input_dim]
# activations shape = [n_layers, minibatch_size, seq_len, layer_size]
params = self.W if params is None else params
if isinstance(inputs, hf.nl.Plant):
# reset the plant
# TODO: allow the initial state of plant to be set?
inputs.reset()
batch_size = inputs.shape[0]
sig_len = inputs.shape[1]
activations = [np.zeros((batch_size, sig_len, l), dtype=self.dtype)
for l in self.shape]
# temporary space to minimize memory allocations
tmp_space = [np.zeros((batch_size, l), dtype=self.dtype)
for l in self.shape]
if deriv:
d_activations = [None for _ in self.layers]
for i, l in enumerate(self.layers):
# reset any state in the nonlinearities
l.reset(None if init_state is None else init_state[i])
W_recs = [self.get_weights(params, (i, i))
for i in range(self.n_layers)]
for s in range(sig_len):
for i in range(self.n_layers):
if i == 0:
# get the external input
if isinstance(inputs, hf.nl.Plant):
if s == 0 and init_activations is not None:
ff_input = inputs(init_activations[-1])
else:
# call the plant with the output of the previous
# timestep to generate the next input
# note: this will pass zeros on the first timestep
# if init_activations is not set
ff_input = inputs(activations[-1][:, s - 1])
else:
ff_input = inputs[:, s]
else:
# compute feedforward input
ff_input = np.zeros_like(activations[i][:, s])
for pre in self.back_conns[i]:
W, b = self.get_weights(params, (pre, i))
ff_input += np.dot(activations[pre][:, s], W,
out=tmp_space[i])
ff_input += b
# recurrent input
if i in self.rec_layers:
if s > 0:
rec_input = np.dot(activations[i][:, s - 1],
W_recs[i][0], out=tmp_space[i])
elif init_activations is None:
# apply bias input on first timestep
rec_input = W_recs[i][1]
else:
# use the provided activations to initialize the
# 'previous' timestep
rec_input = np.dot(init_activations[i],
W_recs[i][0], out=tmp_space[i])
else:
rec_input = 0
# apply activation function
activations[i][:, s] = self.layers[i].activation(ff_input +
rec_input)
# compute derivative
if deriv:
d_act = self.layers[i].d_activation(ff_input + rec_input,
activations[i][:, s])
if d_activations[i] is None:
# note: we can't allocate this array ahead of time,
# because we don't know if d_activations will be
# returning diagonal vectors or matrices
d_activations[i] = np.zeros(
np.concatenate(([batch_size], [sig_len],
d_act.shape[1:])),
dtype=self.dtype)
d_activations[i][:, s] = d_act
for i, a in enumerate(activations):
if not np.all(np.isfinite(a)):
raise OverflowError("Non-finite nonlinearity activation "
"value (layer %d) \n %s" %
(i, a[~np.isfinite(a)]))
if deriv:
return activations, d_activations
return activations
def calc_grad(self):
"""Compute parameter gradient."""
grad = np.zeros_like(self.W)
W_recs = [self.get_weights(self.W, (l, l))
for l in range(self.n_layers)]
batch_size = self.inputs.shape[0]
sig_len = self.inputs.shape[1]
# temporary space to minimize memory allocations
tmp_act = [np.zeros((batch_size, l), dtype=self.dtype)
for l in self.shape]
tmp_grad = np.zeros_like(grad)
if self.truncation is None:
trunc_per = trunc_len = sig_len
else:
trunc_per, trunc_len = self.truncation
for n in range(trunc_per - 1, sig_len, trunc_per):
# every trunc_per timesteps we want to run backprop
deltas = [np.zeros((batch_size, l), dtype=self.dtype)
for l in self.shape]
state_deltas = [None if not l.stateful else
np.zeros((batch_size, self.shape[i]),
dtype=self.dtype)
for i, l in enumerate(self.layers)]
# backpropagate error
for s in range(n, np.maximum(n - trunc_len, -1), -1):
# execute trunc_len steps of backprop through time
error = self.loss.d_loss([a[:, s] for a in self.activations],
self.targets[:, s])
error = [np.zeros_like(self.activations[i][:, s]) if e is None
else e for i, e in enumerate(error)]
for l in range(self.n_layers - 1, -1, -1):
for post in self.conns[l]:
error[l] += np.dot(deltas[post],
self.get_weights(self.W,
(l, post))[0].T,
out=tmp_act[l])
# feedforward gradient
W_grad, b_grad = self.get_weights(grad, (l, post))
W_tmp_grad, b_tmp_grad = self.get_weights(tmp_grad,
(l, post))
W_grad += np.dot(self.activations[l][:, s].T,
deltas[post], out=W_tmp_grad)
b_grad += np.sum(deltas[post], axis=0, out=b_tmp_grad)
# add recurrent error
if l in self.rec_layers:
error[l] += np.dot(deltas[l], W_recs[l][0].T,
out=tmp_act[l])
# compute deltas
if not self.layers[l].stateful:
self.J_dot(self.d_activations[l][:, s], error[l],
transpose_J=True, out=deltas[l])
else:
d_input = self.d_activations[l][:, s, ..., 0]
d_state = self.d_activations[l][:, s, ..., 1]
d_output = self.d_activations[l][:, s, ..., 2]
state_deltas[l] += self.J_dot(d_output, error[l],
transpose_J=True,
out=tmp_act[l])
self.J_dot(d_input, state_deltas[l], transpose_J=True,
out=deltas[l])
self.J_dot(d_state, state_deltas[l], transpose_J=True,
out=state_deltas[l])
# gradient for recurrent weights
if l in self.rec_layers:
W_grad, b_grad = self.get_weights(grad, (l, l))
W_tmp_grad, b_tmp_grad = self.get_weights(tmp_grad,
(l, l))
if s > 0:
W_grad += np.dot(self.activations[l][:, s - 1].T,
deltas[l], out=W_tmp_grad)
else:
# put remaining gradient into initial bias
b_grad += np.sum(deltas[l], axis=0,
out=b_tmp_grad)
grad /= batch_size
return grad
def check_grad(self, calc_grad):
"""Check gradient via finite differences (for debugging)."""
eps = 1e-6
grad = np.zeros_like(calc_grad)
sig_len = self.inputs.shape[1]
if self.truncation is None:
trunc_per = trunc_len = sig_len
else:
trunc_per, trunc_len = self.truncation
inc_W = np.zeros_like(self.W)
for n in range(trunc_per, sig_len + 1, trunc_per):
start = np.maximum(n - trunc_len, 0)
# the truncated backprop gradient is equivalent to running the
# network normally for the initial timesteps and then just changing
# the parameters for the truncation period. so that's what we're
# simulating here.
if start > 0:
prev = self.forward(self.inputs[:, :start], self.W)
init_a = [p[:, -1] for p in prev]
init_s = [l.state.copy() if l.stateful else None
for l in self.layers]
else:
init_a = None
init_s = None
for i in range(len(self.W)):
inc_W[i] = eps
out_inc = self.forward(self.inputs[:, start:n], self.W + inc_W,
init_activations=init_a,
init_state=init_s)
out_dec = self.forward(self.inputs[:, start:n], self.W - inc_W,
init_activations=init_a,
init_state=init_s)
error_inc = self.loss.batch_loss(out_inc,
self.targets[:, start:n])
error_dec = self.loss.batch_loss(out_dec,
self.targets[:, start:n])
grad[i] += (error_inc - error_dec) / (2 * eps)
inc_W[i] = 0
try:
assert np.allclose(calc_grad, grad, rtol=1e-3)
except AssertionError:
print("calc_grad")
print(calc_grad)
print("finite grad")
print(grad)
print("calc_grad - finite grad")
print(calc_grad - grad)
print("calc_grad / finite grad")
print(calc_grad / grad)
input("Paused (press enter to continue)")
def calc_G(self, v, damping=0, out=None):
"""Compute Gauss-Newton matrix-vector product."""
if out is None:
Gv = np.zeros(self.W.size, dtype=self.dtype)
else:
Gv = out
Gv.fill(0)
batch_size = self.inputs.shape[0]
sig_len = self.inputs.shape[1]
# temporary space to minimize memory allocations
tmp_act = [np.zeros((batch_size, l), dtype=self.dtype)
for l in self.shape]
tmp_grad = np.zeros_like(Gv)
# R forward pass
R_states = [None if not l.stateful else
np.zeros((batch_size, self.shape[i]), dtype=self.dtype)
for i, l in enumerate(self.layers)]
R_activations = self.tmp_space
for a in R_activations:
a.fill(0)
v_recs = [self.get_weights(v, (l, l))
for l in range(self.n_layers)]
W_recs = [self.get_weights(self.W, (l, l))
for l in range(self.n_layers)]
Gv_recs = [self.get_weights(Gv, (l, l))
for l in range(self.n_layers)]
v_ff = dict([(conn, self.get_weights(v, conn))
for conn in self.offsets])
W_ff = dict([(conn, self.get_weights(self.W, conn))
for conn in self.offsets])
Gv_ff = dict([(conn, self.get_weights(Gv, conn))
for conn in self.offsets])
for s in range(sig_len):
for l in range(self.n_layers):
R_act = R_activations[l][:, s]
# input from feedforward connections
for pre in self.back_conns[l]:
vw, vb = v_ff[(pre, l)]
Ww, _ = W_ff[(pre, l)]
R_act += np.dot(self.activations[pre][:, s], vw,
out=tmp_act[l])
R_act += vb
R_act += np.dot(R_activations[pre][:, s], Ww,
out=tmp_act[l])
# recurrent input
if l in self.rec_layers:
if s == 0:
# bias input on first step
R_act += v_recs[l][1]
else:
R_act += np.dot(self.activations[l][:, s - 1],
v_recs[l][0], out=tmp_act[l])
R_act += np.dot(R_activations[l][:, s - 1],
W_recs[l][0], out=tmp_act[l])
if not self.layers[l].stateful:
self.J_dot(self.d_activations[l][:, s], R_act, out=R_act)
else:
d_input = self.d_activations[l][:, s, ..., 0]
d_state = self.d_activations[l][:, s, ..., 1]
d_output = self.d_activations[l][:, s, ..., 2]
R_states[l] = self.J_dot(d_state, R_states[l])
R_states[l] += self.J_dot(d_input, R_act, out=tmp_act[l])
self.J_dot(d_output, R_states[l], out=R_act)
# R backward pass
if self.truncation is None:
trunc_per = trunc_len = sig_len
else:
trunc_per, trunc_len = self.truncation
R_error = [np.zeros((batch_size, l), dtype=self.dtype)
for l in self.shape]
R_deltas = [np.zeros((batch_size, l), dtype=self.dtype)
for l in self.shape]
for n in range(trunc_per - 1, sig_len, trunc_per):
for i in range(self.n_layers):
R_deltas[i].fill(0)
if R_states[i] is not None:
R_states[i].fill(0)
for s in range(n, np.maximum(n - trunc_len, -1), -1):
for l in range(self.n_layers - 1, -1, -1):
if self.d2_loss[l] is not None:
np.multiply(self.d2_loss[l][:, s],
R_activations[l][:, s],
out=R_error[l])
else:
R_error[l].fill(0)
# error from feedforward connections
for post in self.conns[l]:
R_error[l] += np.dot(R_deltas[post],
W_ff[(l, post)][0].T,
out=tmp_act[l])
# feedforward gradient
W_g, b_g = Gv_ff[(l, post)]
W_tmp_grad, b_tmp_grad = self.get_weights(tmp_grad,
(l, post))
W_g += np.dot(self.activations[l][:, s].T,
R_deltas[post], out=W_tmp_grad)
b_g += np.sum(R_deltas[post], axis=0, out=b_tmp_grad)
# add recurrent error
if l in self.rec_layers:
R_error[l] += np.dot(R_deltas[l], W_recs[l][0].T,
out=tmp_act[l])
# compute deltas
if not self.layers[l].stateful:
self.J_dot(self.d_activations[l][:, s], R_error[l],
transpose_J=True, out=R_deltas[l])
else:
d_input = self.d_activations[l][:, s, ..., 0]
d_state = self.d_activations[l][:, s, ..., 1]
d_output = self.d_activations[l][:, s, ..., 2]
R_states[l] += self.J_dot(d_output, R_error[l],
transpose_J=True,
out=tmp_act[l])
self.J_dot(d_input, R_states[l], transpose_J=True,
out=R_deltas[l])
self.J_dot(d_state, R_states[l], transpose_J=True,
out=R_states[l])
# recurrent gradient
if l in self.rec_layers:
W_g, b_g = Gv_recs[l]
W_tmp_grad, b_tmp_grad = self.get_weights(tmp_grad,
(l, l))
if s > 0:
W_g += np.dot(self.activations[l][:, s - 1].T,
R_deltas[l], out=W_tmp_grad)
else:
b_g += np.sum(R_deltas[l], axis=0, out=b_tmp_grad)
Gv /= batch_size
Gv += damping * v # Tikhonov damping
return Gv
def load_GPU_data(self):
"""Load data for the current epoch onto GPU."""
from pycuda import gpuarray
def split_axes(array, n=1):
# split a multidimensional array into a corresponding list of lists
# along the first n axes (this is used so that array.__getitem__
# isn't called repeatedly, as it is somewhat expensive for
# gpuarrays)
if n == 1:
return [a for a in array]
return [split_axes(a, n - 1) for a in array]
# clear out old data (this would happen eventually on its own, but by
# doing it first we make sure there is room on the GPU before
# creating new arrays)
if hasattr(self, "GPU_W"):
del self.GPU_W
del self.GPU_activations
del self.GPU_d_activations
del self.GPU_d2_loss
del self.GPU_tmp_space
del self.GPU_states
del self.GPU_errors
del self.GPU_deltas
self.GPU_W = gpuarray.to_gpu(self.W)
# rearrange GPU data so that signal is the first axis (so
# that each time step is a single block of memory in GPU_calc_G)
self.GPU_activations = [
split_axes(gpuarray.to_gpu(np.ascontiguousarray(
np.swapaxes(a, 0, 1))), 1)
for a in self.activations]
self.GPU_d_activations = [
split_axes(gpuarray.to_gpu(np.ascontiguousarray(
np.rollaxis(np.swapaxes(a, 0, 1), -1, 1))), 2)
if self.layers[i].stateful else
split_axes(gpuarray.to_gpu(np.ascontiguousarray(
np.swapaxes(a, 0, 1))), 1)
for i, a in enumerate(self.d_activations)]
self.GPU_d2_loss = [
split_axes(gpuarray.to_gpu(np.ascontiguousarray(
np.swapaxes(a, 0, 1))), 1)
if a is not None else None for a in self.d2_loss]
self.GPU_tmp_space = [split_axes(gpuarray.empty((a.shape[1],
a.shape[0],
a.shape[2]),
self.dtype), 1)
for a in self.activations]
# pre-allocate calc_G arrays
batch_size = self.inputs.shape[0]
self.GPU_states = [[gpuarray.empty((batch_size, self.shape[i]),
dtype=self.dtype) for _ in range(2)]
if l.stateful else None
for i, l in enumerate(self.layers)]
self.GPU_errors = [gpuarray.empty((batch_size, l),
dtype=self.dtype)
for l in self.shape]
self.GPU_deltas = [gpuarray.empty((batch_size, l),
dtype=self.dtype)
for l in self.shape]
def GPU_calc_G(self, v, damping=0, out=None):
"""Compute Gauss-Newton matrix-vector product on GPU."""
from pycuda import gpuarray
if out is None or not isinstance(out, gpuarray.GPUArray):
Gv = gpuarray.zeros(self.W.shape, dtype=self.dtype)
else:
Gv = out
Gv.fill(0)
if not isinstance(v, gpuarray.GPUArray):
GPU_v = gpuarray.to_gpu(v)
else:
GPU_v = v
batch_size = self.inputs.shape[0]
sig_len = self.inputs.shape[1]
# R forward pass
R_states = self.GPU_states
R_activations = self.GPU_tmp_space
for i in range(self.n_layers):
R_activations[i][0].base.fill(0)
if R_states[i] is not None:
R_states[i][0].fill(0)
v_recs = [self.get_weights(GPU_v, (l, l))
for l in range(self.n_layers)]
W_recs = [self.get_weights(self.GPU_W, (l, l))
for l in range(self.n_layers)]
Gv_recs = [self.get_weights(Gv, (l, l))
for l in range(self.n_layers)]
v_ff = dict([(conn, self.get_weights(GPU_v, conn))
for conn in self.offsets])
W_ff = dict([(conn, self.get_weights(self.GPU_W, conn))
for conn in self.offsets])
Gv_ff = dict([(conn, self.get_weights(Gv, conn))
for conn in self.offsets])
for s in range(sig_len):
for l in range(self.n_layers):
R_act = R_activations[l][s]
# input from feedforward connections
for pre in self.back_conns[l]:
vw, vb = v_ff[(pre, l)]
hf.gpu.dot(self.GPU_activations[pre][s], vw,
out=R_act, increment=True)
hf.gpu.iadd(R_act, vb)
hf.gpu.dot(R_activations[pre][s], W_ff[(pre, l)][0],
out=R_act, increment=True)
# recurrent input
if l in self.rec_layers:
if s == 0:
# bias input on first step
hf.gpu.iadd(R_act, v_recs[l][1])
else:
hf.gpu.dot(self.GPU_activations[l][s - 1],
v_recs[l][0], out=R_act, increment=True)
hf.gpu.dot(R_activations[l][s - 1], W_recs[l][0],
out=R_act, increment=True)
if not self.layers[l].stateful:
if not isinstance(self.layers[l], hf.nl.Linear):
# note: this requires a memory allocation if
# d_activations is non-diagonal
hf.gpu.J_dot(self.GPU_d_activations[l][s], R_act,
out=R_act)
else:
d_input = self.GPU_d_activations[l][s][0]
d_state = self.GPU_d_activations[l][s][1]
d_output = self.GPU_d_activations[l][s][2]
# note: we alternate between two R_states buffers so that we
# don't need to copy the state on every timestep
i = s % 2
hf.gpu.J_dot(d_state, R_states[l][i],
out=R_states[l][1 - i])
hf.gpu.J_dot(d_input, R_act, out=R_states[l][1 - i],
increment=True)
hf.gpu.J_dot(d_output, R_states[l][1 - i], out=R_act)
# R backward pass
if self.truncation is None:
trunc_per = trunc_len = sig_len
else:
trunc_per, trunc_len = self.truncation
R_error = self.GPU_errors
R_deltas = self.GPU_deltas
for n in range(trunc_per - 1, sig_len, trunc_per):
for i in range(self.n_layers):
R_deltas[i].fill(0)
if R_states[i] is not None:
R_states[i][n % 2].fill(0)
for s in range(n, np.maximum(n - trunc_len, -1), -1):
for l in range(self.n_layers - 1, -1, -1):
if self.GPU_d2_loss[l] is not None:
hf.gpu.multiply(self.GPU_d2_loss[l][s],
R_activations[l][s], out=R_error[l])
else:
R_error[l].fill(0)
# error from feedforward connections
for post in self.conns[l]:
W, _ = W_ff[(l, post)]
hf.gpu.dot(R_deltas[post], W, out=R_error[l],
transpose_b=True, increment=True)
# feedforward gradient
W_g, b_g = Gv_ff[(l, post)]
hf.gpu.dot(self.GPU_activations[l][s], R_deltas[post],
out=W_g, transpose_a=True, increment=True)
hf.gpu.sum_cols(R_deltas[post], out=b_g,
increment=True)
# add recurrent error
if l in self.rec_layers:
hf.gpu.dot(R_deltas[l], W_recs[l][0], out=R_error[l],
transpose_b=True, increment=True)
# compute deltas
if not self.layers[l].stateful:
hf.gpu.J_dot(self.GPU_d_activations[l][s], R_error[l],
out=R_deltas[l], transpose_J=True)
else:
d_input = self.GPU_d_activations[l][s][0]
d_state = self.GPU_d_activations[l][s][1]
d_output = self.GPU_d_activations[l][s][2]
i = s % 2
hf.gpu.J_dot(d_output, R_error[l], out=R_states[l][i],
increment=True, transpose_J=True)
hf.gpu.J_dot(d_input, R_states[l][i], out=R_deltas[l],
transpose_J=True)
hf.gpu.J_dot(d_state, R_states[l][i],
out=R_states[l][1 - i], transpose_J=True)
# recurrent gradient
if l in self.rec_layers:
if s > 0:
hf.gpu.dot(self.GPU_activations[l][s - 1],
R_deltas[l], out=Gv_recs[l][0],
transpose_a=True, increment=True)
else:
hf.gpu.sum_cols(R_deltas[l], out=Gv_recs[l][1],
increment=True)
# Tikhonov damping and batch mean
Gv._axpbyz(1.0 / batch_size, GPU_v, damping, Gv)
if isinstance(v, gpuarray.GPUArray):
return Gv
else:
return Gv.get(out, pagelocked=True)
def check_J(self, start=0, stop=None):
"""Compute the Jacobian of the network via finite differences."""
eps = 1e-6
N = self.W.size
# as in check_grad, the truncation is equivalent to running the network
# normally for the initial timesteps and then changing the parameters,
# so that's what we do here to compute the Jacobian
if start > 0:
prev = self.forward(self.inputs[:, :start], self.W)
init_a = [p[:, -1] for p in prev]
init_s = [l.state.copy() if l.stateful else None
for l in self.layers]
else:
init_a = None
init_s = None
if stop is None:
stop = self.inputs.shape[1]
# compute the Jacobian
J = [None for _ in self.layers]
inc_i = np.zeros_like(self.W)
for i in range(N):
inc_i[i] = eps
inc = self.forward(self.inputs[:, start:stop], self.W + inc_i,
init_activations=init_a, init_state=init_s)
dec = self.forward(self.inputs[:, start:stop], self.W - inc_i,
init_activations=init_a, init_state=init_s)
for l in range(self.n_layers):
if start > 0:
inc[l] = np.concatenate((prev[l], inc[l]), axis=1)
dec[l] = np.concatenate((prev[l], dec[l]), axis=1)
J_i = (inc[l] - dec[l]) / (2 * eps)
if J[l] is None:
J[l] = J_i[..., None]
else:
J[l] = np.concatenate((J[l], J_i[..., None]), axis=-1)
inc_i[i] = 0
return J
def check_G(self, calc_G, v, damping=0):
"""Check Gv calculation via finite differences (for debugging)."""
sig_len = self.inputs.shape[1]
if self.truncation is None:
trunc_per = trunc_len = sig_len
else:
trunc_per, trunc_len = self.truncation
G = np.zeros((len(self.W), len(self.W)), dtype=self.dtype)
for n in range(trunc_per, sig_len + 1, trunc_per):
start = np.maximum(n - trunc_len, 0)
# compute Jacobian
# note that we do a full forward pass and a partial backwards
# pass, so we only truncate the backwards J matrix
J = self.check_J(0, n)
trunc_J = self.check_J(start, n) if start > 0 else J
# second derivative of loss function
L = self.loss.d2_loss([a[:, :n] for a in self.activations],
self.targets[:, :n])
# TODO: check loss via finite differences
G += np.sum([np.einsum("abji,abj,abjk->ik", trunc_J[l], L[l], J[l])
for l in range(self.n_layers) if L[l] is not None],
axis=0)
# divide by batch size
G /= self.inputs.shape[0]
Gv = np.dot(G, v)
Gv += damping * v
try:
assert np.allclose(calc_G, Gv, rtol=1e-3)
except AssertionError:
print("calc_G")
print(calc_G)
print("finite G")
print(Gv)
print("calc_G - finite G")
print(calc_G - Gv)
print("calc_G / finite G")
print(calc_G / Gv)
input("Paused (press enter to continue)")
def compute_offsets(self):
"""Precompute offsets for layers in the overall parameter vector."""
ff_offset = super(RNNet, self).compute_offsets()
# offset for recurrent weights is end of ff weights
offset = ff_offset
for l in range(self.n_layers):
if l in self.rec_layers:
self.offsets[(l, l)] = (
offset,
offset + self.shape[l] * self.shape[l],
offset + (self.shape[l] + 1) * self.shape[l])
offset += (self.shape[l] + 1) * self.shape[l]
return offset - ff_offset