Source code for hessianfree.ffnet

"""Implementation of feedforward network, including Gauss-Newton approximation
for use in Hessian-free optimization.

.. codeauthor:: Daniel Rasmussen <daniel.rasmussen@appliedbrainresearch.com>

Based on
Martens, J. (2010). Deep learning via Hessian-free optimization. In Proceedings
of the 27th International Conference on Machine Learning.
"""

from __future__ import print_function

from collections import defaultdict, OrderedDict
import pickle
import warnings

import numpy as np

import hessianfree as hf


[docs]class FFNet(object):
    """Implementation of feed-forward network (including gradient/curvature
    computation).

    :param list shape: the number of neurons in each layer
    :param layers: nonlinearity to use in the network (or a list giving a
        nonlinearity for each layer)
    :type layers: :class:`~.nonlinearities.Nonlinearity` or `list`
    :param dict conns: dictionary of the form `{layer_x:[layer_y, layer_z],
        ...}` specifying the connections between layers (default is to
        connect in series)
    :param loss_type: loss function (or list of loss functions) used to
        evaluate network
    :type loss_type: :class:`~.loss_funcs.LossFunction` or `list`
    :param dict W_init_params: parameters passed to :meth:`.init_weights`
        (see parameter descriptions in that function)
    :param bool use_GPU: run curvature computation on GPU (requires
        PyCUDA and scikit-cuda)
    :param load_weights: load initial weights from given array or filename
    :type load_weights: `str` or :class:`~numpy:numpy.ndarray`
    :param bool debug: activates expensive features to help with debugging
    :param rng: used to generate any random numbers for this network (use
        this to control the seed)
    :type rng: :class:`~numpy:numpy.random.RandomState`
    :param dtype: floating point precision used throughout the network
    :type dtype: :class:`~numpy:numpy.dtype`
    """

    def __init__(self, shape, layers=hf.nl.Logistic(), conns=None,
                 loss_type=hf.loss_funcs.SquaredError(), W_init_params=None,
                 use_GPU=False, load_weights=None, debug=False, rng=None,
                 dtype=np.float32):

        self.debug = debug
        self.shape = shape
        self.n_layers = len(shape)
        self.dtype = np.float64 if debug else dtype
        self.mask = None
        self._optimizer = None
        self.rng = np.random.RandomState() if rng is None else rng

        # note: this isn't used internally, it is just here so that an
        # external process with a handle to this object can tell what epoch
        # it is on
        self.epoch = None

        self.inputs = None
        self.targets = None
        self.activations = None
        self.d_activations = None

        # initialize layer nonlinearities
        if not isinstance(layers, (list, tuple)):
            if isinstance(layers, hf.nl.Nonlinearity) and layers.stateful:
                warnings.warn("Multiple layers sharing stateful nonlinearity, "
                              "consider creating a separate instance for each "
                              "layer.")
            layers = [layers for _ in range(self.n_layers)]
            layers[0] = hf.nl.Linear()

        if len(layers) != len(shape):
            raise ValueError("Number of nonlinearities (%d) does not match "
                             "number of layers (%d)" %
                             (len(layers), len(shape)))

        self.layers = []
        for t in layers:
            if isinstance(t, str):
                # look up the nonlinearity with the given name
                t = getattr(hf.nl, t)()
            if not isinstance(t, hf.nl.Nonlinearity):
                raise TypeError("Layer type (%s) must be an instance of "
                                "nonlinearities.Nonlinearity" % t)
            self.layers += [t]

        # initialize loss function
        self.init_loss(loss_type)

        # initialize connections
        if conns is None:
            # set up the feedforward series connections
            conns = {}
            for pre, post in zip(np.arange(self.n_layers - 1),
                                 np.arange(1, self.n_layers)):
                conns[pre] = [post]

        self.conns = OrderedDict(sorted(conns.items(), key=lambda x: x[0]))
        # note: conns is an ordered dict sorted by layer so that we can
        # reliably loop over the items (in compute_offsets and init_weights)

        # maintain a list of backwards connections as well (for efficient
        # lookup in the other direction)
        self.back_conns = defaultdict(list)
        for pre in conns:
            for post in conns[pre]:
                self.back_conns[post] += [pre]

                if pre >= post:
                    raise ValueError("Can only connect from lower to higher "
                                     "layers (%s >= %s)" % (pre, post))

        # add empty connection for first/last layer (just helps smooth the code
        # elsewhere)
        self.conns[self.n_layers - 1] = []
        self.back_conns[0] = []

        # compute indices for the different connection weight matrices in the
        # overall parameter vector
        self.compute_offsets()

        # initialize connection weights
        if load_weights is None:
            if W_init_params is None:
                W_init_params = {}
            self.W = self.init_weights(
                [(self.shape[pre], self.shape[post])
                 for pre in self.conns for post in self.conns[pre]],
                **W_init_params)
        else:
            if isinstance(load_weights, np.ndarray):
                self.W = load_weights
            else:
                # load weights from file
                self.W = np.load(load_weights)

            if len(self.W) != np.max(list(self.offsets.values())):
                raise IndexError(
                    "Length of loaded weights (%s) does not match expected "
                    "length (%s)" % (len(self.W),
                                     np.max(list(self.offsets.values()))))

            if self.W.dtype != self.dtype:
                raise TypeError("Loaded weights dtype (%s) doesn't match "
                                "self.dtype (%s)" % (self.W.dtype, self.dtype))

        # initialize GPU
        if use_GPU:
            try:
                import pycuda
                import skcuda
            except Exception as e:
                print(e)
                raise ImportError("PyCuda/scikit-cuda not installed. "
                                  "Set use_GPU=False.")

            hf.gpu.init_kernels()

        self.use_GPU = use_GPU

[docs]    def run_epochs(self, inputs, targets, optimizer,
                   max_epochs=100, minibatch_size=None, test=None,
                   test_err=None, target_err=1e-6, plotting=False,
                   file_output=None, print_period=10):
        """Apply the given optimizer with a sequence of (mini)batches.

        :param inputs: input vectors (or a :class:`~.nonlinearities.Plant` that
            will generate the input vectors dynamically)
        :type inputs: :class:`~numpy:numpy.ndarray` or
            :class:`~.nonlinearities.Plant`
        :param targets: target vectors corresponding to each input vector (or
            None if a plant is being used)
        :type targets: :class:`~numpy:numpy.ndarray`
        :param optimizer: computes the weight update each epoch (see
            optimizers.py)
        :param int max_epochs: the maximum number of epochs to run
        :param int minibatch_size: the size of the minibatch to use in each epoch
            (or None to use full batches)
        :param tuple test: tuple of (inputs,targets) to use as the test data
            (if None then the same inputs and targets as training will be used)
        :param test_err: a custom error function to be applied to
            the test data (e.g., classification error)
        :type test_err: :class:`~.loss_funcs.LossFunction`
        :param float target_err: run will terminate if this test error is
            reached
        :param str file_output: output files from the run will use this as a
            prefix (if None then don't output files)
        :param bool plotting: if True then data from the run will be output to
            a file, which can be displayed via dataplotter.py
        :param int print_period: print out information about the run every `x`
            epochs
        """

        test_errs = []
        self.best_W = None
        self.best_error = None
        prefix = "HF" if file_output is None else file_output
        minibatch_size = minibatch_size or inputs.shape[0]
        plots = defaultdict(list)
        self.optimizer = optimizer

        for i in range(max_epochs):
            self.epoch = i
            printing = print_period is not None and (i % print_period == 0 or
                                                     self.debug)

            if printing:
                print("=" * 40)
                print("epoch", i)

            # run minibatches
            indices = self.rng.permutation(inputs.shape[0])
            for start in range(0, inputs.shape[0], minibatch_size):
                # generate minibatch and cache activations
                self.cache_minibatch(
                    inputs, targets, indices[start:start + minibatch_size])

                # validity checks
                if self.inputs.shape[-1] != self.shape[0]:
                    raise ValueError(
                        "Input dimension (%d) does not match number of input "
                        "nodes (%d)" % (self.inputs.shape[-1], self.shape[0]))
                if self.targets.shape[-1] != self.shape[-1]:
                    raise ValueError(
                        "Target dimension (%d) does not match number of "
                        "output nodes (%d)" % (self.targets.shape[-1],
                                               self.shape[-1]))

                assert self.activations[-1].dtype == self.dtype

                # compute update
                update = optimizer.compute_update(printing)

                assert update.dtype == self.dtype

                # apply mask
                if self.mask is not None:
                    update[self.mask] = 0

                # update weights
                self.W += update

                # invalidate cached activations (shouldn't be necessary,
                # but doesn't hurt)
                self.activations = None
                self.d_activations = None
                self.GPU_activations = None

            # compute test error
            if test is None:
                test_in, test_t = inputs, targets
            else:
                test_in, test_t = test[0], test[1]

            if test_err is None:
                err = self.error(self.W, test_in, test_t)
            else:
                output = self.forward(test_in, self.W)
                err = test_err.batch_loss(output, test_t)
            test_errs += [err]

            if printing:
                print("test error", test_errs[-1])

            # save the weights with the best error
            if self.best_W is None or test_errs[-1] < self.best_error:
                self.best_W = self.W.copy()
                self.best_error = test_errs[-1]

            # dump plot data
            if plotting:
                plots["update norm"] += [np.linalg.norm(update)]
                plots["W norm"] += [np.linalg.norm(self.W)]
                plots["test error (log)"] += [test_errs[-1]]

                if hasattr(optimizer, "plots"):
                    plots.update(optimizer.plots)

                with open("%s_plots.pkl" % prefix, "wb") as f:
                    pickle.dump(plots, f)

            # dump weights
            if file_output is not None:
                np.save("%s_weights.npy" % prefix, self.W)

            # check for termination
            if test_errs[-1] < target_err:
                if print_period is not None:
                    print("target error reached")
                break
            if test is not None and i > 10 and test_errs[-10] < test_errs[-1]:
                if print_period is not None:
                    print("overfitting detected, terminating")
                break

[docs]    def forward(self, inputs, params=None, deriv=False):
        """Compute layer activations for given input and parameters.

        :param inputs: input vectors (passed to first layer)
        :type inputs: :class:`~numpy:numpy.ndarray`
        :param params: parameter vector (weights) for the network (defaults to
            ``self.W``)
        :type params: :class:`~numpy:numpy.ndarray`
        :param bool deriv: if True then also compute the derivative of the
            activations
        """

        params = self.W if params is None else params

        if isinstance(inputs, hf.nl.Plant):
            inputs.reset()

        activations = [None for _ in range(self.n_layers)]
        if deriv:
            d_activations = [None for _ in range(self.n_layers)]

        for i in range(self.n_layers):
            if i == 0:
                if isinstance(inputs, hf.nl.Plant):
                    inputs = inputs(None)
                else:
                    inputs = inputs
            else:
                inputs = np.zeros((inputs.shape[0], self.shape[i]),
                                  dtype=self.dtype)
                for pre in self.back_conns[i]:
                    W, b = self.get_weights(params, (pre, i))
                    inputs += np.dot(activations[pre], W)
                    inputs += b
                    # note: we're applying a bias on each connection to a
                    # neuron (rather than one for each neuron). just because
                    # it's easier than tracking how many connections there are
                    # for each layer (but we could do it if it becomes
                    # important).
            activations[i] = self.layers[i].activation(inputs)

            if deriv:
                d_activations[i] = self.layers[i].d_activation(inputs,
                                                               activations[i])

        for i, a in enumerate(activations):
            if not np.all(np.isfinite(a)):
                raise OverflowError("Non-finite nonlinearity activation "
                                    "value (layer %d) \n %s" %
                                    (i, a[not np.isfinite(a)]))
        if deriv:
            return activations, d_activations

        return activations

[docs]    def error(self, W=None, inputs=None, targets=None):
        """Compute network error.

        :param W: network parameters (defaults to ``self.W``)
        :type W: :class:`~numpy:numpy.ndarray`
        :param inputs: input vectors (defaults to the cached (mini)batch for
            current epoch)
        :type inputs: :class:`~numpy:numpy.ndarray`
        :param targets: target vectors (defaults to the cached (mini)batch for
            current epoch)
        :type targets: :class:`~numpy:numpy.ndarray`
        """

        W = self.W if W is None else W
        inputs = self.inputs if inputs is None else inputs

        # get outputs
        if (W is self.W and inputs is self.inputs and
                    self.activations is not None):
            # use cached activations
            activations = self.activations
        else:
            # compute activations
            activations = self.forward(inputs, W)

        # get targets
        if isinstance(inputs, hf.nl.Plant):
            # get targets from plant
            targets = inputs.get_vecs()[1]
        else:
            targets = self.targets if targets is None else targets

        # note: np.nan can be used in the target to specify places
        # where the target is not defined. those get translated to
        # zero error in the loss function.

        error = self.loss.batch_loss(activations, targets)

        return error

[docs]    def cache_minibatch(self, inputs, targets, minibatch=None):
        """Pick a subset of inputs and targets to use in minibatch, and cache
        the activations for that minibatch."""

        if minibatch is None:
            minibatch = np.arange(inputs.shape[0])

        if not isinstance(inputs, hf.nl.Plant):
            # inputs/targets are vectors
            self.inputs = inputs[minibatch]
            self.targets = targets[minibatch]

            # cache activations
            self.activations, self.d_activations = self.forward(self.inputs,
                                                                self.W,
                                                                deriv=True)
        else:
            # input is a dynamic plant
            if targets is not None:
                raise ValueError("Cannot specify targets when using dynamic "
                                 "plant to generate inputs (plant should "
                                 "generate targets itself)")

            # run plant to generate batch
            inputs.shape[0] = len(minibatch)
            self.activations, self.d_activations = self.forward(inputs, self.W,
                                                                deriv=True)
            self.inputs, self.targets = inputs.get_vecs()

        # cast to self.dtype
        if self.inputs.dtype != self.dtype:
            warnings.warn("Input dtype (%s) not equal to self.dtype (%s)" %
                          (self.inputs.dtype, self.dtype))
        self.inputs = np.asarray(self.inputs, dtype=self.dtype)
        self.targets = np.asarray(self.targets, dtype=self.dtype)
        self.activations = [np.asarray(a, dtype=self.dtype)
                            for a in self.activations]
        self.d_activations = [np.asarray(a, dtype=self.dtype)
                              for a in self.d_activations]
        self.d2_loss = self.loss.d2_loss(self.activations, self.targets)

        # allocate temporary space for intermediate values, to save on
        # memory allocations
        self.tmp_space = [np.zeros(a.shape, self.dtype)
                          for a in self.activations]

        if self.use_GPU:
            # TODO: we could just allocate these on the first timestep and
            # then do a copy rather than an allocation after that, if this
            # ever became a significant part of the computation time
            self.load_GPU_data()

[docs]    def load_GPU_data(self):
        """Load data for the current epoch onto GPU."""

        from pycuda import gpuarray

        # clear out old data (this would happen eventually on its own, but by
        # doing it first we make sure there is room on the GPU before
        # creating new arrays)
        if hasattr(self, "GPU_W"):
            del self.GPU_W
            del self.GPU_activations
            del self.GPU_d_activations
            del self.GPU_d2_loss
            del self.GPU_tmp_space

        self.GPU_W = gpuarray.to_gpu(self.W)
        self.GPU_activations = [gpuarray.to_gpu(a)
                                for a in self.activations]
        self.GPU_d_activations = [gpuarray.to_gpu(a)
                                  for a in self.d_activations]
        self.GPU_d2_loss = [gpuarray.to_gpu(a) if a is not None else None
                            for a in self.d2_loss]
        self.GPU_tmp_space = [gpuarray.empty(a.shape, self.dtype)
                              for a in self.activations]

    @staticmethod
[docs]    def J_dot(J, vec, transpose_J=False, out=None):
        """Compute the product of a Jacobian and some vector."""

        # In many cases the Jacobian is a diagonal matrix, so it is more
        # efficient to just represent it with the diagonal vector.  This
        # function just lets those two be used interchangeably.

        if J.ndim == 2:
            # note: the first dimension is the batch, so ndim==2 means
            # this is a vector representation
            if out is None:
                # passing out=None fails for some reason
                return np.multiply(J, vec)
            else:
                return np.multiply(J, vec, out=out)
        else:
            if transpose_J:
                J = np.transpose(J, (0, 2, 1))

            if out is None:
                # passing out=None fails for some reason
                return np.einsum("ijk,ik->ij", J, vec)

            if out is vec:
                tmp_vec = vec.copy()
            else:
                tmp_vec = vec

            return np.einsum("ijk,ik->ij", J, tmp_vec, out=out)

[docs]    def calc_grad(self):
        """Compute parameter gradient."""

        for l in self.layers:
            if l.stateful:
                raise TypeError("Cannot use neurons with internal state in "
                                "a one-step feedforward network; use "
                                "RNNet instead.")

        grad = np.zeros_like(self.W)

        # backpropagation
        # note: this uses the cached activations, so the forward
        # pass has already been run elsewhere

        # compute output error for each layer
        error = self.loss.d_loss(self.activations, self.targets)

        error = [np.zeros_like(self.activations[i]) if e is None else e
                 for i, e in enumerate(error)]

        deltas = [np.zeros_like(a) for a in self.activations]

        # backwards pass
        for i in range(self.n_layers - 1, -1, -1):
            for post in self.conns[i]:
                error[i] += np.dot(deltas[post],
                                   self.get_weights(self.W, (i, post))[0].T)

                W_grad, b_grad = self.get_weights(grad, (i, post))
                np.dot(self.activations[i].T, deltas[post], out=W_grad)
                np.sum(deltas[post], axis=0, out=b_grad)

            self.J_dot(self.d_activations[i], error[i], transpose_J=True,
                       out=deltas[i])

        grad /= self.inputs.shape[0]

        return grad

[docs]    def check_grad(self, calc_grad):
        """Check gradient via finite differences (for debugging)."""

        eps = 1e-6
        grad = np.zeros_like(calc_grad)
        inc_W = np.zeros_like(self.W)
        for i in range(len(self.W)):
            inc_W[i] = eps

            error_inc = self.error(self.W + inc_W, self.inputs, self.targets)
            error_dec = self.error(self.W - inc_W, self.inputs, self.targets)
            grad[i] = (error_inc - error_dec) / (2 * eps)

            inc_W[i] = 0
        try:
            assert np.allclose(calc_grad, grad, rtol=1e-3)
        except AssertionError:
            print("calc_grad")
            print(calc_grad)
            print("finite grad")
            print(grad)
            print("calc_grad - finite grad")
            print(calc_grad - grad)
            print("calc_grad / finite grad")
            print(calc_grad / grad)
            input("Paused (press enter to continue)")

[docs]    def calc_G(self, v, damping=0, out=None):
        """Compute Gauss-Newton matrix-vector product."""

        if out is None:
            Gv = np.zeros(self.W.size, dtype=self.dtype)
        else:
            Gv = out
            Gv.fill(0)

        # R forward pass
        R_activations = [np.zeros_like(a) for a in self.activations]
        for i in range(1, self.n_layers):
            for pre in self.back_conns[i]:
                vw, vb = self.get_weights(v, (pre, i))
                Ww, _ = self.get_weights(self.W, (pre, i))

                R_activations[i] += np.dot(self.activations[pre], vw,
                                           out=self.tmp_space[i])
                R_activations[i] += vb
                R_activations[i] += np.dot(R_activations[pre], Ww,
                                           out=self.tmp_space[i])

            self.J_dot(self.d_activations[i], R_activations[i],
                       out=R_activations[i])

        # backward pass
        R_error = R_activations

        for i in range(self.n_layers - 1, -1, -1):
            if self.d2_loss[i] is not None:
                # note: R_error[i] is already set to R_activations[i]
                R_error[i] *= self.d2_loss[i]
            else:
                R_error[i].fill(0)

            for post in self.conns[i]:
                W, _ = self.get_weights(self.W, (i, post))

                R_error[i] += np.dot(R_error[post], W.T,
                                     out=self.tmp_space[i])

                W_g, b_g = self.get_weights(Gv, (i, post))
                np.dot(self.activations[i].T, R_error[post], out=W_g)
                np.sum(R_error[post], axis=0, out=b_g)

            self.J_dot(self.d_activations[i], R_error[i],
                       out=R_error[i], transpose_J=True)

        Gv /= len(self.inputs)

        Gv += damping * v  # Tikhonov damping

        return Gv

[docs]    def GPU_calc_G(self, v, damping=0, out=None):
        """Compute Gauss-Newton matrix-vector product on GPU."""

        from pycuda import gpuarray

        if out is None or not isinstance(out, gpuarray.GPUArray):
            Gv = gpuarray.zeros(self.W.shape, self.dtype)
        else:
            Gv = out
            Gv.fill(0)

        if not isinstance(v, gpuarray.GPUArray):
            GPU_v = gpuarray.to_gpu(v)
        else:
            GPU_v = v

        # R forward pass
        R_activations = self.GPU_tmp_space

        for i in range(self.n_layers):
            R_activations[i].fill(0)
            for pre in self.back_conns[i]:
                vw, vb = self.get_weights(GPU_v, (pre, i))
                Ww, _ = self.get_weights(self.GPU_W, (pre, i))

                hf.gpu.dot(self.GPU_activations[pre], vw,
                           out=R_activations[i], increment=True)
                hf.gpu.iadd(R_activations[i], vb)
                hf.gpu.dot(R_activations[pre], Ww,
                           out=R_activations[i], increment=True)

            hf.gpu.J_dot(self.GPU_d_activations[i], R_activations[i],
                         out=R_activations[i])

        # backward pass
        R_error = R_activations

        for i in range(self.n_layers - 1, -1, -1):
            if self.GPU_d2_loss[i] is not None:
                # note: R_error[i] is already set to R_activations[i]
                R_error[i] *= self.GPU_d2_loss[i]
            else:
                R_error[i].fill(0)

            for post in self.conns[i]:
                W, _ = self.get_weights(self.GPU_W, (i, post))
                W_g, b_g = self.get_weights(Gv, (i, post))

                hf.gpu.dot(R_error[post], W, transpose_b=True,
                           out=R_error[i], increment=True)

                hf.gpu.dot(self.GPU_activations[i], R_error[post],
                           transpose_a=True, out=W_g)

                hf.gpu.sum_cols(R_error[post], out=b_g)

            hf.gpu.J_dot(self.GPU_d_activations[i], R_error[i], out=R_error[i],
                         transpose_J=True)

        # Tikhonov damping and batch mean
        Gv._axpbyz(1.0 / len(self.inputs), GPU_v, damping, Gv)

        if isinstance(v, gpuarray.GPUArray):
            return Gv
        else:
            return Gv.get(out, pagelocked=True)

[docs]    def check_J(self):
        """Compute the Jacobian of the network via finite differences."""

        eps = 1e-6
        N = self.W.size

        # compute the Jacobian
        J = [None for _ in self.layers]
        inc_i = np.zeros_like(self.W)
        for i in range(N):
            inc_i[i] = eps

            inc = self.forward(self.inputs, self.W + inc_i)
            dec = self.forward(self.inputs, self.W - inc_i)

            for l in range(self.n_layers):
                J_i = (inc[l] - dec[l]) / (2 * eps)
                if J[l] is None:
                    J[l] = J_i[..., None]
                else:
                    J[l] = np.concatenate((J[l], J_i[..., None]), axis=-1)

            inc_i[i] = 0

        return J

[docs]    def check_G(self, calc_G, v, damping=0):
        """Check Gv calculation via finite differences (for debugging)."""

        # compute Jacobian
        J = self.check_J()

        # second derivative of loss function
        L = self.loss.d2_loss(self.activations, self.targets)
        # TODO: check loss via finite differences

        G = np.sum([np.einsum("aji,aj,ajk->ik", J[l], L[l], J[l])
                    for l in range(self.n_layers) if L[l] is not None], axis=0)

        # divide by batch size
        G /= self.inputs.shape[0]

        Gv = np.dot(G, v)
        Gv += damping * v

        try:
            assert np.allclose(calc_G, Gv, rtol=1e-3)
        except AssertionError:
            print("calc_G")
            print(calc_G)
            print("finite G")
            print(Gv)
            print("calc_G - finite G")
            print(calc_G - Gv)
            print("calc_G / finite G")
            print(calc_G / Gv)
            input("Paused (press enter to continue)")

[docs]    def init_weights(self, shapes, coeff=1.0, biases=0.0, init_type="sparse"):
        """Weight initialization, given shapes of weight matrices.

        Note: coeff, biases, and init_type can be specified by the
        `W_init_params` dict in :class:`.FFNet`.  Each can be
        specified as a single value (for all matrices) or as a list giving a
        value for each matrix.

        :param list shapes: list of (pre,post) shapes for each weight matrix
        :param float coeff: scales the magnitude of the connection weights
        :param float biases: bias values for the post of each matrix
        :param str init_type: type of initialization to use (currently supports
            'sparse', 'uniform', 'gaussian')
        """

        # if given single parameters, expand for all matrices
        if isinstance(coeff, (int, float)):
            coeff = [coeff] * len(shapes)
        if isinstance(biases, (int, float)):
            biases = [biases] * len(shapes)
        if isinstance(init_type, str):
            init_type = [init_type] * len(shapes)

        W = [np.zeros((pre + 1, post), dtype=self.dtype)
             for pre, post in shapes]

        for i, s in enumerate(shapes):
            if init_type[i] == "sparse":
                # sparse initialization (from martens)
                num_conn = 15

                for j in range(s[1]):
                    # pick num_conn random pre neurons
                    indices = self.rng.choice(np.arange(s[0]),
                                              size=min(num_conn, s[0]),
                                              replace=False)

                    # connect to post
                    W[i][indices, j] = self.rng.randn(indices.size) * coeff[i]
            elif init_type[i] == "uniform":
                W[i][:-1] = self.rng.uniform(-coeff[i] / np.sqrt(s[0]),
                                             coeff[i] / np.sqrt(s[0]),
                                             (s[0], s[1]))
            elif init_type[i] == "gaussian":
                W[i][:-1] = self.rng.randn(s[0], s[1]) * coeff[i]
            else:
                raise ValueError("Unknown weight initialization (%s)"
                                 % init_type)

            # set biases
            W[i][-1, :] = biases[i]

        W = np.concatenate([w.flatten() for w in W])

        return W

[docs]    def compute_offsets(self):
        """Precompute offsets for layers in the overall parameter vector."""

        self.offsets = {}
        offset = 0
        for pre in self.conns:
            for post in self.conns[pre]:
                n_params = (self.shape[pre] + 1) * self.shape[post]
                self.offsets[(pre, post)] = (
                    offset,
                    offset + n_params - self.shape[post],
                    offset + n_params)
                offset += n_params

        return offset

[docs]    def get_weights(self, params, conn):
        """Get weight matrix for a connection from overall parameter vector."""

        if conn not in self.offsets:
            return None

        offset, W_end, b_end = self.offsets[conn]
        W = params[offset:W_end]
        b = params[W_end:b_end]
        return W.reshape((self.shape[conn[0]], self.shape[conn[1]])), b

[docs]    def init_loss(self, loss_type):
        """Set the loss type for this network to the given
        :class:`~.loss_funcs.LossFunction` (or a list of functions can be
        passed to create a :class:`~.loss_funcs.LossSet`)."""

        if isinstance(loss_type, (list, tuple)):
            tmp = loss_type
        else:
            tmp = [loss_type]

        for t in tmp:
            if not isinstance(t, hf.loss_funcs.LossFunction):
                raise TypeError("loss_type (%s) must be an instance of "
                                "LossFunction" % t)

            # sanity checks
            if (isinstance(t, hf.loss_funcs.CrossEntropy) and
                    np.any(self.layers[-1].activation(
                        np.linspace(-80, 80, 100)[None, :]) <= 0)):
                # this won't catch everything, but hopefully a useful warning
                raise ValueError("Must use positive activation function "
                                 "with cross-entropy error")
            if (isinstance(t, hf.loss_funcs.CrossEntropy) and
                    not isinstance(self.layers[-1], hf.nl.Softmax)):
                warnings.warn("Softmax should probably be used with "
                              "cross-entropy error")

        if isinstance(loss_type, (list, tuple)):
            self.loss = hf.loss_funcs.LossSet(loss_type)
        else:
            self.loss = loss_type

    def _run_epoch(self, inputs, targets, minibatch_size=None):
        """A stripped down version of run_epochs that just does the update
        without any overhead.

        Can be used for optimizers where the cost to compute an update is
        very cheap, in which case the overhead (e.g., computing test error,
        saving weights, outputting data for plotting, etc.) becomes
        non-negligible.
        """

        minibatch_size = minibatch_size or inputs.shape[0]
        indices = self.rng.permutation(inputs.shape[0])
        for start in range(0, inputs.shape[0], minibatch_size):
            # generate minibatch and cache activations
            self.cache_minibatch(
                inputs, targets, indices[start:start + minibatch_size])

            # compute update
            self.W += self.optimizer.compute_update(False)

    @property
    def optimizer(self):
        return self._optimizer

    @optimizer.setter
    def optimizer(self, o):
        self._optimizer = o
        o.net = self