Source code for hessianfree.loss_funcs

from functools import wraps

import numpy as np


class LossFunction:
    """Defines a loss function that maps nonlinearity activations to error."""

    def loss(self, activities, targets):
        """Computes the loss for each unit in the network.

        Note that most loss functions are only based on the output of the
        final layer, ``activities[-1]``.  However, we pass the activities of
        all layers here so that loss functions can include things like
        sparsity constraints.  Targets, however, are only defined for the
        output layer.

        Targets can be defined as ``np.nan``, which will be translated into
        zero error.

        :param list activities: output activations of each layer
        :param targets: target activation values for last layer
        :type targets: :class:`~numpy:numpy.ndarray`
        """

        raise NotImplementedError()

    def d_loss(self, activities, targets):
        """First derivative of loss function (with respect to activities)."""

        raise NotImplementedError()

    def d2_loss(self, activities, targets):
        """Second derivative of loss function (with respect to activities)."""

        raise NotImplementedError()

    def batch_loss(self, activities, targets):
        """Utility function to compute a single loss value for the network
        (taking the mean across the batch and summing across and within
        layers)."""

        losses = self.loss(activities, targets)

        return np.sum([np.true_divide(np.sum(l), l.shape[0])
                       for l in losses if l is not None])
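

# The example below is added commentary, not part of the original module: a
# minimal sketch of how ``batch_loss`` aggregates per-layer losses (sum over
# units, mean over the batch axis, sum across layers), using a hypothetical
# dummy subclass and arbitrary toy shapes.
def _example_batch_loss():
    class AbsLoss(LossFunction):
        # toy loss: the absolute value of every unit's activity
        def loss(self, activities, targets):
            return [np.abs(a) for a in activities]

    activities = [np.ones((4, 3)), 2 * np.ones((4, 5))]
    # layer sums are 12 and 40; dividing each by the batch size (4) and
    # summing across layers gives 3 + 10 = 13
    assert np.isclose(AbsLoss().batch_loss(activities, None), 13.0)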


def output_loss(func):
    """Convenience decorator that takes a loss defined for the output layer
    and converts it into the more general form in terms of all layers."""

    @wraps(func)
    def wrapped_loss(self, activities, targets):
        result = [None for _ in activities[:-1]]
        result += [func(self, activities[-1], targets)]

        return result

    return wrapped_loss
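

# Added commentary (not part of the original module): a minimal sketch of what
# ``output_loss`` does to a decorated loss, assuming a hypothetical two-layer
# network.  The wrapped function returns None for every layer except the last.
def _example_output_loss():
    class OutputOnly(LossFunction):
        @output_loss
        def loss(self, output, targets):
            return output - targets

    hidden = np.zeros((2, 4))
    output = np.ones((2, 3))
    losses = OutputOnly().loss([hidden, output], np.zeros((2, 3)))
    assert losses[0] is None  # hidden layer has no loss
    assert np.allclose(losses[1], 1.0)  # output layer gets the raw loss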


class SquaredError(LossFunction):
    """Squared error

    :math:`\\frac{1}{2} \\sum(output - target)^2`
    """

    @output_loss
    def loss(self, output, targets):
        return np.sum(np.nan_to_num(output - targets) ** 2,
                      axis=tuple(range(1, output.ndim))) / 2

    @output_loss
    def d_loss(self, output, targets):
        return np.nan_to_num(output - targets)

    @output_loss
    def d2_loss(self, output, _):
        return np.ones_like(output)
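

# Added commentary (not part of the original module): a quick numeric check of
# SquaredError on a toy batch (two examples, three output units, values chosen
# arbitrarily).  The all-nan target row contributes zero error.
def _example_squared_error():
    output = np.array([[0.0, 1.0, 0.5],
                       [1.0, 0.0, 0.0]])
    targets = np.array([[0.0, 1.0, 1.0],
                        [np.nan, np.nan, np.nan]])
    per_example = SquaredError().loss([output], targets)[-1]
    # first row: 0.5 * (0.5 ** 2) = 0.125; second row: masked to zero
    assert np.allclose(per_example, [0.125, 0.0])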


class CrossEntropy(LossFunction):
    """Cross-entropy error

    :math:`-\\sum(target * log(output))`
    """

    @output_loss
    def loss(self, output, targets):
        return -np.sum(np.nan_to_num(targets) * np.log(output),
                       axis=tuple(range(1, output.ndim)))

    @output_loss
    def d_loss(self, output, targets):
        return -np.nan_to_num(targets) / output

    @output_loss
    def d2_loss(self, output, targets):
        return np.nan_to_num(targets) / output ** 2
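

# Added commentary (not part of the original module): a quick numeric check of
# CrossEntropy, assuming softmax-style outputs (positive, summing to one) and
# one-hot targets; the values here are arbitrary.
def _example_cross_entropy():
    output = np.array([[0.7, 0.2, 0.1],
                       [0.1, 0.8, 0.1]])
    targets = np.array([[1.0, 0.0, 0.0],
                        [0.0, 1.0, 0.0]])
    per_example = CrossEntropy().loss([output], targets)[-1]
    # the loss reduces to -log of the probability assigned to the true class
    assert np.allclose(per_example, -np.log([0.7, 0.8]))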


class ClassificationError(LossFunction):
    """Classification error

    :math:`argmax(output) \\neq argmax(target)`

    Note: ``d_loss`` and ``d2_loss`` are not defined; classification error
    should only be used for validation, which doesn't require either.
    """

    @output_loss
    def loss(self, output, targets):
        return np.logical_and(
            np.argmax(output, axis=-1) != np.argmax(targets, axis=-1),
            np.logical_not(np.isnan(np.sum(targets, axis=-1))))
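

# Added commentary (not part of the original module): a small sketch of
# ClassificationError on arbitrary toy data; examples with nan targets are
# excluded from the error count.
def _example_classification_error():
    output = np.array([[0.9, 0.1],
                       [0.3, 0.7],
                       [0.8, 0.2]])
    targets = np.array([[1.0, 0.0],
                        [1.0, 0.0],
                        [np.nan, np.nan]])
    errors = ClassificationError().loss([output], targets)[-1]
    # first example correct, second misclassified, third skipped (nan targets)
    assert np.array_equal(errors, [False, True, False])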


class StructuralDamping(LossFunction):
    """Applies structural damping, which penalizes layers for having highly
    variable output activity.

    Note: this is not exactly the same as the structural damping in
    Martens (2010), because it is applied on the output side of the
    nonlinearity (meaning that this error will be filtered through
    ``d_activations`` during the backwards propagation).

    :param float weight: scale on structural damping relative to other losses
    :param list layers: indices specifying which layers will have the damping
        applied (defaults to all except first/last layers)
    :param optimizer: if provided, the weight on structural damping will be
        scaled relative to the ``damping`` attribute in the optimizer (so that
        any processes dynamically adjusting the damping during the
        optimization will also affect the structural damping)
    :type optimizer: :class:`~hessianfree.optimizers.Optimizer`
    """

    def __init__(self, weight, layers=None, optimizer=None):
        self.weight = weight
        self.layers = (np.index_exp[1:-1] if layers is None else
                       np.asarray(layers))
        self.opt = optimizer

    def loss(self, activities, _):
        return [None for _ in activities]

    def d_loss(self, activities, _):
        return [None for _ in activities]

    def d2_loss(self, activities, _):
        opt_damp = 1 if self.opt is None else getattr(self.opt, "damping", 1)

        d2_loss = [None for _ in activities]
        for l in np.arange(len(activities))[self.layers]:
            d2_loss[l] = np.ones_like(activities[l]) * self.weight * opt_damp

        return d2_loss
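

# Added commentary (not part of the original module): a minimal sketch of the
# structural damping penalty on a hypothetical three-layer network.  By
# default only the hidden layer is penalized, and the penalty scales with the
# optimizer's ``damping`` attribute when an optimizer is given (FakeOpt below
# is a stand-in, not a real hessianfree optimizer).
def _example_structural_damping():
    activities = [np.zeros((2, 3)), np.zeros((2, 4)), np.zeros((2, 1))]

    d2 = StructuralDamping(weight=0.1).d2_loss(activities, None)
    assert d2[0] is None and d2[2] is None
    assert np.allclose(d2[1], 0.1)

    class FakeOpt:
        damping = 0.5

    scaled = StructuralDamping(weight=0.1, optimizer=FakeOpt()).d2_loss(
        activities, None)
    assert np.allclose(scaled[1], 0.1 * 0.5)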


class SparseL1(LossFunction):
    """Imposes L1 sparsity constraint on nonlinearity activations.

    :param float weight: relative weight of sparsity constraint
    :param list layers: indices specifying which layers will have the sparsity
        constraint applied (defaults to all except first/last layers)
    :param float target: target activation level for nonlinearities
    """

    def __init__(self, weight, layers=None, target=0.0):
        # TODO: is it valid to apply L1 sparsity to HF, given that CG is meant
        # to optimize quadratic loss functions?

        self.weight = weight
        self.layers = np.index_exp[1:-1] if layers is None else layers
        self.target = target

    def loss(self, activities, _):
        loss = [None for _ in activities]
        for l in np.arange(len(activities))[self.layers]:
            loss[l] = self.weight * np.abs(activities[l] - self.target)
        return loss

    def d_loss(self, activities, _):
        d_loss = [None for _ in activities]
        for l in np.arange(len(activities))[self.layers]:
            d_loss[l] = self.weight * ((activities[l] > self.target) * 2 - 1)
        return d_loss

    def d2_loss(self, activities, _):
        return [None for _ in activities]


class SparseL2(LossFunction):
    """Imposes L2 sparsity constraint on nonlinearity activations.

    :param float weight: relative weight of sparsity constraint
    :param list layers: indices specifying which layers will have the sparsity
        constraint applied (defaults to all except first/last layers)
    :param float target: target activation level for nonlinearities
    """

    # note: this is similar to structural damping, except we also include it
    # in the first derivative

    # TODO: test how well this works relative to standard structural damping

    def __init__(self, weight, layers=None, target=0.0):
        self.weight = weight
        self.layers = np.index_exp[1:-1] if layers is None else layers
        self.target = target

    def loss(self, activities, _):
        loss = [None for _ in activities]
        for l in np.arange(len(activities))[self.layers]:
            loss[l] = 0.5 * self.weight * (activities[l] - self.target) ** 2
        return loss

    def d_loss(self, activities, _):
        d_loss = [None for _ in activities]
        for l in np.arange(len(activities))[self.layers]:
            d_loss[l] = self.weight * (activities[l] - self.target)
        return d_loss

    def d2_loss(self, activities, _):
        d2_loss = [None for _ in activities]
        for l in np.arange(len(activities))[self.layers]:
            d2_loss[l] = np.ones_like(activities[l]) * self.weight
        return d2_loss
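

# Added commentary (not part of the original module): a small sketch of the L2
# sparsity penalty on a hypothetical three-layer network; only the hidden
# layer is penalized by default, and its first derivative pushes activities
# back towards the target level.
def _example_sparse_l2():
    sparse = SparseL2(weight=0.2, target=0.1)
    activities = [np.zeros((2, 3)), np.full((2, 4), 0.6), np.zeros((2, 1))]

    d_loss = sparse.d_loss(activities, None)
    assert d_loss[0] is None and d_loss[2] is None
    assert np.allclose(d_loss[1], 0.2 * (0.6 - 0.1))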


class LossSet(LossFunction):
    """Combines several loss functions into one (e.g., combining
    :class:`SquaredError` and :class:`SparseL2`).

    It doesn't need to be created directly; a list of loss functions can be
    passed to :class:`.FFNet`/:class:`.RNNet` and a LossSet will be created
    automatically.

    :param list set: list of :class:`LossFunction`
    """

    def __init__(self, set):
        self.set = set

    def group_func(self, func_name, activities, targets):
        """Computes the given function for each :class:`LossFunction` in the
        set, and sums the result."""

        # apply each of the loss functions
        result = [getattr(s, func_name)(activities, targets)
                  for s in self.set]

        # sum the losses for each layer across the loss functions
        result = [np.sum([s[i] for s in result if s[i] is not None], axis=0)
                  for i in range(len(activities))]

        # convert 0.0's (from np.sum([])) back to None
        result = [None if (isinstance(x, float) and x == 0.0) else x
                  for x in result]

        return result

    def loss(self, activities, targets):
        return self.group_func("loss", activities, targets)

    def d_loss(self, activities, targets):
        return self.group_func("d_loss", activities, targets)

    def d2_loss(self, activities, targets):
        return self.group_func("d2_loss", activities, targets)
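

# Added commentary (not part of the original module): a minimal sketch of how
# LossSet combines per-layer losses from several loss functions, using a
# hypothetical three-layer network with arbitrary values.
def _example_loss_set():
    loss_set = LossSet([SquaredError(), SparseL2(weight=0.1)])
    activities = [np.zeros((2, 3)),
                  np.full((2, 4), 0.5),
                  np.array([[0.0, 1.0], [1.0, 0.0]])]
    targets = np.array([[0.0, 0.0], [1.0, 1.0]])

    losses = loss_set.loss(activities, targets)
    assert losses[0] is None  # the input layer has no loss
    assert np.allclose(losses[1], 0.5 * 0.1 * 0.5 ** 2)  # sparsity penalty
    assert np.allclose(losses[2], [0.5, 0.5])  # squared error on the output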