# -*- coding: utf-8 -*-
"""Contains example domains that LSPI works on."""
import abc
from random import randint, random
import numpy as np
from sample import Sample
class Domain(object):
r"""ABC for domains.
Minimum interface for a reinforcement learning domain.
"""
__metaclass__ = abc.ABCMeta
@abc.abstractmethod
    def num_actions(self):
"""Return number of possible actions for the given domain.
Actions are indexed from 0 to num_actions - 1.
Returns
-------
int
Number of possible actions.
"""
pass # pragma: no cover
@abc.abstractmethod
    def current_state(self):
"""Return the current state of the domain.
Returns
-------
numpy.array
The current state of the environment expressed as a numpy array
of the individual state variables.
"""
pass # pragma: no cover
@abc.abstractmethod
    def apply_action(self, action):
"""Apply action and return a sample.
Parameters
----------
action: int
The action index to apply. This should be a number in the range
[0, num_actions())
Returns
-------
sample.Sample
Sample containing the previous state, the action applied, the
received reward and the resulting state.
"""
pass # pragma: no cover
@abc.abstractmethod
    def reset(self, initial_state=None):
"""Reset the simulator to initial conditions.
Parameters
----------
initial_state: numpy.array
Optionally specify the state to reset to. If None then the domain
            should use its default initial state. The type will
generally be a numpy.array, but a subclass may accept other types.
"""
pass # pragma: no cover
@abc.abstractmethod
    def action_name(self, action):
"""Return a string representation of the action.
Parameters
----------
action: int
The action index to apply. This number should be in the range
[0, num_actions())
Returns
-------
str
String representation of the action index.
"""
pass # pragma: no cover
class ChainDomain(Domain):
"""Chain domain from LSPI paper.
Very simple MDP. Used to test LSPI methods and demonstrate the interface.
The state space is a series of discrete nodes in a chain. There are two
actions: Left and Right. These actions fail with a configurable
    probability. When an action fails, the opposite action is performed. In
    other words, if left is applied but fails, the agent will actually move
    right (assuming it is not already in the rightmost state).
The default reward for any action in a state is 0. There are 2 special
    states that give a +1 reward when entered. The two special states can
    be configured to appear at the ends of the chain, in the middle, or
in the middle of each half of the state space.
Parameters
----------
num_states: int
Number of states in the chain. Must be at least 4.
Defaults to 10 states.
    reward_location: ChainDomain.RewardLocation
        Location of the states that give the +1 reward.
failure_probability: float
The probability that the applied action will fail. Must be in range
[0, 1]
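
    Examples
    --------
    An illustrative interaction; the initial state and whether actions fail
    are random, so the exact values will vary::

        domain = ChainDomain(num_states=6)
        print(domain.current_state())   # e.g. array([3])
        domain.apply_action(1)          # attempt to move right
        print(domain.current_state())   # e.g. array([4])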
"""
    class RewardLocation(object):
"""Location of states giving +1 reward in the chain.
Ends:
Rewards will be given at the ends of the chain.
Middle:
Rewards will be given at the middle two states of the chain.
HalfMiddles:
Rewards will be given at the middle two states of each half
of the chain.
"""
Ends, Middle, HalfMiddles = range(3)
__action_names = ['left', 'right']
def __init__(self, num_states=10,
reward_location=RewardLocation.Ends,
failure_probability=.1):
"""Initialize ChainDomain."""
if num_states < 4:
raise ValueError('num_states must be >= 4')
if failure_probability < 0 or failure_probability > 1:
raise ValueError('failure_probability must be in range [0, 1]')
self.num_states = int(num_states)
self.reward_location = reward_location
self.failure_probability = failure_probability
self._state = ChainDomain.__init_random_state(num_states)
    def num_actions(self):
"""Return number of actions.
Chain domain has 2 actions.
Returns
-------
int
Number of actions
"""
return 2
    def current_state(self):
"""Return the current state of the domain.
Returns
-------
numpy.array
The current state as a 1D numpy vector of type int.
"""
return self._state
    def apply_action(self, action):
"""Apply the action to the chain.
        If left is applied then the state index will decrease by 1, unless
        the agent is already at 0, in which case the state will not change.
        If right is applied then the state index will increase by 1, unless
        the agent is already at num_states-1, in which case the state will
        not change.
The reward function is determined by the reward location specified when
constructing the domain.
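        For example, with the default num_states=10 and
        RewardLocation.Middle the +1 reward is given for entering state 5 or
        6, that is int(10/2) and int(10/2 + 1).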
        If failure_probability is > 0 then there is a chance that the applied
        action will fail. If the left action fails then the agent will move
        right. Similarly, if the right action fails then the agent will move
        left.
Parameters
----------
action: int
Action index. Must be in range [0, num_actions())
Returns
-------
sample.Sample
The sample for the applied action.
Raises
------
ValueError
If the action index is outside of the range [0, num_actions())
"""
if action < 0 or action >= 2:
raise ValueError('Action index outside of bounds [0, %d)' %
self.num_actions())
action_failed = False
if random() < self.failure_probability:
action_failed = True
        # the state is a 1-element array holding the agent's node index
if (action == 0 and not action_failed) \
or (action == 1 and action_failed):
new_location = max(0, self._state[0]-1)
else:
new_location = min(self.num_states-1, self._state[0]+1)
next_state = np.array([new_location])
reward = 0
if self.reward_location == ChainDomain.RewardLocation.Ends:
if new_location == 0 or new_location == self.num_states-1:
reward = 1
elif self.reward_location == ChainDomain.RewardLocation.Middle:
if new_location == int(self.num_states/2) \
or new_location == int(self.num_states/2 + 1):
reward = 1
else: # HalfMiddles case
if new_location == int(self.num_states/4) \
or new_location == int(3*self.num_states/4):
reward = 1
sample = Sample(self._state.copy(), action, reward, next_state.copy())
self._state = next_state
return sample
    def reset(self, initial_state=None):
"""Reset the domain to initial state or specified state.
If the state is unspecified then it will generate a random state, just
like when constructing from scratch.
        The specified state must be a 1D numpy array containing a single
        value: the index of the occupied chain node, in the range
        [0, num_states). Whatever numpy array type is used, it will be
        converted to an integer numpy array.
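        For example, calling reset(numpy.array([0])) places the agent back at
        the first node of the chain.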
Parameters
----------
initial_state: numpy.array
The state to set the simulator to. If None then set to a random
state.
Raises
------
ValueError
            If the initial state's shape does not match (1, ). In other
            words the initial state must be a 1D numpy array containing a
            single element.
        ValueError
            If the state value is outside the range [0, num_states).
"""
if initial_state is None:
self._state = ChainDomain.__init_random_state(self.num_states)
else:
if initial_state.shape != (1, ):
raise ValueError('The specified state did not match the '
+ 'current state size')
            # np.int was removed from NumPy; the builtin int is equivalent
            state = initial_state.astype(int)
if state[0] < 0 or state[0] >= self.num_states:
raise ValueError('State value must be in range '
+ '[0, num_states)')
self._state = state
    def action_name(self, action):
"""Return string representation of actions.
0:
left
1:
right
Returns
-------
str
String representation of action.
"""
return ChainDomain.__action_names[action]
@staticmethod
def __init_random_state(num_states):
"""Return randomly initialized state of the specified size."""
return np.array([randint(0, num_states-1)])
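

if __name__ == '__main__':
    # Illustrative sketch only: step through a short random-walk episode on
    # the chain and print what happens. This assumes the imported Sample
    # class exposes a `reward` attribute matching the positional arguments
    # passed to it in apply_action (state, action, reward, next_state).
    domain = ChainDomain(num_states=8,
                         reward_location=ChainDomain.RewardLocation.Middle,
                         failure_probability=.2)
    print('initial state: %s' % domain.current_state())
    for _ in range(5):
        action = randint(0, domain.num_actions() - 1)
        sample = domain.apply_action(action)
        print('applied %s -> state %s, reward %d'
              % (domain.action_name(action), domain.current_state(),
                 sample.reward))
    domain.reset(np.array([0]))
    print('state after reset: %s' % domain.current_state())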