"""
This module implements a replay buffer memory.
Replay buffer is an important technique in reinforcement learning. It
stores transitions in a memory buffer of fixed size. When the buffer is
full, oldest memory will be discarded. At each step, a batch of memories
will be sampled from the buffer to update the agent's parameters. In a
word, replay buffer breaks temporal correlations and thus benefits RL
algorithms.
"""
import abc
from abc import abstractmethod
import numpy as np
class ReplayBuffer(metaclass=abc.ABCMeta):
    """Abstract base class for replay buffers.

    Transitions are accumulated in a per-episode staging buffer until a
    full episode of ``time_horizon`` steps has been collected; the
    episode is then copied into a fixed-size ring buffer holding
    ``size_in_transitions // time_horizon`` episodes, overwriting the
    oldest episodes once capacity is reached.

    Args:
        env_spec (garage.envs.EnvSpec): Environment specification.
        size_in_transitions (int): Total size of transitions in the buffer.
        time_horizon (int): Time horizon of rollout.
    """

    def __init__(self, env_spec, size_in_transitions, time_horizon):
        # Episodes stored so far, and the write pointer used once the
        # ring buffer has wrapped around.
        self._current_size = 0
        self._current_ptr = 0
        self._n_transitions_stored = 0
        self._time_horizon = time_horizon
        self._size_in_transitions = size_in_transitions
        # Capacity in whole episodes; a trailing partial episode's worth
        # of transitions is unused.
        self._size = size_in_transitions // time_horizon
        # Storage is allocated lazily from the first transition's
        # keys/shapes/dtypes (see _initialize_buffer).
        self._initialized_buffer = False
        self._buffer = {}
        self._episode_buffer = {}

    def store_episode(self):
        """Add an episode to the buffer."""
        episode_buffer = self._convert_episode_to_batch_major()
        rollout_batch_size = len(episode_buffer['observation'])
        idx = self._get_storage_idx(rollout_batch_size)
        for key in self._buffer.keys():
            self._buffer[key][idx] = episode_buffer[key]
        # Saturates at capacity; overwritten episodes do not add count.
        self._n_transitions_stored = min(
            self._size_in_transitions, self._n_transitions_stored +
            self._time_horizon * rollout_batch_size)

    @abstractmethod
    def sample(self, batch_size):
        """Sample a transition of batch_size.

        Args:
            batch_size (int): Number of transitions to sample.
        """
        raise NotImplementedError

    def add_transition(self, **kwargs):
        """Add one transition into the replay buffer.

        Args:
            kwargs (dict(str, numpy.ndarray)): One entry per transition
                field, e.g. observation, action, reward.
        """
        transition = {k: [v] for k, v in kwargs.items()}
        self.add_transitions(**transition)

    def add_transitions(self, **kwargs):
        """Add multiple transitions into the replay buffer.

        A transition contains one or multiple entries, e.g.
        observation, action, reward, terminal and next_observation.
        The same entry of all the transitions are stacked, e.g.
        {'observation': [obs1, obs2, obs3]} where obs1 is one
        numpy.ndarray observation from the environment.

        Args:
            kwargs (dict(str, [numpy.ndarray])): Dictionary that holds
                the transitions.
        """
        if not self._initialized_buffer:
            self._initialize_buffer(**kwargs)
        for key, value in kwargs.items():
            self._episode_buffer[key].append(value)
        # A full episode has been staged; commit it to the main buffer
        # and reset the staging buffer.
        if len(self._episode_buffer['observation']) == self._time_horizon:
            self.store_episode()
            for key in self._episode_buffer.keys():
                self._episode_buffer[key].clear()

    def _initialize_buffer(self, **kwargs):
        """Allocate storage keyed on the first transition's fields.

        Args:
            kwargs (dict(str, [numpy.ndarray])): First batch of
                transitions, used to infer per-field shape and dtype.
        """
        for key, value in kwargs.items():
            self._episode_buffer[key] = list()
            values = np.array(value)
            self._buffer[key] = np.zeros(
                [self._size, self._time_horizon, *values.shape[1:]],
                dtype=values.dtype)
        self._initialized_buffer = True

    def _get_storage_idx(self, size_increment=1):
        """Get the storage index for the episode to add into the buffer.

        Args:
            size_increment (int): Number of episode slots to reserve.

        Returns:
            numpy.ndarray or int: Episode indices to write to; a scalar
                index when ``size_increment`` is 1.
        """
        if self._current_size + size_increment <= self._size:
            # Buffer not yet full: append at the end.
            idx = np.arange(self._current_size,
                            self._current_size + size_increment)
        elif self._current_size < self._size:
            # Filling the remaining space wraps to the start.
            overflow = size_increment - (self._size - self._current_size)
            idx_a = np.arange(self._current_size, self._size)
            idx_b = np.arange(0, overflow)
            idx = np.concatenate([idx_a, idx_b])
            self._current_ptr = overflow
        else:
            # Buffer full: overwrite the oldest episodes, starting at
            # the write pointer.
            if self._current_ptr + size_increment <= self._size:
                idx = np.arange(self._current_ptr,
                                self._current_ptr + size_increment)
                self._current_ptr += size_increment
            else:
                # BUGFIX: overflow must be measured from _current_ptr,
                # not _current_size (which equals _size in this branch
                # and yielded overflow == size_increment, producing an
                # index array longer than size_increment).
                overflow = size_increment - (self._size - self._current_ptr)
                idx_a = np.arange(self._current_ptr, self._size)
                idx_b = np.arange(0, overflow)
                idx = np.concatenate([idx_a, idx_b])
                self._current_ptr = overflow
        # Update replay size
        self._current_size = min(self._size,
                                 self._current_size + size_increment)
        if size_increment == 1:
            idx = idx[0]
        return idx

    def _convert_episode_to_batch_major(self):
        """Convert the shape of episode_buffer.

        episode_buffer: {time_horizon, algo.rollout_batch_size, flat_dim}.
        buffer: {size, time_horizon, flat_dim}.

        Returns:
            dict(str, numpy.ndarray): Staged episode data with the first
                two axes swapped to batch-major order.
        """
        transitions = {}
        for key in self._episode_buffer.keys():
            val = np.array(self._episode_buffer[key])
            transitions[key] = val.swapaxes(0, 1)
        return transitions

    @property
    def full(self):
        """bool: Whether the buffer holds its full capacity of episodes."""
        return self._current_size == self._size

    @property
    def n_transitions_stored(self):
        """int: Number of transitions currently stored in the buffer."""
        return self._n_transitions_stored