"""Data types for agent-based learning."""
import collections
import akro
import numpy as np
from garage.misc import tensor_utils
class TrajectoryBatch(
collections.namedtuple('TrajectoryBatch', [
'env_spec',
'observations',
'last_observations',
'actions',
'rewards',
'terminals',
'env_infos',
'agent_infos',
'lengths',
])):
# pylint: disable=missing-return-doc, missing-return-type-doc, missing-param-doc, missing-type-doc # noqa: E501
r"""A tuple representing a batch of whole trajectories.
Data type for on-policy algorithms.
A :class:`TrajectoryBatch` represents a batch of whole trajectories
produced when one or more agents interact with one or more environments.
+-----------------------+-------------------------------------------------+
| Symbol | Description |
+=======================+=================================================+
| :math:`N` | Trajectory index dimension |
+-----------------------+-------------------------------------------------+
| :math:`[T]` | Variable-length time dimension of each |
| | trajectory |
+-----------------------+-------------------------------------------------+
| :math:`S^*` | Single-step shape of a time-series tensor |
+-----------------------+-------------------------------------------------+
| :math:`O^*` | Non-flattened shape of a single observation |
+-----------------------+-------------------------------------------------+
| :math:`A^*` | Non-flattened shape of a single action |
+-----------------------+-------------------------------------------------+
| :math:`N \bullet [T]` | A dimension computed by flattening a |
| | variable-length time dimension :math:`[T]` into |
| | a single batch dimension with length |
| | :math:`\sum_{i \in N} [T]_i` |
+-----------------------+-------------------------------------------------+
Attributes:
env_spec (garage.envs.EnvSpec): Specification for the environment from
which this data was sampled.
observations (numpy.ndarray): A numpy array of shape
:math:`(N \bullet [T], O^*)` containing the (possibly
multi-dimensional) observations for all time steps in this batch.
These must conform to :obj:`env_spec.observation_space`.
last_observations (numpy.ndarray): A numpy array of shape
:math:`(N, O^*)` containing the last observation of each
trajectory. This is necessary since there is one more
observation than there are actions in every trajectory.
actions (numpy.ndarray): A numpy array of shape
:math:`(N \bullet [T], A^*)` containing the (possibly
multi-dimensional) actions for all time steps in this batch. These
must conform to :obj:`env_spec.action_space`.
rewards (numpy.ndarray): A numpy array of shape
:math:`(N \bullet [T])` containing the rewards for all time steps
in this batch.
terminals (numpy.ndarray): A boolean numpy array of shape
:math:`(N \bullet [T])` containing the termination signals for all
time steps in this batch.
env_infos (dict): A dict of numpy arrays containing arbitrary environment state
information. Each value of this dict should be a numpy array of
shape :math:`(N \bullet [T])` or :math:`(N \bullet [T], S^*)`.
agent_infos (dict): A dict of numpy arrays containing arbitrary agent
state information. Each value of this dict should be a numpy array
of shape :math:`(N \bullet [T])` or :math:`(N \bullet [T], S^*)`.
For example, this may contain the hidden states from an RNN policy.
lengths (numpy.ndarray): An integer numpy array of shape :math:`(N,)`
containing the length of each trajectory in this batch. This may be
used to reconstruct the individual trajectories.
Raises:
ValueError: If any of the above attributes do not conform to their
prescribed types and shapes.
"""
__slots__ = ()
def __new__(cls, env_spec, observations, last_observations, actions,
rewards, terminals, env_infos, agent_infos,
lengths): # noqa: D102
# pylint: disable=too-many-branches
first_observation = observations[0]
first_action = actions[0]
inferred_batch_size = lengths.sum()
# lengths
if len(lengths.shape) != 1:
raise ValueError(
'Lengths tensor must be a tensor of shape (N,), but got a '
'tensor of shape {} instead'.format(lengths.shape))
if not (lengths.dtype.kind == 'u' or lengths.dtype.kind == 'i'):
raise ValueError(
'Lengths tensor must have an integer dtype, but got dtype {} '
'instead.'.format(lengths.dtype))
# observations
if not env_spec.observation_space.contains(first_observation):
# Discrete observations can be either in the space normally, or
# one-hot encoded.
if isinstance(env_spec.observation_space,
(akro.Box, akro.Discrete, akro.Dict)):
if env_spec.observation_space.flat_dim != np.prod(
first_observation.shape):
raise ValueError('observations should have the same '
'dimensionality as the observation_space '
'({}), but got data with shape {} '
'instead'.format(
env_spec.observation_space.flat_dim,
first_observation.shape))
else:
raise ValueError(
'observations must conform to observation_space {}, but '
'got data with shape {} instead.'.format(
env_spec.observation_space, first_observation))
if observations.shape[0] != inferred_batch_size:
raise ValueError(
'Expected batch dimension of observations to be length {}, '
'but got length {} instead.'.format(inferred_batch_size,
observations.shape[0]))
# last_observations
if not env_spec.observation_space.contains(last_observations[0]):
# Discrete observations can be either in the space normally, or
# one-hot encoded.
if isinstance(env_spec.observation_space,
(akro.Box, akro.Discrete, akro.Dict)):
if env_spec.observation_space.flat_dim != np.prod(
last_observations[0].shape):
raise ValueError('last_observations should have the same '
'dimensionality as the observation_space '
'({}), but got data with shape {} '
'instead'.format(
env_spec.observation_space.flat_dim,
last_observations[0].shape))
else:
raise ValueError(
'last_observations must conform to observation_space {}, '
'but got data with shape {} instead.'.format(
env_spec.observation_space, last_observations[0]))
if last_observations.shape[0] != len(lengths):
raise ValueError(
'Expected batch dimension of last_observations to be length '
'{}, but got length {} instead.'.format(
len(lengths), last_observations.shape[0]))
# actions
if not env_spec.action_space.contains(first_action):
# Discrete actions can be either in the space normally, or one-hot
# encoded.
if isinstance(env_spec.action_space,
(akro.Box, akro.Discrete, akro.Dict)):
if env_spec.action_space.flat_dim != np.prod(
first_action.shape):
raise ValueError('actions should have the same '
'dimensionality as the action_space '
'({}), but got data with shape {} '
'instead'.format(
env_spec.action_space.flat_dim,
first_action.shape))
else:
raise ValueError(
'actions must conform to action_space {}, but got data '
'with shape {} instead.'.format(env_spec.action_space,
first_action))
if actions.shape[0] != inferred_batch_size:
raise ValueError(
'Expected batch dimension of actions to be length {}, but got '
'length {} instead.'.format(inferred_batch_size,
actions.shape[0]))
# rewards
if rewards.shape != (inferred_batch_size, ):
raise ValueError(
'Rewards tensor must have shape {}, but got shape {} '
'instead.'.format((inferred_batch_size, ), rewards.shape))
# terminals
if terminals.shape != (inferred_batch_size, ):
raise ValueError(
'terminals tensor must have shape {}, but got shape {} '
'instead.'.format((inferred_batch_size, ), terminals.shape))
if terminals.dtype != np.bool_:
raise ValueError(
'terminals tensor must be dtype np.bool_, but got tensor '
'of dtype {} instead.'.format(terminals.dtype))
# env_infos
for key, val in env_infos.items():
if not isinstance(val, (dict, np.ndarray)):
raise ValueError(
'Each entry in env_infos must be a numpy array or '
'dictionary, but got key {} with value type {} instead.'.
format(key, type(val)))
if (isinstance(val, np.ndarray)
and val.shape[0] != inferred_batch_size):
raise ValueError(
'Each entry in env_infos must have a batch dimension of '
'length {}, but got key {} with batch size {} instead.'.
format(inferred_batch_size, key, val.shape[0]))
# agent_infos
for key, val in agent_infos.items():
if not isinstance(val, (dict, np.ndarray)):
raise ValueError(
'Each entry in agent_infos must be a numpy array or '
'dictionary, but got key {} with value type {} '
'instead.'.format(key, type(val)))
if (isinstance(val, np.ndarray)
and val.shape[0] != inferred_batch_size):
raise ValueError(
'Each entry in agent_infos must have a batch dimension of '
'length {}, but got key {} with batch size {} instead.'.
format(inferred_batch_size, key, val.shape[0]))
return super().__new__(TrajectoryBatch, env_spec, observations,
last_observations, actions, rewards, terminals,
env_infos, agent_infos, lengths)
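# A minimal construction sketch (hypothetical values; assumes `env_spec`
# describes a Box observation space with flat_dim 4 and a Discrete(2)
# action space, with the actions stored one-hot encoded). Two
# trajectories of lengths 2 and 1 flatten to a batch dimension of 3:
#
#     batch = TrajectoryBatch(
#         env_spec=env_spec,
#         observations=np.zeros((3, 4)),
#         last_observations=np.zeros((2, 4)),
#         actions=np.zeros((3, 2)),
#         rewards=np.zeros(3),
#         terminals=np.array([False, True, True]),
#         env_infos={},
#         agent_infos={},
#         lengths=np.array([2, 1]),
#     )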
@classmethod
def concatenate(cls, *batches):
"""Create a TrajectoryBatch by concatenating TrajectoryBatches.
Args:
batches (list[TrajectoryBatch]): Batches to concatenate.
Returns:
TrajectoryBatch: The concatenation of the batches.
"""
if __debug__:
for b in batches:
assert (set(b.env_infos.keys()) == set(
batches[0].env_infos.keys()))
assert (set(b.agent_infos.keys()) == set(
batches[0].agent_infos.keys()))
env_infos = {
k: np.concatenate([b.env_infos[k] for b in batches])
for k in batches[0].env_infos.keys()
}
agent_infos = {
k: np.concatenate([b.agent_infos[k] for b in batches])
for k in batches[0].agent_infos.keys()
}
return cls(
batches[0].env_spec,
np.concatenate([batch.observations for batch in batches]),
np.concatenate([batch.last_observations for batch in batches]),
np.concatenate([batch.actions for batch in batches]),
np.concatenate([batch.rewards for batch in batches]),
np.concatenate([batch.terminals for batch in batches]), env_infos,
agent_infos, np.concatenate([batch.lengths for batch in batches]))
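# Usage sketch: merging batches sampled from the same environment
# (assumes hypothetical `batch_a` and `batch_b` share env_infos and
# agent_infos keys, as the asserts above require):
#
#     merged = TrajectoryBatch.concatenate(batch_a, batch_b)
#     assert merged.lengths.sum() == (batch_a.lengths.sum()
#                                     + batch_b.lengths.sum())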
def split(self):
"""Split a TrajectoryBatch into a list of TrajectoryBatches.
The opposite of concatenate.
Returns:
list[TrajectoryBatch]: A list of TrajectoryBatches, with one
trajectory per batch.
"""
trajectories = []
start = 0
for i, length in enumerate(self.lengths):
stop = start + length
traj = TrajectoryBatch(env_spec=self.env_spec,
observations=self.observations[start:stop],
last_observations=np.asarray(
[self.last_observations[i]]),
actions=self.actions[start:stop],
rewards=self.rewards[start:stop],
terminals=self.terminals[start:stop],
env_infos=tensor_utils.slice_nested_dict(
self.env_infos, start, stop),
agent_infos=tensor_utils.slice_nested_dict(
self.agent_infos, start, stop),
lengths=np.asarray([length]))
trajectories.append(traj)
start = stop
return trajectories
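# Usage sketch: split is the inverse of concatenate, so a round trip
# through it preserves the batch (hypothetical `batch`):
#
#     singles = batch.split()  # one TrajectoryBatch per trajectory
#     rebuilt = TrajectoryBatch.concatenate(*singles)
#     assert (rebuilt.rewards == batch.rewards).all()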
def to_trajectory_list(self):
"""Convert the batch into a list of dictionaries.
Returns:
list[dict[str, np.ndarray or dict[str, np.ndarray]]]: Keys:
* observations (np.ndarray): Non-flattened array of
observations. Has shape (T, S^*) (the unflattened state
space of the current environment). observations[i] was
used by the agent to choose actions[i].
* next_observations (np.ndarray): Non-flattened array of
observations. Has shape (T, S^*). next_observations[i] was
observed by the agent after taking actions[i].
* actions (np.ndarray): Non-flattened array of actions. Should
have shape (T, S^*) (the unflattened action space of the
current environment).
* rewards (np.ndarray): Array of rewards of shape (T,) (1D
array of length timesteps).
* dones (np.ndarray): Array of dones of shape (T,) (1D array
of length timesteps).
* agent_infos (dict[str, np.ndarray]): Dictionary of stacked,
non-flattened `agent_info` arrays.
* env_infos (dict[str, np.ndarray]): Dictionary of stacked,
non-flattened `env_info` arrays.
"""
start = 0
trajectories = []
for i, length in enumerate(self.lengths):
stop = start + length
trajectories.append({
'observations':
self.observations[start:stop],
'next_observations':
np.concatenate((self.observations[1 + start:stop],
[self.last_observations[i]])),
'actions':
self.actions[start:stop],
'rewards':
self.rewards[start:stop],
'env_infos':
{k: v[start:stop]
for (k, v) in self.env_infos.items()},
'agent_infos':
{k: v[start:stop]
for (k, v) in self.agent_infos.items()},
'dones':
self.terminals[start:stop]
})
start = stop
return trajectories
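# Usage sketch: each returned dict describes one trajectory, with
# next_observations built by shifting observations one step and
# appending that trajectory's last observation (hypothetical `batch`):
#
#     for path in batch.to_trajectory_list():
#         assert len(path['next_observations']) == len(path['actions'])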
@classmethod
def from_trajectory_list(cls, env_spec, paths):
"""Create a TrajectoryBatch from a list of trajectories.
Args:
env_spec (garage.envs.EnvSpec): Specification for the environment
from which this data was sampled.
paths (list[dict[str, np.ndarray or dict[str, np.ndarray]]]): Keys:
* observations (np.ndarray): Non-flattened array of
observations. Typically has shape (T, S^*) (the unflattened
state space of the current environment). observations[i]
was used by the agent to choose actions[i]. observations
may instead have shape (T + 1, S^*).
* next_observations (np.ndarray): Non-flattened array of
observations. Has shape (T, S^*). next_observations[i] was
observed by the agent after taking actions[i]. Optional.
Note that to ensure all information from the environment
was preserved, observations[i] should have shape (T + 1,
S^*), or this key should be set. However, this method is
lenient and will "duplicate" the last observation if the
original last observation has been lost.
* actions (np.ndarray): Non-flattened array of actions. Should
have shape (T, S^*) (the unflattened action space of the
current environment).
* rewards (np.ndarray): Array of rewards of shape (T,) (1D
array of length timesteps).
* dones (np.ndarray): Array of dones of shape (T,) (1D array
of length timesteps).
* agent_infos (dict[str, np.ndarray]): Dictionary of stacked,
non-flattened `agent_info` arrays.
* env_infos (dict[str, np.ndarray]): Dictionary of stacked,
non-flattened `env_info` arrays.
"""
lengths = np.asarray([len(p['rewards']) for p in paths])
if all(
len(path['observations']) == length + 1
for (path, length) in zip(paths, lengths)):
last_observations = np.asarray(
[p['observations'][-1] for p in paths])
observations = np.concatenate(
[p['observations'][:-1] for p in paths])
else:
# The number of observations and timesteps must match.
observations = np.concatenate([p['observations'] for p in paths])
if paths[0].get('next_observations') is not None:
last_observations = np.asarray(
[p['next_observations'][-1] for p in paths])
else:
last_observations = np.asarray(
[p['observations'][-1] for p in paths])
stacked_paths = tensor_utils.concat_tensor_dict_list(paths)
return cls(env_spec=env_spec,
observations=observations,
last_observations=last_observations,
actions=stacked_paths['actions'],
rewards=stacked_paths['rewards'],
terminals=stacked_paths['dones'],
env_infos=stacked_paths['env_infos'],
agent_infos=stacked_paths['agent_infos'],
lengths=lengths)
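# Usage sketch: this is the inverse of to_trajectory_list, so the two
# can round-trip a batch (hypothetical `batch` sampled from `env_spec`):
#
#     paths = batch.to_trajectory_list()
#     rebuilt = TrajectoryBatch.from_trajectory_list(env_spec, paths)
#     assert (rebuilt.lengths == batch.lengths).all()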
class TimeStep(
collections.namedtuple('TimeStep', [
'env_spec',
'observation',
'action',
'reward',
'next_observation',
'terminal',
'env_info',
'agent_info',
])):
# pylint: disable=missing-return-doc, missing-return-type-doc, missing-param-doc, missing-type-doc # noqa: E501
r"""A tuple representing a single TimeStep.
A :class:`TimeStep` represents a single sample when an agent interacts with
an environment.
Attributes:
env_spec (garage.envs.EnvSpec): Specification for the environment from
which this data was sampled.
observation (numpy.ndarray): A numpy array of shape :math:`(O^*)`
containing the observation for this time step in the
environment. These must conform to
:obj:`env_spec.observation_space`.
action (numpy.ndarray): A numpy array of shape :math:`(A^*)`
containing the action for this time step. These must conform
to :obj:`env_spec.action_space`.
reward (float): A float representing the reward for taking the action
given the observation, at this time step.
next_observation (numpy.ndarray): A numpy array of shape
:math:`(O^*)` containing the observation for the next time step
in the environment. These must conform to
:obj:`env_spec.observation_space`.
terminal (bool): The termination signal for this time step.
env_info (dict): A dict of arbitrary environment state information.
agent_info (dict): A dict of arbitrary agent
state information. For example, this may contain the hidden states
from an RNN policy.
Raises:
ValueError: If any of the above attributes do not conform to their
prescribed types and shapes.
"""
def __new__(cls, env_spec, observation, action, reward, next_observation,
terminal, env_info, agent_info): # noqa: D102
# pylint: disable=too-many-branches
# observation
if not env_spec.observation_space.contains(observation):
if isinstance(env_spec.observation_space,
(akro.Box, akro.Discrete, akro.Dict)):
if env_spec.observation_space.flat_dim != np.prod(
observation.shape):
raise ValueError('observation should have the same '
'dimensionality as the observation_space '
'({}), but got data with shape {} '
'instead'.format(
env_spec.observation_space.flat_dim,
observation.shape))
else:
raise ValueError(
'observation must conform to observation_space {}, '
'but got data with shape {} instead.'.format(
env_spec.observation_space, observation))
if not env_spec.observation_space.contains(next_observation):
if isinstance(env_spec.observation_space,
(akro.Box, akro.Discrete, akro.Dict)):
if env_spec.observation_space.flat_dim != np.prod(
next_observation.shape):
raise ValueError('next_observation should have the same '
'dimensionality as the observation_space '
'({}), but got data with shape {} '
'instead'.format(
env_spec.observation_space.flat_dim,
next_observation.shape))
else:
raise ValueError(
'next_observation must conform to observation_space {}, '
'but got data with shape {} instead.'.format(
env_spec.observation_space, next_observation))
# action
if not env_spec.action_space.contains(action):
if isinstance(env_spec.action_space,
(akro.Box, akro.Discrete, akro.Dict)):
if env_spec.action_space.flat_dim != np.prod(action.shape):
raise ValueError('action should have the same '
'dimensionality as the action_space '
'({}), but got data with shape {} '
'instead'.format(
env_spec.action_space.flat_dim,
action.shape))
else:
raise ValueError('action must conform to action_space {}, '
'but got data with shape {} instead.'.format(
env_spec.action_space, action))
if not isinstance(agent_info, dict):
raise ValueError('agent_info must be type {}, but got type {} '
'instead.'.format(dict, type(agent_info)))
if not isinstance(env_info, dict):
raise ValueError('env_info must be type {}, but got type {} '
'instead.'.format(dict, type(env_info)))
# reward
if not isinstance(reward, float):
raise ValueError('reward must be type {}, but got type {} '
'instead.'.format(float, type(reward)))
if not isinstance(terminal, bool):
raise ValueError(
'terminal must be type bool, but got type {} instead.'.
format(type(terminal)))
return super().__new__(TimeStep, env_spec, observation, action, reward,
next_observation, terminal, env_info,
agent_info)
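# A minimal construction sketch (hypothetical values; assumes `env_spec`
# describes a Box observation space with flat_dim 4 and a Discrete(2)
# action space, with the action one-hot encoded). Note that reward must
# be a Python float and terminal a Python bool:
#
#     step = TimeStep(
#         env_spec=env_spec,
#         observation=np.zeros(4),
#         action=np.array([0, 1]),
#         reward=1.0,
#         next_observation=np.zeros(4),
#         terminal=False,
#         env_info={},
#         agent_info={},
#     )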
class InOutSpec:
"""Describes the input and output spaces of a primitive or module.
Args:
input_space (akro.Space): Input space of a module.
output_space (akro.Space): Output space of a module.
"""
def __init__(self, input_space, output_space):
self._input_space = input_space
self._output_space = output_space
@property
def input_space(self):
"""Get input space of the module.
Returns:
akro.Space: Input space of the module.
"""
return self._input_space
@property
def output_space(self):
"""Get output space of the module.
Returns:
akro.Space: Output space of the module.
"""
return self._output_space
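# Usage sketch (the spaces here are illustrative):
#
#     spec = InOutSpec(input_space=akro.Box(low=-1, high=1, shape=(4,)),
#                      output_space=akro.Discrete(2))
#     assert spec.input_space.flat_dim == 4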
class TimeStepBatch(
collections.namedtuple('TimeStepBatch', [
'env_spec',
'observations',
'actions',
'rewards',
'next_observations',
'terminals',
'env_infos',
'agent_infos',
])):
# pylint: disable=missing-param-doc, missing-type-doc
"""A tuple representing a batch of TimeSteps.
Data type for off-policy algorithms, imitation learning and batch-RL.
Attributes:
env_spec (garage.envs.EnvSpec): Specification for the environment from
which this data was sampled.
observations (numpy.ndarray): Non-flattened array of observations.
Typically has shape (batch_size, S^*) (the unflattened state space
of the current environment).
actions (numpy.ndarray): Non-flattened array of actions. Should
have shape (batch_size, S^*) (the unflattened action space of the
current environment).
rewards (numpy.ndarray): Array of rewards of shape (batch_size,) (1D
array of length batch_size).
next_observations (numpy.ndarray): Non-flattened array of next
observations. Has shape (batch_size, S^*). next_observations[i] was
observed by the agent after taking actions[i].
terminals (numpy.ndarray): A boolean numpy array of shape
(batch_size,) containing the termination signals for all
transitions in this batch.
env_infos (dict): A dict of arbitrary environment state
information.
agent_infos (dict): A dict of arbitrary agent state information. For
example, this may contain the hidden states from an RNN policy.
Raises:
ValueError: If any of the above attributes do not conform to their
prescribed types and shapes.
"""
__slots__ = ()
def __new__(cls, env_spec, observations, actions, rewards,
next_observations, terminals, env_infos,
agent_infos): # noqa: D102
# pylint: disable=missing-return-doc, missing-return-type-doc,
# pylint: disable=too-many-branches
inferred_batch_size = len(terminals)
if inferred_batch_size < 1:
raise ValueError(
'Expected batch dimension of terminals to be greater than '
'zero, but got length {} instead.'.format(inferred_batch_size))
first_observation = observations[0]
first_action = actions[0]
# observations
if not env_spec.observation_space.contains(first_observation):
if isinstance(env_spec.observation_space,
(akro.Box, akro.Discrete, akro.Dict)):
if env_spec.observation_space.flat_dim != np.prod(
first_observation.shape):
raise ValueError('observations should have the same '
'dimensionality as the observation_space '
'({}), but got data with shape {} '
'instead'.format(
env_spec.observation_space.flat_dim,
first_observation.shape))
else:
raise ValueError(
'observations must conform to observation_space {}, '
'but got data with shape {} instead.'.format(
env_spec.observation_space, first_observation.shape))
if observations.shape[0] != inferred_batch_size:
raise ValueError(
'Expected batch dimension of observations to be length {}, '
'but got length {} instead.'.format(inferred_batch_size,
observations.shape[0]))
# next_observations
if not env_spec.observation_space.contains(next_observations[0]):
if isinstance(env_spec.observation_space,
(akro.Box, akro.Discrete, akro.Dict)):
if env_spec.observation_space.flat_dim != np.prod(
next_observations[0].shape):
raise ValueError('next_observations should have the same '
'dimensionality as the observation_space '
'({}), but got data with shape {} '
'instead'.format(
env_spec.observation_space.flat_dim,
next_observations[0].shape))
else:
raise ValueError(
'next_observations must conform to observation_space {}, '
'but got data with shape {} instead.'.format(
env_spec.observation_space,
next_observations[0].shape))
if next_observations.shape[0] != inferred_batch_size:
raise ValueError(
'Expected batch dimension of next_observations to be length '
'{}, but got length {} instead.'.format(
inferred_batch_size, next_observations.shape[0]))
# actions
if not env_spec.action_space.contains(first_action):
if isinstance(env_spec.action_space,
(akro.Box, akro.Discrete, akro.Dict)):
if env_spec.action_space.flat_dim != np.prod(
first_action.shape):
raise ValueError('actions should have the same '
'dimensionality as the action_space '
'({}), but got data with shape {} '
'instead'.format(
env_spec.action_space.flat_dim,
first_action.shape))
else:
raise ValueError('actions must conform to action_space {}, '
'but got data with shape {} instead.'.format(
env_spec.action_space,
first_action.shape))
if actions.shape[0] != inferred_batch_size:
raise ValueError(
'Expected batch dimension of actions to be length {}, but got '
'length {} instead.'.format(inferred_batch_size,
actions.shape[0]))
# rewards
if rewards.shape[0] != inferred_batch_size:
raise ValueError(
'Expected batch dimension of rewards to be length {}, but got '
'length {} instead.'.format(inferred_batch_size,
rewards.shape[0]))
# terminals
if terminals.dtype != np.bool_:
raise ValueError(
'terminals tensor must be dtype np.bool_, but got tensor '
'of dtype {} instead.'.format(terminals.dtype))
# env_infos
for key, val in env_infos.items():
if not isinstance(val, (dict, np.ndarray)):
raise ValueError(
'Each entry in env_infos must be a numpy array or '
'dictionary, but got key {} with value type {} '
'instead.'.format(key, type(val)))
if (isinstance(val, np.ndarray)
and val.shape[0] != inferred_batch_size):
raise ValueError(
'Each entry in env_infos must have a batch dimension of '
'length {}, but got key {} with batch size {} instead.'.
format(inferred_batch_size, key, val.shape[0]))
# agent_infos
for key, val in agent_infos.items():
if not isinstance(val, (dict, np.ndarray)):
raise ValueError(
'Each entry in agent_infos must be a numpy array or '
'dictionary, but got key {} with value type {} '
'instead.'.format(key, type(val)))
if (isinstance(val, np.ndarray)
and val.shape[0] != inferred_batch_size):
raise ValueError(
'Each entry in agent_infos must have a batch dimension of '
'length {}, but got key {} with batch size {} instead.'.
format(inferred_batch_size, key, val.shape[0]))
return super().__new__(TimeStepBatch, env_spec, observations, actions,
rewards, next_observations, terminals,
env_infos, agent_infos)
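# A minimal construction sketch (hypothetical values; same kind of
# `env_spec` assumed as in the TrajectoryBatch example). Three
# transitions, so every array has batch dimension 3:
#
#     batch = TimeStepBatch(
#         env_spec=env_spec,
#         observations=np.zeros((3, 4)),
#         actions=np.zeros((3, 2)),
#         rewards=np.zeros(3),
#         next_observations=np.zeros((3, 4)),
#         terminals=np.array([False, False, True]),
#         env_infos={},
#         agent_infos={},
#     )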
@classmethod
def concatenate(cls, *batches):
"""Create a TimeStepBatch by concatenating TimeStepBatches.
Args:
batches (list[TimeStepBatch]): Batches to concatenate.
Returns:
TimeStepBatch: The concatenation of the batches.
Raises:
ValueError: If no TimeStepBatches are provided.
"""
if len(batches) < 1:
raise ValueError('Please provide at least one TimeStepBatch to '
'concatenate')
env_infos = {
k: np.concatenate([b.env_infos[k] for b in batches])
for k in batches[0].env_infos.keys()
}
agent_infos = {
k: np.concatenate([b.agent_infos[k] for b in batches])
for k in batches[0].agent_infos.keys()
}
return cls(
batches[0].env_spec,
np.concatenate([batch.observations for batch in batches]),
np.concatenate([batch.actions for batch in batches]),
np.concatenate([batch.rewards for batch in batches]),
np.concatenate([batch.next_observations for batch in batches]),
np.concatenate([batch.terminals for batch in batches]), env_infos,
agent_infos)
def split(self):
"""Split a TimeStepBatch into a list of TimeStepBatches.
The opposite of concatenate.
Returns:
list[TimeStepBatch]: A list of TimeStepBatches, with one
TimeStep per TimeStepBatch.
"""
time_steps = []
for i in range(len(self.terminals)):
time_step = TimeStepBatch(
env_spec=self.env_spec,
observations=np.asarray([self.observations[i]]),
actions=np.asarray([self.actions[i]]),
rewards=np.asarray([self.rewards[i]]),
next_observations=np.asarray([self.next_observations[i]]),
terminals=np.asarray([self.terminals[i]]),
env_infos={
k: np.asarray([v[i]])
for (k, v) in self.env_infos.items()
},
agent_infos={
k: np.asarray([v[i]])
for (k, v) in self.agent_infos.items()
},
)
time_steps.append(time_step)
return time_steps
def to_time_step_list(self):
"""Convert the batch into a list of dictionaries.
This breaks the TimeStepBatch object into a list of single-time-step
sample dictionaries. len(terminals) (i.e. the number of discrete time
steps) dictionaries are returned.
Returns:
list[dict[str, np.ndarray or dict[str, np.ndarray]]]: Keys:
* observations (numpy.ndarray): Non-flattened array of
observations. Typically has shape (batch_size, S^*) (the
unflattened state space of the current environment).
* actions (numpy.ndarray): Non-flattened array of actions.
Should have shape (batch_size, S^*) (the unflattened action
space of the current environment).
* rewards (numpy.ndarray): Array of rewards of shape
(batch_size,) (1D array of length batch_size).
* next_observations (numpy.ndarray): Non-flattened array of
next observations. Has shape (batch_size, S^*).
next_observations[i] was observed by the agent after taking
actions[i].
* terminals (numpy.ndarray): A boolean numpy array of shape
(batch_size,) containing the termination signals for all
transitions in this batch.
* env_infos (dict): A dict of arbitrary environment state
information.
* agent_infos (dict): A dict of arbitrary agent state
information. For example, this may contain the hidden states
from an RNN policy.
"""
samples = []
for i in range(len(self.terminals)):
samples.append({
'observations':
np.asarray([self.observations[i]]),
'actions':
np.asarray([self.actions[i]]),
'rewards':
np.asarray([self.rewards[i]]),
'next_observations':
np.asarray([self.next_observations[i]]),
'terminals':
np.asarray([self.terminals[i]]),
'env_infos':
{k: np.asarray([v[i]])
for (k, v) in self.env_infos.items()},
'agent_infos':
{k: np.asarray([v[i]])
for (k, v) in self.agent_infos.items()},
})
return samples
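# Usage sketch: unlike split(), which yields TimeStepBatch objects,
# this yields plain dicts, one per time step (hypothetical `batch`):
#
#     samples = batch.to_time_step_list()
#     assert len(samples) == len(batch.terminals)
#     assert samples[0]['rewards'].shape == (1,)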
@classmethod
def from_time_step_list(cls, env_spec, ts_samples):
"""Create a TimeStepBatch from a list of time step dictionaries.
Args:
env_spec (garage.envs.EnvSpec): Specification for the environment
from which this data was sampled.
ts_samples (list[dict[str, np.ndarray or dict[str, np.ndarray]]]):
keys:
* observations (numpy.ndarray): Non-flattened array of
observations. Typically has shape (batch_size, S^*) (the
unflattened state space of the current environment).
* actions (numpy.ndarray): Non-flattened array of actions.
Should have shape (batch_size, S^*) (the unflattened action
space of the current environment).
* rewards (numpy.ndarray): Array of rewards of shape
(batch_size,) (1D array of length batch_size).
* next_observations (numpy.ndarray): Non-flattened array of
next observations. Has shape (batch_size, S^*).
next_observations[i] was observed by the agent after taking
actions[i].
* terminals (numpy.ndarray): A boolean numpy array of shape
(batch_size,) containing the termination signals for all
transitions in this batch.
* env_infos (dict): A dict of arbitrary environment state
information.
* agent_infos (dict): A dict of arbitrary agent state
information. For example, this may contain the hidden states
from an RNN policy.
Returns:
TimeStepBatch: The concatenation of samples.
Raises:
ValueError: If no dicts are provided.
"""
if len(ts_samples) < 1:
raise ValueError('Please provide at least one dict')
ts_batches = [
TimeStepBatch(env_spec=env_spec,
observations=sample['observations'],
actions=sample['actions'],
rewards=sample['rewards'],
next_observations=sample['next_observations'],
terminals=sample['terminals'],
env_infos=sample['env_infos'],
agent_infos=sample['agent_infos'])
for sample in ts_samples
]
return TimeStepBatch.concatenate(*ts_batches)
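# Usage sketch: this is the inverse of to_time_step_list, so the two
# can round-trip a batch (hypothetical `batch` sampled from `env_spec`):
#
#     samples = batch.to_time_step_list()
#     rebuilt = TimeStepBatch.from_time_step_list(env_spec, samples)
#     assert (rebuilt.rewards == batch.rewards).all()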