Source code for garage._functions
"""Functions exposed directly in the garage namespace."""
from collections import defaultdict
from dowel import tabular
import numpy as np
from garage import EpisodeBatch, StepType
from garage.np import discount_cumsum, stack_tensor_dict_list
class _Default: # pylint: disable=too-few-public-methods
"""A wrapper class to represent default arguments.
Args:
val (object): Argument value.
"""
def __init__(self, val):
self.val = val
def make_optimizer(optimizer_type, module=None, **kwargs):
"""Create an optimizer for pyTorch & tensorflow algos.
Args:
        optimizer_type (Union[type, tuple[type, dict]]): Type of optimizer.
            This can be an optimizer type such as `torch.optim.Adam`, or a
            tuple of a type and a dictionary, where the dictionary contains
            arguments used to initialize the optimizer,
            e.g. (torch.optim.Adam, {'lr': 1e-3}).
        module (optional): If the optimizer type is a PyTorch optimizer, the
            `torch.nn.Module` whose parameters are to be optimized must be
            specified.
        kwargs (dict): Other keyword arguments to initialize the optimizer.
            This is not used when `optimizer_type` is a tuple.
Returns:
torch.optim.Optimizer: Constructed optimizer.
Raises:
        ValueError: If `optimizer_type` is a tuple and a non-default
            argument is passed in `kwargs`.
"""
if isinstance(optimizer_type, tuple):
opt_type, opt_args = optimizer_type
for name, arg in kwargs.items():
if not isinstance(arg, _Default):
                raise ValueError('Should not specify {} and explicit '
                                 'optimizer args at the same time'.format(name))
if module is not None:
return opt_type(module.parameters(), **opt_args)
else:
return opt_type(**opt_args)
opt_args = {
k: v.val if isinstance(v, _Default) else v
for k, v in kwargs.items()
}
if module is not None:
return optimizer_type(module.parameters(), **opt_args)
else:
return optimizer_type(**opt_args)
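# A minimal usage sketch of both calling conventions, assuming PyTorch is
# available; the Linear module and learning rate below are illustrative
# assumptions, not part of the module above.
def _example_make_optimizer():
    import torch
    from torch import nn
    model = nn.Linear(4, 2)
    # Plain optimizer type: keyword arguments are forwarded directly.
    adam = make_optimizer(torch.optim.Adam, module=model, lr=1e-3)
    # Tuple of (type, args dict): extra kwargs must stay at their defaults.
    adam_from_tuple = make_optimizer((torch.optim.Adam, {'lr': 1e-3}),
                                     module=model)
    return adam, adam_from_tuple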
def rollout(env,
            agent,
            *,
            max_episode_length=np.inf,
            animated=False,
            speedup=1,
            deterministic=False):
"""Sample a single episode of the agent in the environment.
Args:
agent (Policy): Policy used to select actions.
env (Environment): Environment to perform actions in.
max_episode_length (int): If the episode reaches this many timesteps,
it is truncated.
animated (bool): If true, render the environment after each step.
        speedup (float): Factor by which to decrease the wait time between
            rendered steps. Only relevant if `animated` is True.
deterministic (bool): If true, use the mean action returned by the
stochastic policy instead of sampling from the returned action
distribution.
Returns:
dict[str, np.ndarray or dict]: Dictionary, with keys:
            * observations(np.array): Non-flattened array of observations.
                There is one of these for each action taken; observations[i]
                was used by the agent to choose actions[i]. Should have shape
                :math:`(T, S^*)`, i.e. the unflattened observation space of
                the current environment.
* actions(np.array): Non-flattened array of actions. Should have
shape :math:`(T, S^*)`, i.e. the unflattened action space of
the current environment.
* rewards(np.array): Array of rewards of shape :math:`(T,)`, i.e. a
1D array of length timesteps.
* agent_infos(Dict[str, np.array]): Dictionary of stacked,
non-flattened `agent_info` arrays.
* env_infos(Dict[str, np.array]): Dictionary of stacked,
non-flattened `env_info` arrays.
* dones(np.array): Array of termination signals.
"""
del speedup
env_steps = []
agent_infos = []
observations = []
last_obs = env.reset()[0]
agent.reset()
episode_length = 0
if animated:
env.visualize()
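    # A falsy max_episode_length (None or 0) is treated as "no limit".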
while episode_length < (max_episode_length or np.inf):
a, agent_info = agent.get_action(last_obs)
if deterministic and 'mean' in agent_info:
a = agent_info['mean']
es = env.step(a)
env_steps.append(es)
observations.append(last_obs)
agent_infos.append(agent_info)
episode_length += 1
if es.last:
break
last_obs = es.observation
return dict(
observations=np.array(observations),
actions=np.array([es.action for es in env_steps]),
rewards=np.array([es.reward for es in env_steps]),
agent_infos=stack_tensor_dict_list(agent_infos),
env_infos=stack_tensor_dict_list([es.env_info for es in env_steps]),
dones=np.array([es.terminal for es in env_steps]),
)
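# A minimal usage sketch, assuming `env` and `policy` are an already
# constructed garage Environment and Policy (construction omitted here).
def _example_rollout(env, policy):
    path = rollout(env, policy, max_episode_length=200, deterministic=True)
    # observations[i] was the observation used to choose actions[i].
    assert len(path['observations']) == len(path['actions'])
    return path['rewards'].sum()  # undiscounted return of the episode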
def obtain_evaluation_episodes(policy,
                               env,
                               max_episode_length=1000,
                               num_eps=100,
                               deterministic=True):
"""Sample the policy for num_eps episodes and return average values.
Args:
policy (Policy): Policy to use as the actor when gathering samples.
        env (Environment): The environment used to obtain episodes.
        max_episode_length (int): Maximum episode length. The episode will
            be truncated when its length reaches max_episode_length.
        num_eps (int): Number of episodes.
        deterministic (bool): Whether a deterministic approach is used
            during rollout.
Returns:
EpisodeBatch: Evaluation episodes, representing the best current
performance of the algorithm.
"""
episodes = []
# Use a finite length rollout for evaluation.
for _ in range(num_eps):
eps = rollout(env,
policy,
max_episode_length=max_episode_length,
deterministic=deterministic)
episodes.append(eps)
return EpisodeBatch.from_list(env.spec, episodes)
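# A minimal usage sketch, assuming a trained `policy` and an evaluation `env`
# are given; the episode length and count below are illustrative.
def _example_obtain_evaluation_episodes(policy, env):
    eval_eps = obtain_evaluation_episodes(policy,
                                          env,
                                          max_episode_length=500,
                                          num_eps=10)
    # Per-episode undiscounted returns, recovered by splitting the batch.
    return [eps.rewards.sum() for eps in eval_eps.split()]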
def log_multitask_performance(itr, batch, discount, name_map=None):
r"""Log performance of episodes from multiple tasks.
Args:
itr (int): Iteration number to be logged.
batch (EpisodeBatch): Batch of episodes. The episodes should have
either the "task_name" or "task_id" `env_infos`. If the "task_name"
            is not present, then `name_map` is required, and should map from
            task ids to task names.
discount (float): Discount used in computing returns.
        name_map (dict[int, str] or None): Mapping from task ids to task
names. Optional if the "task_name" environment info is present.
Note that if provided, all tasks listed in this map will be logged,
even if there are no episodes present for them.
Returns:
numpy.ndarray: Undiscounted returns averaged across all tasks. Has
shape :math:`(N \bullet [T])`.
"""
eps_by_name = defaultdict(list)
for eps in batch.split():
task_name = '__unnamed_task__'
if 'task_name' in eps.env_infos:
task_name = eps.env_infos['task_name'][0]
elif 'task_id' in eps.env_infos:
name_map = {} if name_map is None else name_map
task_id = eps.env_infos['task_id'][0]
task_name = name_map.get(task_id, 'Task #{}'.format(task_id))
eps_by_name[task_name].append(eps)
if name_map is None:
task_names = eps_by_name.keys()
else:
task_names = name_map.values()
for task_name in task_names:
if task_name in eps_by_name:
episodes = eps_by_name[task_name]
log_performance(itr,
EpisodeBatch.concatenate(*episodes),
discount,
prefix=task_name)
else:
with tabular.prefix(task_name + '/'):
tabular.record('Iteration', itr)
tabular.record('NumEpisodes', 0)
tabular.record('AverageDiscountedReturn', np.nan)
tabular.record('AverageReturn', np.nan)
tabular.record('StdReturn', np.nan)
tabular.record('MaxReturn', np.nan)
tabular.record('MinReturn', np.nan)
tabular.record('TerminationRate', np.nan)
tabular.record('SuccessRate', np.nan)
return log_performance(itr, batch, discount=discount, prefix='Average')
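# A minimal usage sketch, assuming `batch` carries "task_id" env_infos; the
# id-to-name mapping and discount below are illustrative assumptions.
def _example_log_multitask_performance(batch):
    name_map = {0: 'push', 1: 'reach'}
    return log_multitask_performance(itr=0,
                                     batch=batch,
                                     discount=0.99,
                                     name_map=name_map)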
def log_performance(itr, batch, discount, prefix='Evaluation'):
"""Evaluate the performance of an algorithm on a batch of episodes.
Args:
itr (int): Iteration number.
batch (EpisodeBatch): The episodes to evaluate with.
        discount (float): Discount value, from the algorithm's property.
prefix (str): Prefix to add to all logged keys.
Returns:
numpy.ndarray: Undiscounted returns.
"""
returns = []
undiscounted_returns = []
termination = []
success = []
for eps in batch.split():
returns.append(discount_cumsum(eps.rewards, discount))
undiscounted_returns.append(sum(eps.rewards))
termination.append(
float(
any(step_type == StepType.TERMINAL
for step_type in eps.step_types)))
if 'success' in eps.env_infos:
success.append(float(eps.env_infos['success'].any()))
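    # discount_cumsum yields the discounted reward-to-go at every timestep,
    # so rtn[0] is the full discounted return of an episode.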
average_discounted_return = np.mean([rtn[0] for rtn in returns])
with tabular.prefix(prefix + '/'):
tabular.record('Iteration', itr)
tabular.record('NumEpisodes', len(returns))
tabular.record('AverageDiscountedReturn', average_discounted_return)
tabular.record('AverageReturn', np.mean(undiscounted_returns))
tabular.record('StdReturn', np.std(undiscounted_returns))
tabular.record('MaxReturn', np.max(undiscounted_returns))
tabular.record('MinReturn', np.min(undiscounted_returns))
tabular.record('TerminationRate', np.mean(termination))
if success:
tabular.record('SuccessRate', np.mean(success))
return undiscounted_returns
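# A minimal usage sketch, assuming `batch` is an EpisodeBatch gathered during
# evaluation; the iteration number and discount are illustrative.
def _example_log_performance(batch):
    undiscounted = log_performance(itr=0,
                                   batch=batch,
                                   discount=0.99,
                                   prefix='Evaluation')
    return np.mean(undiscounted)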