"""Vanilla Policy Gradient (REINFORCE)."""
import collections
import copy
from dowel import tabular
import numpy as np
import torch
import torch.nn.functional as F
from garage import log_performance, TrajectoryBatch
from garage.misc import tensor_utils as tu
from garage.np.algos.rl_algorithm import RLAlgorithm
from garage.sampler import OnPolicyVectorizedSampler
from garage.torch import (compute_advantages, filter_valids, pad_to_last)
from garage.torch.optimizers import OptimizerWrapper
class VPG(RLAlgorithm):
"""Vanilla Policy Gradient (REINFORCE).

    VPG, also known as REINFORCE, trains a stochastic policy in an
    on-policy way.
Args:
env_spec (garage.envs.EnvSpec): Environment specification.
policy (garage.torch.policies.Policy): Policy.
value_function (garage.torch.value_functions.ValueFunction): The value
function.
        policy_optimizer (garage.torch.optimizers.OptimizerWrapper): Optimizer
            for policy.
        vf_optimizer (garage.torch.optimizers.OptimizerWrapper): Optimizer for
            value function.
max_path_length (int): Maximum length of a single rollout.
num_train_per_epoch (int): Number of train_once calls per epoch.
discount (float): Discount.
gae_lambda (float): Lambda used for generalized advantage
estimation.
center_adv (bool): Whether to rescale the advantages
so that they have mean 0 and standard deviation 1.
positive_adv (bool): Whether to shift the advantages
so that they are always positive. When used in
conjunction with center_adv the advantages will be
standardized before shifting.
policy_ent_coeff (float): The coefficient of the policy entropy.
Setting it to zero would mean no entropy regularization.
        use_softplus_entropy (bool): Whether to pass the estimated entropy
            through a softplus function to keep it from becoming negative.
stop_entropy_gradient (bool): Whether to stop the entropy gradient.
entropy_method (str): A string from: 'max', 'regularized',
'no_entropy'. The type of entropy method to use. 'max' adds the
dense entropy to the reward for each time step. 'regularized' adds
the mean entropy to the surrogate objective. See
https://arxiv.org/abs/1805.00909 for more details.
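    Example:
        A minimal usage sketch, assuming garage's torch
        ``GaussianMLPPolicy`` and ``GaussianMLPValueFunction`` inside a
        ``@wrap_experiment`` function (names and signatures may differ
        between garage versions)::

            import gym
            from garage.envs import GarageEnv
            from garage.experiment import LocalRunner
            from garage.torch.policies import GaussianMLPPolicy
            from garage.torch.value_functions import GaussianMLPValueFunction

            runner = LocalRunner(ctxt)  # ctxt from @wrap_experiment
            env = GarageEnv(gym.make('Pendulum-v0'))
            policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 64))
            value_function = GaussianMLPValueFunction(env_spec=env.spec)
            algo = VPG(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function)
            runner.setup(algo, env)
            runner.train(n_epochs=100, batch_size=4000)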
"""
def __init__(
self,
env_spec,
policy,
value_function,
policy_optimizer=None,
vf_optimizer=None,
max_path_length=500,
num_train_per_epoch=1,
discount=0.99,
gae_lambda=1,
center_adv=True,
positive_adv=False,
policy_ent_coeff=0.0,
use_softplus_entropy=False,
stop_entropy_gradient=False,
entropy_method='no_entropy',
):
self.discount = discount
self.policy = policy
self.max_path_length = max_path_length
self._value_function = value_function
self._gae_lambda = gae_lambda
self._center_adv = center_adv
self._positive_adv = positive_adv
self._policy_ent_coeff = policy_ent_coeff
self._use_softplus_entropy = use_softplus_entropy
self._stop_entropy_gradient = stop_entropy_gradient
self._entropy_method = entropy_method
self._n_samples = num_train_per_epoch
self._env_spec = env_spec
self._maximum_entropy = (entropy_method == 'max')
        self._entropy_regularized = (entropy_method == 'regularized')
self._check_entropy_configuration(entropy_method, center_adv,
stop_entropy_gradient,
policy_ent_coeff)
self._episode_reward_mean = collections.deque(maxlen=100)
self.sampler_cls = OnPolicyVectorizedSampler
if policy_optimizer:
self._policy_optimizer = policy_optimizer
else:
self._policy_optimizer = OptimizerWrapper(torch.optim.Adam, policy)
if vf_optimizer:
self._vf_optimizer = vf_optimizer
else:
self._vf_optimizer = OptimizerWrapper(torch.optim.Adam,
value_function)
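        # Keep a frozen copy of the policy from before each update, so the
        # KL divergence between successive policies can be logged.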
self._old_policy = copy.deepcopy(self.policy)
@staticmethod
def _check_entropy_configuration(entropy_method, center_adv,
stop_entropy_gradient, policy_ent_coeff):
if entropy_method not in ('max', 'regularized', 'no_entropy'):
raise ValueError('Invalid entropy_method')
if entropy_method == 'max':
if center_adv:
raise ValueError('center_adv should be False when '
'entropy_method is max')
if not stop_entropy_gradient:
                raise ValueError('stop_entropy_gradient should be True when '
                                 'entropy_method is max')
if entropy_method == 'no_entropy':
if policy_ent_coeff != 0.0:
raise ValueError('policy_ent_coeff should be zero '
'when there is no entropy method')
    def train_once(self, itr, paths):
"""Train the algorithm once.
Args:
itr (int): Iteration number.
paths (list[dict]): A list of collected paths.
Returns:
numpy.float64: Calculated mean value of undiscounted returns.
"""
obs, actions, rewards, returns, valids, baselines = \
self.process_samples(paths)
if self._maximum_entropy:
policy_entropies = self._compute_policy_entropy(obs)
rewards += self._policy_ent_coeff * policy_entropies
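        # Drop padded steps: flatten the (N, P, ...) tensors into
        # (sum(valids), ...) before computing losses.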
obs_flat = torch.cat(filter_valids(obs, valids))
actions_flat = torch.cat(filter_valids(actions, valids))
rewards_flat = torch.cat(filter_valids(rewards, valids))
returns_flat = torch.cat(filter_valids(returns, valids))
advs_flat = self._compute_advantage(rewards, valids, baselines)
with torch.no_grad():
policy_loss_before = self._compute_loss_with_adv(
obs_flat, actions_flat, rewards_flat, advs_flat)
vf_loss_before = self._value_function.compute_loss(
obs_flat, returns_flat)
kl_before = self._compute_kl_constraint(obs)
self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
advs_flat)
with torch.no_grad():
policy_loss_after = self._compute_loss_with_adv(
obs_flat, actions_flat, rewards_flat, advs_flat)
vf_loss_after = self._value_function.compute_loss(
obs_flat, returns_flat)
kl_after = self._compute_kl_constraint(obs)
policy_entropy = self._compute_policy_entropy(obs)
with tabular.prefix(self.policy.name):
tabular.record('/LossBefore', policy_loss_before.item())
tabular.record('/LossAfter', policy_loss_after.item())
tabular.record('/dLoss',
(policy_loss_before - policy_loss_after).item())
tabular.record('/KLBefore', kl_before.item())
tabular.record('/KL', kl_after.item())
tabular.record('/Entropy', policy_entropy.mean().item())
with tabular.prefix(self._value_function.name):
tabular.record('/LossBefore', vf_loss_before.item())
tabular.record('/LossAfter', vf_loss_after.item())
tabular.record('/dLoss',
vf_loss_before.item() - vf_loss_after.item())
self._old_policy.load_state_dict(self.policy.state_dict())
undiscounted_returns = log_performance(
itr,
TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
discount=self.discount)
return np.mean(undiscounted_returns)
    def train(self, runner):
"""Obtain samplers and start actual training for each epoch.
Args:
            runner (LocalRunner): LocalRunner is passed to give the algorithm
                access to runner.step_epochs(), which provides services
                such as snapshotting and sampler control.
Returns:
            float: The average return in the last epoch cycle.
"""
last_return = None
for _ in runner.step_epochs():
for _ in range(self._n_samples):
runner.step_path = runner.obtain_samples(runner.step_itr)
last_return = self.train_once(runner.step_itr,
runner.step_path)
runner.step_itr += 1
return last_return
def _train(self, obs, actions, rewards, returns, advs):
r"""Train the policy and value function with minibatch.
Args:
obs (torch.Tensor): Observation from the environment with shape
:math:`(N, O*)`.
actions (torch.Tensor): Actions fed to the environment with shape
:math:`(N, A*)`.
rewards (torch.Tensor): Acquired rewards with shape :math:`(N, )`.
returns (torch.Tensor): Acquired returns with shape :math:`(N, )`.
advs (torch.Tensor): Advantage value at each step with shape
:math:`(N, )`.
"""
for dataset in self._policy_optimizer.get_minibatch(
obs, actions, rewards, advs):
self._train_policy(*dataset)
for dataset in self._vf_optimizer.get_minibatch(obs, returns):
self._train_value_function(*dataset)
def _train_policy(self, obs, actions, rewards, advantages):
r"""Train the policy.
Args:
obs (torch.Tensor): Observation from the environment
with shape :math:`(N, O*)`.
actions (torch.Tensor): Actions fed to the environment
with shape :math:`(N, A*)`.
rewards (torch.Tensor): Acquired rewards
with shape :math:`(N, )`.
advantages (torch.Tensor): Advantage value at each step
with shape :math:`(N, )`.
Returns:
torch.Tensor: Calculated mean scalar value of policy loss (float).
"""
self._policy_optimizer.zero_grad()
loss = self._compute_loss_with_adv(obs, actions, rewards, advantages)
loss.backward()
self._policy_optimizer.step()
return loss
def _train_value_function(self, obs, returns):
r"""Train the value function.
Args:
obs (torch.Tensor): Observation from the environment
with shape :math:`(N, O*)`.
returns (torch.Tensor): Acquired returns
with shape :math:`(N, )`.
Returns:
torch.Tensor: Calculated mean scalar value of value function loss
(float).
"""
self._vf_optimizer.zero_grad()
loss = self._value_function.compute_loss(obs, returns)
loss.backward()
self._vf_optimizer.step()
return loss
def _compute_loss(self, obs, actions, rewards, valids, baselines):
r"""Compute mean value of loss.
Notes: P is the maximum path length (self.max_path_length)
Args:
obs (torch.Tensor): Observation from the environment
with shape :math:`(N, P, O*)`.
actions (torch.Tensor): Actions fed to the environment
with shape :math:`(N, P, A*)`.
rewards (torch.Tensor): Acquired rewards
with shape :math:`(N, P)`.
            valids (list[int]): Numbers of valid steps in each path.
baselines (torch.Tensor): Value function estimation at each step
with shape :math:`(N, P)`.
Returns:
torch.Tensor: Calculated negative mean scalar value of
objective (float).
"""
obs_flat = torch.cat(filter_valids(obs, valids))
actions_flat = torch.cat(filter_valids(actions, valids))
rewards_flat = torch.cat(filter_valids(rewards, valids))
advantages_flat = self._compute_advantage(rewards, valids, baselines)
return self._compute_loss_with_adv(obs_flat, actions_flat,
rewards_flat, advantages_flat)
def _compute_loss_with_adv(self, obs, actions, rewards, advantages):
r"""Compute mean value of loss.
Args:
            obs (torch.Tensor): Observation from the environment
                with shape :math:`(N \cdot [T], O*)`.
            actions (torch.Tensor): Actions fed to the environment
                with shape :math:`(N \cdot [T], A*)`.
            rewards (torch.Tensor): Acquired rewards
                with shape :math:`(N \cdot [T], )`.
            advantages (torch.Tensor): Advantage value at each step
                with shape :math:`(N \cdot [T], )`.
Returns:
torch.Tensor: Calculated negative mean scalar value of objective.
"""
objectives = self._compute_objective(advantages, obs, actions, rewards)
        if self._entropy_regularized:
policy_entropies = self._compute_policy_entropy(obs)
objectives += self._policy_ent_coeff * policy_entropies
return -objectives.mean()
def _compute_advantage(self, rewards, valids, baselines):
r"""Compute mean value of loss.
Notes: P is the maximum path length (self.max_path_length)
Args:
rewards (torch.Tensor): Acquired rewards
with shape :math:`(N, P)`.
            valids (list[int]): Numbers of valid steps in each path.
baselines (torch.Tensor): Value function estimation at each step
with shape :math:`(N, P)`.
Returns:
            torch.Tensor: Calculated advantage values given rewards and
                baselines with shape :math:`(N \cdot [T], )`.
"""
advantages = compute_advantages(self.discount, self._gae_lambda,
self.max_path_length, baselines,
rewards)
advantage_flat = torch.cat(filter_valids(advantages, valids))
        if self._center_adv:
            # Normalize to mean 0 and standard deviation 1, as documented.
            means = advantage_flat.mean()
            stds = advantage_flat.std()
            advantage_flat = (advantage_flat - means) / (stds + 1e-8)
if self._positive_adv:
advantage_flat -= advantage_flat.min()
return advantage_flat
def _compute_kl_constraint(self, obs):
r"""Compute KL divergence.
Compute the KL divergence between the old policy distribution and
current policy distribution.
Notes: P is the maximum path length (self.max_path_length)
Args:
obs (torch.Tensor): Observation from the environment
with shape :math:`(N, P, O*)`.
Returns:
torch.Tensor: Calculated mean scalar value of KL divergence
(float).
"""
with torch.no_grad():
old_dist = self._old_policy(obs)[0]
new_dist = self.policy(obs)[0]
kl_constraint = torch.distributions.kl.kl_divergence(
old_dist, new_dist)
return kl_constraint.mean()
def _compute_policy_entropy(self, obs):
r"""Compute entropy value of probability distribution.
Notes: P is the maximum path length (self.max_path_length)
Args:
obs (torch.Tensor): Observation from the environment
with shape :math:`(N, P, O*)`.
Returns:
torch.Tensor: Calculated entropy values given observation
with shape :math:`(N, P)`.
"""
if self._stop_entropy_gradient:
with torch.no_grad():
policy_entropy = self.policy(obs)[0].entropy()
else:
policy_entropy = self.policy(obs)[0].entropy()
# This prevents entropy from becoming negative for small policy std
if self._use_softplus_entropy:
policy_entropy = F.softplus(policy_entropy)
return policy_entropy
def _compute_objective(self, advantages, obs, actions, rewards):
r"""Compute objective value.
Args:
            advantages (torch.Tensor): Advantage value at each step
                with shape :math:`(N \cdot [T], )`.
            obs (torch.Tensor): Observation from the environment
                with shape :math:`(N \cdot [T], O*)`.
            actions (torch.Tensor): Actions fed to the environment
                with shape :math:`(N \cdot [T], A*)`.
            rewards (torch.Tensor): Acquired rewards
                with shape :math:`(N \cdot [T], )`.
        Returns:
            torch.Tensor: Calculated objective values
                with shape :math:`(N \cdot [T], )`.
"""
del rewards
log_likelihoods = self.policy(obs)[0].log_prob(actions)
return log_likelihoods * advantages
    def process_samples(self, paths):
r"""Process sample data based on the collected paths.
Notes: P is the maximum path length (self.max_path_length)
Args:
            paths (list[dict]): A list of collected paths.
Returns:
torch.Tensor: The observations of the environment
with shape :math:`(N, P, O*)`.
torch.Tensor: The actions fed to the environment
with shape :math:`(N, P, A*)`.
            torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
            torch.Tensor: The discounted returns with shape :math:`(N, P)`.
            list[int]: Numbers of valid steps in each path.
torch.Tensor: Value function estimation at each step
with shape :math:`(N, P)`.
"""
valids = torch.Tensor([len(path['actions']) for path in paths]).int()
obs = torch.stack([
pad_to_last(path['observations'],
total_length=self.max_path_length,
axis=0) for path in paths
])
actions = torch.stack([
pad_to_last(path['actions'],
total_length=self.max_path_length,
axis=0) for path in paths
])
rewards = torch.stack([
pad_to_last(path['rewards'], total_length=self.max_path_length)
for path in paths
])
returns = torch.stack([
pad_to_last(tu.discount_cumsum(path['rewards'],
self.discount).copy(),
total_length=self.max_path_length) for path in paths
])
with torch.no_grad():
baselines = self._value_function(obs)
return obs, actions, rewards, returns, valids, baselines