# TRPO

|               |                                      |
| ------------- | ------------------------------------ |
| Paper         | Trust Region Policy Optimization [2] |
| Framework(s)  | TensorFlow, PyTorch                  |
| API Reference | `garage.tf.algos.TRPO`, `garage.torch.algos.TRPO` |
| Code          | `garage/tf/algos/trpo.py`, `garage/torch/algos/trpo.py` |
| Examples      | See the Examples section below       |
Trust Region Policy Optimization, or TRPO, is a policy gradient algorithm that builds on REINFORCE/VPG to improve performance. It introduces a KL-divergence constraint that prevents each incremental policy update from deviating excessively from the current policy, instead requiring the update to stay within a specified trust region. The TRPO paper [2] is available on arXiv; also see [Spinning Up's write-up](https://spinningup.openai.com/en/latest/algorithms/trpo.html) for a detailed description of the inner workings of the algorithm.
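Concretely, each TRPO update maximizes a surrogate objective subject to a bound `δ` on the average KL divergence between the new and old policies [2]:

```latex
\max_{\theta} \;
\mathbb{E}_{s,a \sim \pi_{\theta_{\text{old}}}}
    \left[ \frac{\pi_{\theta}(a \mid s)}{\pi_{\theta_{\text{old}}}(a \mid s)}
           A^{\pi_{\theta_{\text{old}}}}(s, a) \right]
\quad \text{subject to} \quad
\mathbb{E}_{s \sim \pi_{\theta_{\text{old}}}}
    \left[ D_{\mathrm{KL}}\!\left( \pi_{\theta_{\text{old}}}(\cdot \mid s)
           \,\middle\|\, \pi_{\theta}(\cdot \mid s) \right) \right]
\le \delta
```

In the TensorFlow examples below, `δ` appears as the `max_kl_step` argument to `TRPO`.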
## Examples

### TF
```python
#!/usr/bin/env python3
"""This is an example to train a task with TRPO algorithm.

Here it runs CartPole-v1 environment with 100 iterations.

Results:
    AverageReturn: 100
    RiseTime: itr 13
"""
from garage import wrap_experiment
from garage.envs import GymEnv
from garage.experiment.deterministic import set_seed
from garage.np.baselines import LinearFeatureBaseline
from garage.tf.algos import TRPO
from garage.tf.policies import CategoricalMLPPolicy
from garage.trainer import TFTrainer


@wrap_experiment
def trpo_cartpole(ctxt=None, seed=1):
    """Train TRPO with CartPole-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = GymEnv('CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    max_kl_step=0.01)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=4000)


trpo_cartpole()
```
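Because the script is wrapped with `@wrap_experiment`, running it directly (`python trpo_cartpole.py`) sets up the experiment context, log directory, and snapshotting automatically. Here `max_kl_step=0.01` is the trust-region size `δ` from the constrained objective above.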
```python
#!/usr/bin/env python3
"""This is an example to train a task with TRPO algorithm.

Here it runs CubeCrash-v0 environment with 100 iterations.
"""
import click

from garage import wrap_experiment
from garage.envs import GymEnv, normalize
from garage.experiment.deterministic import set_seed
from garage.tf.algos import TRPO
from garage.tf.baselines import GaussianCNNBaseline
from garage.tf.policies import CategoricalCNNPolicy
from garage.trainer import TFTrainer


@click.command()
@click.option('--batch_size', type=int, default=4000)
@click.option('--max_episode_length', type=int, default=5)
@wrap_experiment
def trpo_cubecrash(ctxt=None, seed=1, max_episode_length=5, batch_size=4000):
    """Train TRPO with CubeCrash-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(
            GymEnv('CubeCrash-v0', max_episode_length=max_episode_length))

        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=((32, (8, 8)), (64, (4, 4))),
                                      strides=(4, 2),
                                      padding='VALID',
                                      hidden_sizes=(32, 32))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       filters=((32, (8, 8)), (64, (4, 4))),
                                       strides=(4, 2),
                                       padding='VALID',
                                       hidden_sizes=(32, 32),
                                       use_trust_region=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=batch_size)


trpo_cubecrash()
```
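Beyond the basic CartPole example, this script wraps the environment in garage's `normalize` wrapper, uses generalized advantage estimation (`gae_lambda=0.95`), and fits the `GaussianCNNBaseline` with its own trust region (`use_trust_region=True`), constraining the value-function fit in the same spirit as the policy update.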
```python
#!/usr/bin/env python3
"""This is an example to train a task with TRPO algorithm.

It uses an LSTM-based recurrent policy.

Here it runs CartPole-v1 environment with 100 iterations.

Results:
    AverageReturn: 100
    RiseTime: itr 13
"""
# pylint: disable=no-value-for-parameter
import click

from garage import wrap_experiment
from garage.envs import GymEnv
from garage.experiment.deterministic import set_seed
from garage.np.baselines import LinearFeatureBaseline
from garage.tf.algos import TRPO
from garage.tf.optimizers import (ConjugateGradientOptimizer,
                                  FiniteDifferenceHVP)
from garage.tf.policies import CategoricalLSTMPolicy
from garage.trainer import TFTrainer


@click.command()
@click.option('--seed', default=1)
@click.option('--n_epochs', default=100)
@click.option('--batch_size', default=4000)
@click.option('--plot', default=False)
@wrap_experiment
def trpo_cartpole_recurrent(ctxt, seed, n_epochs, batch_size, plot):
    """Train TRPO with a recurrent policy on CartPole.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Number of epochs for training.
        batch_size (int): Batch size used for training.
        plot (bool): Whether to plot or not.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('CartPole-v1', max_episode_length=100)

        policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer=ConjugateGradientOptimizer,
                    optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                        base_eps=1e-5)))

        trainer.setup(algo, env)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=plot)


trpo_cartpole_recurrent()
```
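This example configures TRPO's optimizer explicitly. Rather than forming the KL-divergence Hessian `H` directly, `ConjugateGradientOptimizer` solves the linear system `H x = g` (where `g` is the policy gradient) using only Hessian-vector products, which `FiniteDifferenceHVP` approximates with finite differences of the gradient. As a rough illustration (a minimal standalone NumPy sketch, not garage's implementation), the conjugate-gradient loop at the heart of that solve looks like:

```python
import numpy as np


def conjugate_gradient(hvp, g, iters=10, tol=1e-10):
    """Approximately solve H x = g given only a function computing H @ v."""
    x = np.zeros_like(g)
    r = g.copy()      # residual of H x = g at x = 0
    p = r.copy()      # current search direction
    r_dot = r @ r
    for _ in range(iters):
        Hp = hvp(p)
        alpha = r_dot / (p @ Hp)           # step size along p
        x += alpha * p
        r -= alpha * Hp
        new_r_dot = r @ r
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p    # conjugate direction update
        r_dot = new_r_dot
    return x


# Toy check with an explicit symmetric positive-definite matrix standing in
# for the KL Hessian; in TRPO, hvp is computed from the policy instead.
H = np.array([[4.0, 1.0], [1.0, 3.0]])
g = np.array([1.0, 2.0])
x = conjugate_gradient(lambda v: H @ v, g)
print(np.allclose(H @ x, g))  # -> True
```

The resulting `x` is the natural-gradient step direction, which TRPO then scales and line-searches so the update satisfies the KL constraint.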
### PyTorch
```python
#!/usr/bin/env python3
"""This is an example to train a task with TRPO algorithm (PyTorch).

Here it runs InvertedDoublePendulum-v2 environment with 100 iterations.
"""
import torch

from garage import wrap_experiment
from garage.envs import GymEnv
from garage.experiment.deterministic import set_seed
from garage.torch.algos import TRPO
from garage.torch.policies import GaussianMLPPolicy
from garage.torch.value_functions import GaussianMLPValueFunction
from garage.trainer import Trainer


@wrap_experiment
def trpo_pendulum(ctxt=None, seed=1):
    """Train TRPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    env = GymEnv('InvertedDoublePendulum-v2')

    trainer = Trainer(ctxt)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[32, 32],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                discount=0.99,
                center_adv=False)

    trainer.setup(algo, env)
    trainer.train(n_epochs=100, batch_size=1024)


trpo_pendulum(seed=1)
```
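Note that the PyTorch examples use `Trainer` rather than `TFTrainer`, and the PyTorch `TRPO` takes a learned `value_function` (here a `GaussianMLPValueFunction`) in place of the TF `baseline` argument; this example also disables advantage centering with `center_adv=False`.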
```python
#!/usr/bin/env python3
"""This is an example to train a task with TRPO algorithm (PyTorch).

Uses Ray sampler instead of MultiprocessingSampler.

Here it runs InvertedDoublePendulum-v2 environment with 100 iterations.
"""
import numpy as np
import ray
import torch

from garage import wrap_experiment
from garage.envs import GymEnv
from garage.experiment import deterministic
from garage.sampler import RaySampler
from garage.torch.algos import TRPO
from garage.torch.policies import GaussianMLPPolicy
from garage.torch.value_functions import GaussianMLPValueFunction
from garage.trainer import Trainer


@wrap_experiment(snapshot_mode='none')
def trpo_pendulum_ray_sampler(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    # Since this is an example, we are running ray in a reduced state.
    # One can comment this line out in order to run ray at full capacity.
    ray.init(_memory=52428800,
             object_store_memory=78643200,
             ignore_reinit_error=True,
             log_to_driver=False,
             include_dashboard=False)
    deterministic.set_seed(seed)
    env = GymEnv('InvertedDoublePendulum-v2')

    trainer = Trainer(ctxt)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[32, 32],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                discount=0.99,
                center_adv=False)

    trainer.setup(algo, env, sampler_cls=RaySampler)
    trainer.train(n_epochs=100, batch_size=1024)


s = np.random.randint(0, 1000)
trpo_pendulum_ray_sampler(seed=s)
```
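This variant passes `sampler_cls=RaySampler` to `trainer.setup()` so rollouts are collected through ray, draws a fresh random seed on each run, and disables snapshotting via `@wrap_experiment(snapshot_mode='none')`.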
## References

[2] John Schulman, Sergey Levine, Philipp Moritz, Michael I. Jordan, and Pieter Abbeel. Trust region policy optimization. arXiv, 2015. [arXiv:1502.05477](https://arxiv.org/abs/1502.05477).
This page was authored by Mishari Aliesa (@maliesa96).