Probabilistic Embeddings for Actor-Critic Reinforcement Learning (PEARL)¶
Paper | Efficient Off-Policy Meta-Reinforcement Learning via Probabilistic Context Variables [1]
Framework(s) | PyTorch
API Reference | garage.torch.algos.PEARL
Code | garage/torch/algos/pearl.py
Examples | pearl_half_cheetah_vel, pearl_metaworld_ml1_push, pearl_metaworld_ml10, pearl_metaworld_ml45
PEARL, which stands for Probabilistic Embeddings for Actor-Critic Reinforcement Learning, is an off-policy meta-RL algorithm. It is built on top of SAC, using two Q-functions and a value function, with the addition of an inference network that estimates the posterior q(z|c). The policy is conditioned on the latent variable z in order to adapt its behavior to specific tasks.
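The idea can be sketched in a few lines of plain PyTorch. The snippet below is an illustrative sketch rather than the garage implementation used in the examples: a small MLP maps each context transition (s, a, r) to a Gaussian factor, the factors are combined into a single posterior q(z|c) as a product of Gaussians, and the policy acts on the observation concatenated with a sampled z. All names and dimensions here are hypothetical.

# Illustrative sketch of PEARL's inference network and z-conditioned policy.
# Not the garage classes used below; dimensions are made up for the example.
import torch
from torch import nn

class GaussianContextEncoder(nn.Module):
    """Encode context transitions into a Gaussian posterior over z."""

    def __init__(self, context_dim, latent_dim, hidden_dim=200):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(context_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, 2 * latent_dim))  # mean and log-variance per factor

    def forward(self, context):
        # context: (num_transitions, context_dim)
        mu, log_var = self.net(context).chunk(2, dim=-1)
        var = log_var.exp()
        # Combine the per-transition Gaussian factors into one posterior q(z|c)
        # via a precision-weighted product of Gaussians.
        precision = 1.0 / var
        post_var = 1.0 / precision.sum(dim=0)
        post_mu = post_var * (precision * mu).sum(dim=0)
        return torch.distributions.Normal(post_mu, post_var.sqrt())

# Hypothetical dimensions for illustration only.
obs_dim, act_dim, latent_dim = 20, 6, 5
context_dim = obs_dim + act_dim + 1              # one (s, a, r) transition
encoder = GaussianContextEncoder(context_dim, latent_dim)
policy_head = nn.Linear(obs_dim + latent_dim, act_dim)  # stand-in for the SAC policy

context = torch.randn(100, context_dim)          # a batch of context transitions
q_z = encoder(context)                           # q(z|c)
z = q_z.rsample()                                # reparameterized sample of z
obs = torch.randn(obs_dim)
action = policy_head(torch.cat([obs, z]))        # policy conditioned on z

In the examples below, this role is filled by MLPEncoder (the context encoder) and ContextConditionedPolicy (which wraps the SAC policy and conditions it on z).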
Default Parameters¶
batch_size=256,
embedding_batch_size=100,
embedding_mini_batch_size=100,
encoder_hidden_size=200,
latent_size=5,
max_episode_length=200,
meta_batch_size=16,
net_size=300,
num_epochs=500,
num_train_tasks=100,
num_test_tasks=30,
num_steps_per_epoch=2000,
num_initial_steps=2000,
num_tasks_sample=5,
num_steps_prior=400,
num_extra_rl_steps_posterior=600,
reward_scale=5.
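As a rough guide to how the sampling-related values above interact, the schematic below (an interpretation of the parameter docstrings in the examples that follow, not garage's actual control flow) computes how many environment transitions one training iteration collects with these defaults.

# Schematic of per-iteration data collection implied by the defaults above.
num_tasks_sample = 5                   # tasks re-sampled for data each iteration
num_steps_prior = 400                  # transitions per task with z ~ prior
num_extra_rl_steps_posterior = 600     # additional transitions per task with
                                       # z ~ posterior (train the policy only,
                                       # not the encoder)

# Transitions added to the replay buffers in one iteration:
steps_per_iteration = num_tasks_sample * (num_steps_prior + num_extra_rl_steps_posterior)
print(steps_per_iteration)             # 5 * (400 + 600) = 5000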
Examples¶
pearl_half_cheetah_vel¶
#!/usr/bin/env python3
"""PEARL HalfCheetahVel example."""
import click
from garage import wrap_experiment
from garage.envs import GymEnv, normalize
from garage.envs.mujoco import HalfCheetahVelEnv
from garage.experiment.deterministic import set_seed
from garage.experiment.task_sampler import SetTaskSampler
from garage.sampler import LocalSampler
from garage.torch import set_gpu_mode
from garage.torch.algos import PEARL
from garage.torch.algos.pearl import PEARLWorker
from garage.torch.embeddings import MLPEncoder
from garage.torch.policies import (ContextConditionedPolicy,
TanhGaussianMLPPolicy)
from garage.torch.q_functions import ContinuousMLPQFunction
from garage.trainer import Trainer
@click.command()
@click.option('--num_epochs', default=500)
@click.option('--num_train_tasks', default=100)
@click.option('--num_test_tasks', default=100)
@click.option('--encoder_hidden_size', default=200)
@click.option('--net_size', default=300)
@click.option('--num_steps_per_epoch', default=2000)
@click.option('--num_initial_steps', default=2000)
@click.option('--num_steps_prior', default=400)
@click.option('--num_extra_rl_steps_posterior', default=600)
@click.option('--batch_size', default=256)
@click.option('--embedding_batch_size', default=100)
@click.option('--embedding_mini_batch_size', default=100)
@click.option('--max_episode_length', default=200)
@wrap_experiment
def pearl_half_cheetah_vel(ctxt=None,
seed=1,
num_epochs=500,
num_train_tasks=100,
num_test_tasks=100,
latent_size=5,
encoder_hidden_size=200,
net_size=300,
meta_batch_size=16,
num_steps_per_epoch=2000,
num_initial_steps=2000,
num_tasks_sample=5,
num_steps_prior=400,
num_extra_rl_steps_posterior=600,
batch_size=256,
embedding_batch_size=100,
embedding_mini_batch_size=100,
max_episode_length=200,
reward_scale=5.,
use_gpu=False):
"""Train PEARL with HalfCheetahVel environment.
Args:
ctxt (garage.experiment.ExperimentContext): The experiment
configuration used by Trainer to create the snapshotter.
seed (int): Used to seed the random number generator to produce
determinism.
num_epochs (int): Number of training epochs.
num_train_tasks (int): Number of tasks for training.
num_test_tasks (int): Number of tasks to use for testing.
latent_size (int): Size of latent context vector.
encoder_hidden_size (int): Output dimension of dense layer of the
context encoder.
net_size (int): Output dimension of a dense layer of Q-function and
value function.
meta_batch_size (int): Meta batch size.
num_steps_per_epoch (int): Number of iterations per epoch.
num_initial_steps (int): Number of transitions obtained per task before
training.
num_tasks_sample (int): Number of random tasks to obtain data for each
iteration.
num_steps_prior (int): Number of transitions to obtain per task with
z ~ prior.
num_extra_rl_steps_posterior (int): Number of additional transitions
to obtain per task with z ~ posterior that are only used to train
the policy and NOT the encoder.
batch_size (int): Number of transitions in RL batch.
embedding_batch_size (int): Number of transitions in context batch.
embedding_mini_batch_size (int): Number of transitions in mini context
batch; should be same as embedding_batch_size for non-recurrent
encoder.
max_episode_length (int): Maximum episode length.
reward_scale (int): Reward scale.
use_gpu (bool): Whether or not to use GPU for training.
"""
set_seed(seed)
encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
encoder_hidden_size)
# create multi-task environment and sample tasks
env_sampler = SetTaskSampler(
HalfCheetahVelEnv,
wrapper=lambda env, _: normalize(
GymEnv(env, max_episode_length=max_episode_length)))
env = env_sampler.sample(num_train_tasks)
test_env_sampler = SetTaskSampler(
HalfCheetahVelEnv,
wrapper=lambda env, _: normalize(
GymEnv(env, max_episode_length=max_episode_length)))
trainer = Trainer(ctxt)
# instantiate networks
augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
qf = ContinuousMLPQFunction(env_spec=augmented_env,
hidden_sizes=[net_size, net_size, net_size])
vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
vf = ContinuousMLPQFunction(env_spec=vf_env,
hidden_sizes=[net_size, net_size, net_size])
inner_policy = TanhGaussianMLPPolicy(
env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])
pearl = PEARL(
env=env,
policy_class=ContextConditionedPolicy,
encoder_class=MLPEncoder,
inner_policy=inner_policy,
qf=qf,
vf=vf,
num_train_tasks=num_train_tasks,
num_test_tasks=num_test_tasks,
latent_dim=latent_size,
encoder_hidden_sizes=encoder_hidden_sizes,
test_env_sampler=test_env_sampler,
meta_batch_size=meta_batch_size,
num_steps_per_epoch=num_steps_per_epoch,
num_initial_steps=num_initial_steps,
num_tasks_sample=num_tasks_sample,
num_steps_prior=num_steps_prior,
num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
batch_size=batch_size,
embedding_batch_size=embedding_batch_size,
embedding_mini_batch_size=embedding_mini_batch_size,
reward_scale=reward_scale,
)
set_gpu_mode(use_gpu, gpu_id=0)
if use_gpu:
pearl.to()
trainer.setup(algo=pearl,
env=env[0](),
sampler_cls=LocalSampler,
sampler_args=dict(max_episode_length=max_episode_length),
n_workers=1,
worker_class=PEARLWorker)
trainer.train(n_epochs=num_epochs, batch_size=batch_size)
pearl_half_cheetah_vel()
pearl_metaworld_ml1_push¶
#!/usr/bin/env python3
"""PEARL ML1 example."""
import click
import metaworld
from garage import wrap_experiment
from garage.envs import MetaWorldSetTaskEnv, normalize
from garage.experiment.deterministic import set_seed
from garage.experiment.task_sampler import SetTaskSampler
from garage.sampler import LocalSampler
from garage.torch import set_gpu_mode
from garage.torch.algos import PEARL
from garage.torch.algos.pearl import PEARLWorker
from garage.torch.embeddings import MLPEncoder
from garage.torch.policies import (ContextConditionedPolicy,
TanhGaussianMLPPolicy)
from garage.torch.q_functions import ContinuousMLPQFunction
from garage.trainer import Trainer
@click.command()
@click.option('--num_epochs', default=1000)
@click.option('--num_train_tasks', default=50)
@click.option('--encoder_hidden_size', default=200)
@click.option('--net_size', default=300)
@click.option('--num_steps_per_epoch', default=4000)
@click.option('--num_initial_steps', default=4000)
@click.option('--num_steps_prior', default=750)
@click.option('--num_extra_rl_steps_posterior', default=750)
@click.option('--batch_size', default=256)
@click.option('--embedding_batch_size', default=64)
@click.option('--embedding_mini_batch_size', default=64)
@wrap_experiment
def pearl_metaworld_ml1_push(ctxt=None,
seed=1,
num_epochs=1000,
num_train_tasks=50,
latent_size=7,
encoder_hidden_size=200,
net_size=300,
meta_batch_size=16,
num_steps_per_epoch=4000,
num_initial_steps=4000,
num_tasks_sample=15,
num_steps_prior=750,
num_extra_rl_steps_posterior=750,
batch_size=256,
embedding_batch_size=64,
embedding_mini_batch_size=64,
reward_scale=10.,
use_gpu=False):
"""Train PEARL with ML1 environments.
Args:
ctxt (garage.experiment.ExperimentContext): The experiment
configuration used by Trainer to create the snapshotter.
seed (int): Used to seed the random number generator to produce
determinism.
num_epochs (int): Number of training epochs.
num_train_tasks (int): Number of tasks for training.
latent_size (int): Size of latent context vector.
encoder_hidden_size (int): Output dimension of dense layer of the
context encoder.
net_size (int): Output dimension of a dense layer of Q-function and
value function.
meta_batch_size (int): Meta batch size.
num_steps_per_epoch (int): Number of iterations per epoch.
num_initial_steps (int): Number of transitions obtained per task before
training.
num_tasks_sample (int): Number of random tasks to obtain data for each
iteration.
num_steps_prior (int): Number of transitions to obtain per task with
z ~ prior.
num_extra_rl_steps_posterior (int): Number of additional transitions
to obtain per task with z ~ posterior that are only used to train
the policy and NOT the encoder.
batch_size (int): Number of transitions in RL batch.
embedding_batch_size (int): Number of transitions in context batch.
embedding_mini_batch_size (int): Number of transitions in mini context
batch; should be same as embedding_batch_size for non-recurrent
encoder.
reward_scale (int): Reward scale.
use_gpu (bool): Whether or not to use GPU for training.
"""
set_seed(seed)
encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
encoder_hidden_size)
# create multi-task environment and sample tasks
ml1 = metaworld.ML1('push-v1')
train_env = MetaWorldSetTaskEnv(ml1, 'train')
env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
env=train_env,
wrapper=lambda env, _: normalize(env))
env = env_sampler.sample(num_train_tasks)
test_env = MetaWorldSetTaskEnv(ml1, 'test')
test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
env=test_env,
wrapper=lambda env, _: normalize(env))
trainer = Trainer(ctxt)
# instantiate networks
augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
qf = ContinuousMLPQFunction(env_spec=augmented_env,
hidden_sizes=[net_size, net_size, net_size])
vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
vf = ContinuousMLPQFunction(env_spec=vf_env,
hidden_sizes=[net_size, net_size, net_size])
inner_policy = TanhGaussianMLPPolicy(
env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])
pearl = PEARL(
env=env,
policy_class=ContextConditionedPolicy,
encoder_class=MLPEncoder,
inner_policy=inner_policy,
qf=qf,
vf=vf,
num_train_tasks=num_train_tasks,
latent_dim=latent_size,
encoder_hidden_sizes=encoder_hidden_sizes,
test_env_sampler=test_env_sampler,
meta_batch_size=meta_batch_size,
num_steps_per_epoch=num_steps_per_epoch,
num_initial_steps=num_initial_steps,
num_tasks_sample=num_tasks_sample,
num_steps_prior=num_steps_prior,
num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
batch_size=batch_size,
embedding_batch_size=embedding_batch_size,
embedding_mini_batch_size=embedding_mini_batch_size,
reward_scale=reward_scale,
)
set_gpu_mode(use_gpu, gpu_id=0)
if use_gpu:
pearl.to()
trainer.setup(algo=pearl,
env=env[0](),
sampler_cls=LocalSampler,
n_workers=1,
worker_class=PEARLWorker)
trainer.train(n_epochs=num_epochs, batch_size=batch_size)
pearl_metaworld_ml1_push()
pearl_metaworld_ml10¶
#!/usr/bin/env python3
"""PEARL ML10 example."""
import click
import metaworld
from garage import wrap_experiment
from garage.envs import MetaWorldSetTaskEnv, normalize
from garage.experiment.deterministic import set_seed
from garage.experiment.task_sampler import SetTaskSampler
from garage.sampler import LocalSampler
from garage.torch import set_gpu_mode
from garage.torch.algos import PEARL
from garage.torch.algos.pearl import PEARLWorker
from garage.torch.embeddings import MLPEncoder
from garage.torch.policies import (ContextConditionedPolicy,
TanhGaussianMLPPolicy)
from garage.torch.q_functions import ContinuousMLPQFunction
from garage.trainer import Trainer
@click.command()
@click.option('--num_epochs', default=1000)
@click.option('--num_train_tasks', default=10)
@click.option('--encoder_hidden_size', default=200)
@click.option('--net_size', default=300)
@click.option('--num_steps_per_epoch', default=4000)
@click.option('--num_initial_steps', default=4000)
@click.option('--num_steps_prior', default=750)
@click.option('--num_extra_rl_steps_posterior', default=750)
@click.option('--batch_size', default=256)
@click.option('--embedding_batch_size', default=64)
@click.option('--embedding_mini_batch_size', default=64)
@wrap_experiment
def pearl_metaworld_ml10(ctxt=None,
seed=1,
num_epochs=1000,
num_train_tasks=10,
latent_size=7,
encoder_hidden_size=200,
net_size=300,
meta_batch_size=16,
num_steps_per_epoch=4000,
num_initial_steps=4000,
num_tasks_sample=15,
num_steps_prior=750,
num_extra_rl_steps_posterior=750,
batch_size=256,
embedding_batch_size=64,
embedding_mini_batch_size=64,
reward_scale=10.,
use_gpu=False):
"""Train PEARL with ML10 environments.
Args:
ctxt (garage.experiment.ExperimentContext): The experiment
configuration used by Trainer to create the snapshotter.
seed (int): Used to seed the random number generator to produce
determinism.
num_epochs (int): Number of training epochs.
num_train_tasks (int): Number of tasks for training.
latent_size (int): Size of latent context vector.
encoder_hidden_size (int): Output dimension of dense layer of the
context encoder.
net_size (int): Output dimension of a dense layer of Q-function and
value function.
meta_batch_size (int): Meta batch size.
num_steps_per_epoch (int): Number of iterations per epoch.
num_initial_steps (int): Number of transitions obtained per task before
training.
num_tasks_sample (int): Number of random tasks to obtain data for each
iteration.
num_steps_prior (int): Number of transitions to obtain per task with
z ~ prior.
num_extra_rl_steps_posterior (int): Number of additional transitions
to obtain per task with z ~ posterior that are only used to train
the policy and NOT the encoder.
batch_size (int): Number of transitions in RL batch.
embedding_batch_size (int): Number of transitions in context batch.
embedding_mini_batch_size (int): Number of transitions in mini context
batch; should be same as embedding_batch_size for non-recurrent
encoder.
reward_scale (int): Reward scale.
use_gpu (bool): Whether or not to use GPU for training.
"""
set_seed(seed)
encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
encoder_hidden_size)
ml10 = metaworld.ML10()
train_env = MetaWorldSetTaskEnv(ml10, 'train')
env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
env=train_env,
wrapper=lambda env, _: normalize(env))
env = env_sampler.sample(num_train_tasks)
test_env = MetaWorldSetTaskEnv(ml10, 'test')
test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
env=test_env,
wrapper=lambda env, _: normalize(env))
trainer = Trainer(ctxt)
# instantiate networks
augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
qf = ContinuousMLPQFunction(env_spec=augmented_env,
hidden_sizes=[net_size, net_size, net_size])
vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
vf = ContinuousMLPQFunction(env_spec=vf_env,
hidden_sizes=[net_size, net_size, net_size])
inner_policy = TanhGaussianMLPPolicy(
env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])
pearl = PEARL(
env=env,
policy_class=ContextConditionedPolicy,
encoder_class=MLPEncoder,
inner_policy=inner_policy,
qf=qf,
vf=vf,
num_train_tasks=num_train_tasks,
latent_dim=latent_size,
encoder_hidden_sizes=encoder_hidden_sizes,
test_env_sampler=test_env_sampler,
meta_batch_size=meta_batch_size,
num_steps_per_epoch=num_steps_per_epoch,
num_initial_steps=num_initial_steps,
num_tasks_sample=num_tasks_sample,
num_steps_prior=num_steps_prior,
num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
batch_size=batch_size,
embedding_batch_size=embedding_batch_size,
embedding_mini_batch_size=embedding_mini_batch_size,
reward_scale=reward_scale,
)
set_gpu_mode(use_gpu, gpu_id=0)
if use_gpu:
pearl.to()
trainer.setup(algo=pearl,
env=env[0](),
sampler_cls=LocalSampler,
n_workers=1,
worker_class=PEARLWorker)
trainer.train(n_epochs=num_epochs, batch_size=batch_size)
pearl_metaworld_ml10()
pearl_metaworld_ml45¶
#!/usr/bin/env python3
"""PEARL ML45 example."""
import click
import metaworld
from garage import wrap_experiment
from garage.envs import MetaWorldSetTaskEnv, normalize
from garage.experiment.deterministic import set_seed
from garage.experiment.task_sampler import SetTaskSampler
from garage.sampler import LocalSampler
from garage.torch import set_gpu_mode
from garage.torch.algos import PEARL
from garage.torch.algos.pearl import PEARLWorker
from garage.torch.embeddings import MLPEncoder
from garage.torch.policies import (ContextConditionedPolicy,
TanhGaussianMLPPolicy)
from garage.torch.q_functions import ContinuousMLPQFunction
from garage.trainer import Trainer
@click.command()
@click.option('--num_epochs', default=1000)
@click.option('--num_train_tasks', default=45)
@click.option('--encoder_hidden_size', default=200)
@click.option('--net_size', default=300)
@click.option('--num_steps_per_epoch', default=4000)
@click.option('--num_initial_steps', default=4000)
@click.option('--num_steps_prior', default=750)
@click.option('--num_extra_rl_steps_posterior', default=750)
@click.option('--batch_size', default=256)
@click.option('--embedding_batch_size', default=64)
@click.option('--embedding_mini_batch_size', default=64)
@wrap_experiment
def pearl_metaworld_ml45(ctxt=None,
seed=1,
num_epochs=1000,
num_train_tasks=45,
latent_size=7,
encoder_hidden_size=200,
net_size=300,
meta_batch_size=16,
num_steps_per_epoch=4000,
num_initial_steps=4000,
num_tasks_sample=15,
num_steps_prior=750,
num_extra_rl_steps_posterior=750,
batch_size=256,
embedding_batch_size=64,
embedding_mini_batch_size=64,
reward_scale=10.,
use_gpu=False):
"""Train PEARL with ML45 environments.
Args:
ctxt (garage.experiment.ExperimentContext): The experiment
configuration used by Trainer to create the snapshotter.
seed (int): Used to seed the random number generator to produce
determinism.
num_epochs (int): Number of training epochs.
num_train_tasks (int): Number of tasks for training.
latent_size (int): Size of latent context vector.
encoder_hidden_size (int): Output dimension of dense layer of the
context encoder.
net_size (int): Output dimension of a dense layer of Q-function and
value function.
meta_batch_size (int): Meta batch size.
num_steps_per_epoch (int): Number of iterations per epoch.
num_initial_steps (int): Number of transitions obtained per task before
training.
num_tasks_sample (int): Number of random tasks to obtain data for each
iteration.
num_steps_prior (int): Number of transitions to obtain per task with
z ~ prior.
num_extra_rl_steps_posterior (int): Number of additional transitions
to obtain per task with z ~ posterior that are only used to train
the policy and NOT the encoder.
batch_size (int): Number of transitions in RL batch.
embedding_batch_size (int): Number of transitions in context batch.
embedding_mini_batch_size (int): Number of transitions in mini context
batch; should be same as embedding_batch_size for non-recurrent
encoder.
reward_scale (int): Reward scale.
use_gpu (bool): Whether or not to use GPU for training.
"""
set_seed(seed)
encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
encoder_hidden_size)
ml45 = metaworld.ML45()
train_env = MetaWorldSetTaskEnv(ml45, 'train')
env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
env=train_env,
wrapper=lambda env, _: normalize(env))
env = env_sampler.sample(num_train_tasks)
test_env = MetaWorldSetTaskEnv(ml45, 'test')
test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
env=test_env,
wrapper=lambda env, _: normalize(env))
trainer = Trainer(ctxt)
# instantiate networks
augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
qf = ContinuousMLPQFunction(env_spec=augmented_env,
hidden_sizes=[net_size, net_size, net_size])
vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
vf = ContinuousMLPQFunction(env_spec=vf_env,
hidden_sizes=[net_size, net_size, net_size])
inner_policy = TanhGaussianMLPPolicy(
env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])
pearl = PEARL(
env=env,
policy_class=ContextConditionedPolicy,
encoder_class=MLPEncoder,
inner_policy=inner_policy,
qf=qf,
vf=vf,
num_train_tasks=num_train_tasks,
latent_dim=latent_size,
encoder_hidden_sizes=encoder_hidden_sizes,
test_env_sampler=test_env_sampler,
meta_batch_size=meta_batch_size,
num_steps_per_epoch=num_steps_per_epoch,
num_initial_steps=num_initial_steps,
num_tasks_sample=num_tasks_sample,
num_steps_prior=num_steps_prior,
num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
batch_size=batch_size,
embedding_batch_size=embedding_batch_size,
embedding_mini_batch_size=embedding_mini_batch_size,
reward_scale=reward_scale,
)
set_gpu_mode(use_gpu, gpu_id=0)
if use_gpu:
pearl.to()
trainer.setup(algo=pearl,
env=env[0](),
sampler_cls=LocalSampler,
n_workers=1,
worker_class=PEARLWorker)
trainer.train(n_epochs=num_epochs, batch_size=batch_size)
pearl_metaworld_ml45()
References¶
[1] Kate Rakelly, Aurick Zhou, Deirdre Quillen, Chelsea Finn, and Sergey Levine. Efficient off-policy meta-reinforcement learning via probabilistic context variables. arXiv preprint arXiv:1903.08254, 2019.
This page was authored by Iris Liu (@irisliucy).