Behavioral Cloning

Paper

Model-Free Imitation Learning with Policy Optimization [1]

Framework(s)

PyTorch

API Reference

garage.torch.algos.BC

Code

garage/torch/algos/bc.py

Examples

bc_point, bc_point_deterministic_policy

Behavioral cloning is a simple imitation learning algorithm that maximizes the likelihood of an expert demonstration’s actions under the apprentice policy using direct policy optimization. Garage’s implementation can use either an expert policy or a dataset of demonstrations as the source of expert behavior.
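
The update itself is small enough to sketch in a few lines. Below is a minimal, self-contained illustration of the idea, not garage's internal implementation: a toy Gaussian apprentice (the names GaussianApprentice and bc_step are illustrative only) is fit to expert (observation, action) pairs with either a negative log-likelihood loss or a mean-squared-error loss, mirroring the 'log_prob' and 'mse' loss options listed below.

import torch
from torch import nn


class GaussianApprentice(nn.Module):
    """Toy apprentice policy: a Gaussian with a state-independent std."""

    def __init__(self, obs_dim, act_dim, hidden_size=8):
        super().__init__()
        self._mean = nn.Sequential(nn.Linear(obs_dim, hidden_size), nn.Tanh(),
                                   nn.Linear(hidden_size, act_dim))
        self._log_std = nn.Parameter(torch.zeros(act_dim))

    def forward(self, observations):
        return torch.distributions.Normal(self._mean(observations),
                                          self._log_std.exp())


def bc_step(policy, optimizer, expert_obs, expert_actions, loss='log_prob'):
    """Take one behavioral cloning gradient step on a batch of expert data."""
    dist = policy(expert_obs)
    if loss == 'log_prob':
        # Maximize the likelihood of the expert's actions under the
        # apprentice, i.e. minimize the negative log-likelihood.
        objective = -dist.log_prob(expert_actions).sum(dim=-1).mean()
    else:
        # 'mse': regress the apprentice's mean action onto the expert's.
        objective = ((dist.mean - expert_actions) ** 2).mean()
    optimizer.zero_grad()
    objective.backward()
    optimizer.step()
    return objective.item()


# Usage with random stand-in data; in garage these batches come from the
# expert source (a policy or a dataset of demonstrations).
apprentice = GaussianApprentice(obs_dim=4, act_dim=2)
optim = torch.optim.Adam(apprentice.parameters(), lr=1e-3)
expert_obs = torch.randn(1000, 4)
expert_actions = torch.randn(1000, 2)
print(bc_step(apprentice, optim, expert_obs, expert_actions))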

Default Parameters

policy_optimizer = torch.optim.Adam
policy_lr = 1e-3
loss = 'log_prob'
batch_size = 1000
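
Assuming these names map one-to-one onto keyword arguments of the BC constructor (the examples below pass batch_size, source, policy_lr, and loss this way; see the API reference for the authoritative signature), an instantiation that spells out the defaults might look like the following sketch.

import torch

from garage.torch.algos import BC

# env, policy, and expert are assumed to be constructed as in the bc_point
# example below. The keyword values restate the defaults listed above;
# `policy_optimizer` is assumed to be accepted as a constructor keyword,
# matching the defaults table.
algo = BC(env.spec,
          policy,
          batch_size=1000,
          source=expert,
          policy_optimizer=torch.optim.Adam,
          policy_lr=1e-3,
          loss='log_prob')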

Examples

bc_point

#!/usr/bin/env python3
"""Example of using Behavioral Cloning."""
import click
import numpy as np

from garage import wrap_experiment
from garage.envs import PointEnv
from garage.torch.algos import BC
from garage.torch.policies import GaussianMLPPolicy, Policy
from garage.trainer import Trainer


class OptimalPolicy(Policy):
    """Optimal policy for PointEnv.

    Args:
        env_spec (EnvSpec): The environment spec.
        goal (np.ndarray): The goal location of the environment.

    """

    # No forward method
    # pylint: disable=abstract-method

    def __init__(self, env_spec, goal):
        super().__init__(env_spec, 'OptimalPolicy')
        self.goal = goal

    def get_action(self, observation):
        """Get action given observation.

        Args:
            observation (np.ndarray): Observation from PointEnv. Should have
                length at least 2.

        Returns:
            tuple:
                * np.ndarray: Optimal action in the environment. Has length 2.
                * dict[str, np.ndarray]: Agent info (empty).

        """
        return self.goal - observation[:2], {}

    def get_actions(self, observations):
        """Get actions given observations.

        Args:
            observations (np.ndarray): Observations from the environment.
                Has shape :math:`(B, O)`, where :math:`B` is the batch
                dimension and :math:`O` is the observation dimensionality (at
                least 2).

        Returns:
            tuple:
                * np.ndarray: Batch of optimal actions in the environment.
                    Has shape :math:`(B, 2)`, where :math:`B` is the batch
                    dimension.
                * dict[str, np.ndarray]: Agent info (empty).

        """
        return (self.goal[np.newaxis, :].repeat(len(observations), axis=0) -
                observations[:, :2]), {}


@click.command()
@click.option('--loss', type=str, default='log_prob')
@wrap_experiment
def bc_point(ctxt=None, loss='log_prob'):
    """Run Behavioral Cloning on garage.envs.PointEnv.

    Args:
        ctxt (ExperimentContext): Provided by wrap_experiment.
        loss (str): Either 'log_prob' or 'mse'

    """
    trainer = Trainer(ctxt)
    goal = np.array([1., 1.])
    env = PointEnv(goal=goal, max_episode_length=200)
    expert = OptimalPolicy(env.spec, goal=goal)
    policy = GaussianMLPPolicy(env.spec, [8, 8])
    batch_size = 1000
    algo = BC(env.spec,
              policy,
              batch_size=batch_size,
              source=expert,
              policy_lr=1e-2,
              loss=loss)
    trainer.setup(algo, env)
    trainer.train(100, batch_size=batch_size)


bc_point()

bc_point_deterministic_policy

Experiment Results

Figures: BC Mean Loss (../_images/bc_meanLoss.png) and BC Std Loss (../_images/bc_stdLoss.png)

#!/usr/bin/env python3
"""Example of using Behavioral Cloning."""
import numpy as np

from garage import wrap_experiment
from garage.envs import PointEnv
from garage.torch.algos import BC
from garage.torch.policies import DeterministicMLPPolicy, Policy
from garage.trainer import Trainer


class OptimalPolicy(Policy):
    """Optimal policy for PointEnv.

    Args:
        env_spec (EnvSpec): The environment spec.
        goal (np.ndarray): The goal location of the environment.

    """

    # No forward method
    # pylint: disable=abstract-method

    def __init__(self, env_spec, goal):
        super().__init__(env_spec, 'OptimalPolicy')
        self.goal = goal

    def get_action(self, observation):
        """Get action given observation.

        Args:
            observation (np.ndarray): Observation from PointEnv. Should have
                length at least 2.

        Returns:
            tuple:
                * np.ndarray: Optimal action in the environment. Has length 2.
                * dict[str, np.ndarray]: Agent info (empty).

        """
        return self.goal - observation[:2], {}

    def get_actions(self, observations):
        """Get actions given observations.

        Args:
            observations (np.ndarray): Observations from the environment.
                Has shape :math:`(B, O)`, where :math:`B` is the batch
                dimension and :math:`O` is the observation dimensionality (at
                least 2).

        Returns:
            tuple:
                * np.ndarray: Batch of optimal actions in the environment.
                    Has shape :math:`(B, 2)`, where :math:`B` is the batch
                    dimension.
                * dict[str, np.ndarray]: Agent info (empty).

        """
        return (self.goal[np.newaxis, :].repeat(len(observations), axis=0) -
                observations[:, :2]), {}


@wrap_experiment
def bc_point(ctxt=None):
    """Run Behavioral Cloning on garage.envs.PointEnv.

    Args:
        ctxt (ExperimentContext): Provided by wrap_experiment.

    """
    trainer = Trainer(ctxt)
    goal = np.array([1., 1.])
    env = PointEnv(goal=goal, max_episode_length=200)
    expert = OptimalPolicy(env.spec, goal=goal)
    policy = DeterministicMLPPolicy(env.spec, hidden_sizes=[8, 8])
    batch_size = 1000
    algo = BC(env.spec,
              policy,
              batch_size=batch_size,
              source=expert,
              policy_lr=1e-2,
              loss='mse')
    trainer.setup(algo, env)
    trainer.train(100, batch_size=batch_size)


bc_point()

References

[1] Jonathan Ho, Jayesh Gupta, and Stefano Ermon. Model-free imitation learning with policy optimization. In International Conference on Machine Learning, 2760–2769. 2016. URL: https://arxiv.org/abs/1605.08478.


This page was authored by Iris Liu (@irisliucy) with contributions from Ryan Julian (@ryanjulian).