Source code for garage.np.exploration_strategies.ou_strategy

"""
This module creates an OU exploration strategy.

The Ornstein-Uhlenbeck (OU) exploration strategy draws its noise from an
Ornstein-Uhlenbeck process. It is often used with the DDPG algorithm because,
in continuous control tasks, temporally correlated exploration produces
smoother transitions, and the OU process is relatively smooth in time.
"""
import numpy as np

from garage.np.exploration_strategies.base import ExplorationStrategy


class OUStrategy(ExplorationStrategy):
    """
    An OU exploration strategy to add noise to environment actions.

    Args:
        env_spec: Environment specification for OUStrategy to explore.
        mu: Long-run mean that the process reverts to.
        sigma: Standard deviation (scale) of the noise.
        theta: Rate of mean reversion.
        dt: Time step of the discretized process.
        x0: Initial state.

    Example:
        $ python garage/np/exploration_strategies/ou_strategy.py

    """

    def __init__(self,
                 env_spec,
                 mu=0,
                 sigma=0.3,
                 theta=0.15,
                 dt=1e-2,
                 x0=None):
        self.env_spec = env_spec
        self.action_space = env_spec.action_space
        self.action_dim = self.action_space.flat_dim
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()
    def simulate(self):
        """
        Compute the next state of the exploration.

        Returns:
            self.state: Next state of the exploration.
        """
        x = self.state
        dx = self.theta * (self.mu - x) * self.dt + self.sigma * np.sqrt(
            self.dt) * np.random.normal(size=len(x))
        self.state = x + dx
        return self.state
    def reset(self):
        """Reset the state of the exploration."""
        # Start from x0 if it is given, otherwise from an array filled with
        # the long-run mean mu.
        self.state = self.x0 if self.x0 is not None else self.mu * np.ones(
            self.action_dim)
    def get_action(self, t, observation, policy, **kwargs):
        """
        Return an action with noise.

        Args:
            t: Iteration.
            observation: Observation from the environment.
            policy: Policy network to predict action based on the observation.

        Returns:
            An action with noise explored by OUStrategy.
        """
        action, agent_infos = policy.get_action(observation)
        ou_state = self.simulate()
        return np.clip(action + ou_state, self.action_space.low,
                       self.action_space.high), agent_infos
    def get_actions(self, t, observations, policy, **kwargs):
        """
        Return a batch of actions with noise.

        Args:
            t: Iteration.
            observations: Observations from the environment.
            policy: Policy network to predict actions based on the
                observations.

        Returns:
            Actions with noise explored by OUStrategy.
        """
        actions, agent_infos = policy.get_actions(observations)
        ou_state = self.simulate()
        return np.clip(actions + ou_state, self.action_space.low,
                       self.action_space.high), agent_infos
if __name__ == '__main__':
    import gym
    import matplotlib.pyplot as plt

    ou = OUStrategy(env_spec=gym.make('Pendulum-v0'),
                    mu=0,
                    theta=0.15,
                    sigma=0.3)
    states = []
    for i in range(1000):
        states.append(ou.simulate()[0])
    plt.plot(states)
    plt.show()
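
# A minimal usage sketch, assuming an ``env``, a policy exposing
# ``get_action``, and a ``max_path_length`` (illustrative names, not part of
# this module): the strategy perturbs the policy's actions with OU noise
# inside a rollout loop.
#
#     ou.reset()
#     obs = env.reset()
#     for t in range(max_path_length):
#         action, agent_info = ou.get_action(t, obs, policy)
#         obs, reward, done, _ = env.step(action)
#         if done:
#             break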