Source code for garage.np.exploration_strategies.epsilon_greedy_strategy

"""
ϵ-greedy exploration strategy.

Random exploration according to the value of epsilon.
"""
import numpy as np

from garage.np.exploration_strategies.base import ExplorationStrategy


class EpsilonGreedyStrategy(ExplorationStrategy):
    """ϵ-greedy exploration strategy.

    Select an action based on the value of ϵ. ϵ decreases linearly from
    max_epsilon to min_epsilon within the first
    decay_ratio * total_timesteps steps.

    At state s, with probability
        1 − ϵ: select action = argmax Q(s, a)
        ϵ    : select a random action from a uniform distribution.

    Args:
        env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
        total_timesteps (int): Total steps in the training, equivalent to
            max_path_length * n_epochs.
        max_epsilon (float): The maximum (starting) value of epsilon.
        min_epsilon (float): The minimum (terminal) value of epsilon.
        decay_ratio (float): Fraction of total steps for epsilon decay.

    """

    def __init__(self,
                 env_spec,
                 total_timesteps,
                 max_epsilon=1.0,
                 min_epsilon=0.02,
                 decay_ratio=0.1):
        self._env_spec = env_spec
        self._max_epsilon = max_epsilon
        self._min_epsilon = min_epsilon
        self._decay_period = int(total_timesteps * decay_ratio)
        self._action_space = env_spec.action_space
        self._epsilon = self._max_epsilon
        self._decrement = (self._max_epsilon -
                           self._min_epsilon) / self._decay_period
    def get_action(self, t, observation, policy, **kwargs):
        """Get an action from this policy for the input observation.

        Args:
            t: Iteration.
            observation: Observation from the environment.
            policy: Policy network to predict the action based on the
                observation.

        Returns:
            opt_action: Optimal action from this policy.
            dict: Empty agent info dictionary.

        """
        opt_action = policy.get_action(observation)
        self._decay()
        if np.random.random() < self._epsilon:
            opt_action = self._action_space.sample()
        return opt_action, dict()
    def get_actions(self, t, observations, policy, **kwargs):
        """Get actions from this policy for the input observations.

        Args:
            t: Iteration.
            observations: Observations from the environment.
            policy: Policy network to predict actions based on the
                observations.

        Returns:
            opt_actions: Optimal actions from this policy.
            dict: Empty agent info dictionary.

        """
        opt_actions = policy.get_actions(observations)
        for itr in range(len(opt_actions)):
            self._decay()
            if np.random.random() < self._epsilon:
                opt_actions[itr] = self._action_space.sample()
        return opt_actions, dict()
    def _decay(self):
        """Linearly anneal epsilon until it reaches min_epsilon."""
        if self._epsilon > self._min_epsilon:
            self._epsilon -= self._decrement
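

# A minimal usage sketch (illustrative only, not part of the garage source).
# `_FakeSpec` and `_GreedyStub` below are hypothetical stand-ins for a real
# EnvSpec and a Q-function-derived policy; the strategy only needs
# `action_space.sample()` and `policy.get_action(observation)`.
#
# With total_timesteps=10000 and decay_ratio=0.1, epsilon is decremented by
# (1.0 - 0.02) / 1000 per call to _decay(), so it reaches min_epsilon after
# roughly the first 1,000 steps and stays there afterwards.
#
# import gym.spaces
#
#
# class _FakeSpec:
#     """Stand-in env spec exposing only a discrete action space."""
#
#     action_space = gym.spaces.Discrete(4)
#
#
# class _GreedyStub:
#     """Stand-in policy that always returns the same 'greedy' action."""
#
#     def get_action(self, observation):
#         return 0
#
#
# strategy = EpsilonGreedyStrategy(env_spec=_FakeSpec(),
#                                  total_timesteps=10000,
#                                  max_epsilon=1.0,
#                                  min_epsilon=0.02,
#                                  decay_ratio=0.1)
# for t in range(2000):
#     action, _ = strategy.get_action(t, observation=None,
#                                     policy=_GreedyStub())
#     # With probability epsilon the returned action is a random sample from
#     # the action space; otherwise it is the stub's greedy action (0).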