"""garage.envs.normalized_env.

A gym.Wrapper that rescales Box actions to the environment's bounds and can
optionally normalize observations and rewards with running mean/variance
estimates.
"""

import gym
import gym.spaces
import gym.spaces.utils
import numpy as np


class NormalizedEnv(gym.Wrapper):
    """An environment wrapper for normalization.

    Rescales Box actions to the environment's bounds and optionally
    normalizes observations and rewards with exponential moving averages
    of their mean and variance.
    """

    def __init__(self,
                 env,
                 scale_reward=1.,
                 normalize_obs=False,
                 normalize_reward=False,
                 flatten_obs=True,
                 obs_alpha=0.001,
                 reward_alpha=0.001):
        super().__init__(env)
        self._scale_reward = scale_reward
        self._normalize_obs = normalize_obs
        self._normalize_reward = normalize_reward
        self._flatten_obs = flatten_obs
        self._obs_alpha = obs_alpha
        flat_obs_dim = gym.spaces.utils.flatdim(env.observation_space)
        self._obs_mean = np.zeros(flat_obs_dim)
        self._obs_var = np.ones(flat_obs_dim)
        self._reward_alpha = reward_alpha
        self._reward_mean = 0.
        self._reward_var = 1.

    def _update_obs_estimate(self, obs):
        # Exponential moving average of the flattened observation's mean
        # and variance, with smoothing factor obs_alpha.
        flat_obs = gym.spaces.utils.flatten(self.env.observation_space, obs)
        self._obs_mean = (
            1 - self._obs_alpha) * self._obs_mean + self._obs_alpha * flat_obs
        self._obs_var = (
            1 - self._obs_alpha) * self._obs_var + self._obs_alpha * np.square(
                flat_obs - self._obs_mean)

    def _update_reward_estimate(self, reward):
        # Exponential moving average of the reward's mean and variance,
        # with smoothing factor reward_alpha.
        self._reward_mean = (1 - self._reward_alpha) * \
            self._reward_mean + self._reward_alpha * reward
        self._reward_var = (
            1 - self._reward_alpha
        ) * self._reward_var + self._reward_alpha * np.square(
            reward - self._reward_mean)

    def _apply_normalize_obs(self, obs):
        # Standardize the observation with the running estimates.
        self._update_obs_estimate(obs)
        flat_obs = gym.spaces.utils.flatten(self.env.observation_space, obs)
        normalized_obs = (flat_obs - self._obs_mean) / (
            np.sqrt(self._obs_var) + 1e-8)
        if not self._flatten_obs:
            normalized_obs = gym.spaces.utils.unflatten(
                self.env.observation_space, normalized_obs)
        return normalized_obs

    def _apply_normalize_reward(self, reward):
        # Standardize the reward with the running variance estimate.
        self._update_reward_estimate(reward)
        return reward / (np.sqrt(self._reward_var) + 1e-8)
    def reset(self, **kwargs):
        ret = self.env.reset(**kwargs)
        if self._normalize_obs:
            return self._apply_normalize_obs(ret)
        else:
            return ret
    def step(self, action):
        if isinstance(self.action_space, gym.spaces.Box):
            # Rescale the action from [-1, 1] to [lb, ub] when both bounds
            # are finite.
            lb, ub = self.action_space.low, self.action_space.high
            if np.all(lb != -np.inf) and np.all(ub != np.inf):
                scaled_action = lb + (action + 1.) * 0.5 * (ub - lb)
                scaled_action = np.clip(scaled_action, lb, ub)
            else:
                scaled_action = action
        else:
            scaled_action = action
        next_obs, reward, done, info = self.env.step(scaled_action)
        if self._normalize_obs:
            next_obs = self._apply_normalize_obs(next_obs)
        if self._normalize_reward:
            reward = self._apply_normalize_reward(reward)
        return next_obs, reward * self._scale_reward, done, info
    def log_diagnostics(self, paths):
        pass
    def render(self, *args, **kwargs):
        return self.env.render(*args, **kwargs)
    def max_episode_steps(self):
        return self.env.spec.max_episode_steps
normalize = NormalizedEnv
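
# A minimal usage sketch, assuming gym's 'Pendulum-v0' is available; it is
# used only for illustration because its Box action space exercises the
# action rescaling in step(). A real training setup would instead pass the
# wrapped env to a garage algorithm.
if __name__ == '__main__':
    env = normalize(gym.make('Pendulum-v0'),
                    scale_reward=0.1,
                    normalize_obs=True,
                    normalize_reward=True)
    obs = env.reset()
    for _ in range(5):
        # A policy trained against this wrapper emits actions in [-1, 1];
        # here a zero action stands in for the policy output.
        action = np.zeros(env.action_space.shape)
        obs, reward, done, _ = env.step(action)
        if done:
            obs = env.reset()
    env.close()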