Source code for garage.sampler.off_policy_vectorized_sampler

"""This module implements a Vectorized Sampler used for OffPolicy Algorithms.

It differs from OnPolicyVectorizedSampler in two ways:
 - The number of environments is defined by rollout_batch_size. In
 OnPolicyVectorizedSampler, the number of environments can be derived from
 batch_size and max_path_length, but off-policy algorithms usually sample
 transitions from a replay buffer, which only defines buffer_batch_size.
 - It needs to add transitions to the replay buffer throughout the rollout.
"""

import itertools
import warnings

import cloudpickle
import numpy as np

from garage.experiment import deterministic
from garage.misc import tensor_utils
from garage.sampler.batch_sampler import BatchSampler
from garage.sampler.vec_env_executor import VecEnvExecutor


class OffPolicyVectorizedSampler(BatchSampler):
    """This class implements OffPolicyVectorizedSampler.

    Args:
        algo (garage.np.RLAlgorithm): Algorithm.
        env (garage.envs.GarageEnv): Environment.
        n_envs (int): Number of parallel environments managed by sampler.
        no_reset (bool): Reset environment between samples or not.

    """

    def __init__(self, algo, env, n_envs=None, no_reset=True):
        if n_envs is None:
            n_envs = int(algo.rollout_batch_size)
        super().__init__(algo, env)
        self._n_envs = n_envs
        self._no_reset = no_reset

        self._vec_env = None
        self._env_spec = self.env.spec

        self._last_obses = None
        self._last_uncounted_discount = [0] * n_envs
        self._last_running_length = [0] * n_envs
        self._last_success_count = [0] * n_envs

        warnings.warn(
            DeprecationWarning(
                'OffPolicyVectorizedSampler is deprecated, and will be '
                'removed in the next release. Please use VecWorker and one '
                'of the new samplers which implement garage.sampler.Sampler, '
                'such as RaySampler.'))

    def start_worker(self):
        """Initialize the sampler."""
        n_envs = self._n_envs
        envs = [
            cloudpickle.loads(cloudpickle.dumps(self.env))
            for _ in range(n_envs)
        ]

        # Deterministically set environment seeds based on the global seed.
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)

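    # A hedged aside, not part of the original module: the per-env seeds above are
    # derived from the global seed, so setting that seed once before start_worker()
    # is called makes the parallel rollouts reproducible. A minimal sketch (the
    # value 7 is arbitrary):
    #
    #     from garage.experiment import deterministic
    #     deterministic.set_seed(7)  # envs are then seeded 7, 8, ..., 7 + n_envs - 1
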
    def shutdown_worker(self):
        """Terminate workers if necessary."""
        self._vec_env.close()

    # pylint: disable=too-many-branches, too-many-statements
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect samples for the given iteration number.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of environment interactions in one batch.
            whole_paths (bool): Not used. Kept only to comply with the base
                class.

        Raises:
            ValueError: If the algorithm doesn't have an exploration_policy
                field.

        Returns:
            list: A list of paths.

        """
        assert batch_size is not None
        paths = []
        if not self._no_reset or self._last_obses is None:
            obses = self._vec_env.reset()
        else:
            obses = self._last_obses
        completes = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs

        n_samples = 0
        policy = self.algo.exploration_policy
        if policy is None:
            raise ValueError('OffPolicyVectorizedSampler should only be used '
                             'with an exploration_policy.')
        while n_samples < batch_size:
            policy.reset(completes)

            obs_space = self.algo.env_spec.observation_space
            input_obses = obs_space.flatten_n(obses)

            actions, agent_infos = policy.get_actions(input_obses)
            next_obses, rewards, dones, env_infos, completes = \
                self._vec_env.step(actions)
            self._last_obses = next_obses
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]

            for (idx, reward, env_info, done, complete, obs, next_obs,
                 action) in zip(itertools.count(), rewards, env_infos, dones,
                                completes, obses, next_obses, actions):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        observations=[],
                        next_observations=[],
                        actions=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=self._last_uncounted_discount[idx],
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # because a path may not be completed in one batch
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['observations'].append(obs)
                running_paths[idx]['next_observations'].append(next_obs)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += env_info.get(
                    'is_success') or 0
                self._last_running_length[idx] += 1

                if complete or n_samples >= batch_size:
                    paths.append(
                        dict(
                            rewards=np.asarray(running_paths[idx]['rewards']),
                            dones=np.asarray(running_paths[idx]['dones']),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]['env_infos']),
                            running_length=running_paths[idx]
                            ['running_length'],
                            undiscounted_return=running_paths[idx]
                            ['undiscounted_return'],
                            success_count=running_paths[idx]['success_count']))

                    act_space = self._env_spec.action_space
                    path_dict = {}
                    path_dict['observations'] = obs_space.flatten_n(
                        running_paths[idx]['observations'])
                    path_dict['next_observations'] = obs_space.flatten_n(
                        running_paths[idx]['next_observations'])
                    path_dict['rewards'] = np.asarray(
                        running_paths[idx]['rewards']).reshape(-1, 1)
                    path_dict['terminals'] = np.asarray(
                        running_paths[idx]['dones']).reshape(-1, 1)
                    path_dict['actions'] = act_space.flatten_n(
                        running_paths[idx]['actions'])

                    self.algo.replay_buffer.add_path(path_dict)
                    running_paths[idx] = None

                    if done:
                        self._last_running_length[idx] = 0
                        self._last_success_count[idx] = 0
                        self._last_uncounted_discount[idx] = 0
            obses = next_obses
        return paths
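
A minimal end-to-end sketch of the sampler's lifecycle, assuming my_algo is an
off-policy algorithm (e.g. DDPG) that exposes exploration_policy,
rollout_batch_size, max_path_length, env_spec and replay_buffer, and my_env is a
GarageEnv; both names and the batch size are hypothetical stand-ins, not defined
in this module:

    sampler = OffPolicyVectorizedSampler(my_algo, my_env)
    sampler.start_worker()                          # spin up n_envs copies of my_env
    paths = sampler.obtain_samples(itr=0, batch_size=250)
    # Observations and actions were already pushed into my_algo.replay_buffer during
    # the rollout; the returned paths carry rewards, dones, env_infos and per-path
    # statistics such as undiscounted_return and success_count.
    sampler.shutdown_worker()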