diff --git a/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py b/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py index dc205c3562..666afa1732 100644 --- a/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py +++ b/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py @@ -6,7 +6,7 @@ from garage.experiment import deterministic from garage.sampler import RaySampler from garage.torch.algos import PPO as PyTorch_PPO -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer from garage.torch.policies import GaussianMLPPolicy as PyTorch_GMP from garage.torch.value_functions import GaussianMLPValueFunction from garage.trainer import Trainer @@ -45,15 +45,15 @@ def ppo_garage_pytorch(ctxt, env_id, seed): hidden_nonlinearity=torch.tanh, output_nonlinearity=None) - policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), - policy, - max_optimization_epochs=10, - minibatch_size=64) + policy_optimizer = MinibatchOptimizer((torch.optim.Adam, dict(lr=2.5e-4)), + policy, + max_optimization_epochs=10, + minibatch_size=64) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), - value_function, - max_optimization_epochs=10, - minibatch_size=64) + vf_optimizer = MinibatchOptimizer((torch.optim.Adam, dict(lr=2.5e-4)), + value_function, + max_optimization_epochs=10, + minibatch_size=64) sampler = RaySampler(agents=policy, envs=env, diff --git a/src/garage/torch/__init__.py b/src/garage/torch/__init__.py index 87c9ce4329..713dac97a6 100644 --- a/src/garage/torch/__init__.py +++ b/src/garage/torch/__init__.py @@ -1,23 +1,24 @@ """PyTorch-backed modules and algorithms.""" # yapf: disable -from garage.torch._dtypes import (ObservationBatch, ObservationOrder, - ShuffledOptimizationNotSupported, - observation_batch_to_packed_sequence) -from garage.torch._functions import (compute_advantages, dict_np_to_torch, - filter_valids, flatten_batch, - flatten_to_single_vector, global_device, - NonLinearity, np_to_torch, pad_to_last, - prefer_gpu, product_of_gaussians, - set_gpu_mode, soft_update_model, - torch_to_np, TransposeImage, - update_module_params) +from garage.torch._dtypes import (observation_batch_to_packed_sequence, + ObservationBatch, ObservationOrder, + ShuffledOptimizationNotSupported) +from garage.torch._functions import (as_tensor, compute_advantages, + dict_np_to_torch, filter_valids, + flatten_batch, flatten_to_single_vector, + global_device, NonLinearity, np_to_torch, + pad_to_last, prefer_gpu, + product_of_gaussians, set_gpu_mode, + soft_update_model, torch_to_np, + TransposeImage, update_module_params) # yapf: enable __all__ = [ - 'compute_advantages', 'dict_np_to_torch', 'filter_valids', 'flatten_batch', - 'global_device', 'np_to_torch', 'pad_to_last', 'prefer_gpu', - 'product_of_gaussians', 'set_gpu_mode', 'soft_update_model', 'torch_to_np', - 'update_module_params', 'NonLinearity', 'flatten_to_single_vector', - 'TransposeImage', 'ObservationBatch', 'ObservationOrder', - 'ShuffledOptimizationNotSupported', 'observation_batch_to_packed_sequence' + 'as_tensor', 'compute_advantages', 'dict_np_to_torch', 'filter_valids', + 'flatten_batch', 'global_device', 'np_to_torch', 'pad_to_last', + 'prefer_gpu', 'product_of_gaussians', 'set_gpu_mode', 'soft_update_model', + 'torch_to_np', 'update_module_params', 'NonLinearity', + 'flatten_to_single_vector', 'TransposeImage', 'ObservationBatch', + 'ObservationOrder', 
'ShuffledOptimizationNotSupported', + 'observation_batch_to_packed_sequence' ] diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py index f0be030ce4..3421bf8df3 100644 --- a/src/garage/torch/_dtypes.py +++ b/src/garage/torch/_dtypes.py @@ -79,12 +79,12 @@ def __new__(cls, observations, order, lengths=None): f'lengths has dtype {self.lengths.dtype}, but must have ' f'an integer dtype') total_size = sum(self.lengths) - if self.observations.shape[0] != total_size: + if self.shape[0] != total_size: raise ValueError( f'observations has batch size ' f'{self.observations.shape[0]}, but must have batch ' f'size {total_size} to match lengths') - assert self.observations.shape[0] == total_size + assert self.shape[0] == total_size elif self.lengths is not None: raise ValueError( f'lengths has value {self.lengths}, but must be None ' diff --git a/src/garage/torch/_functions.py b/src/garage/torch/_functions.py index 76d096f603..e795b20bcd 100644 --- a/src/garage/torch/_functions.py +++ b/src/garage/torch/_functions.py @@ -92,13 +92,13 @@ def discount_cumsum(x, discount): discount, dtype=torch.float, device=x.device) + discount_x[0] = 1.0 filter = torch.cumprod(discount_x, dim=0) - returns = F.conv1d(x, filter, stride=1) - assert returns.shape == (len(x), ) - from garage.np import discount_cumsum as np_discout_cumsum - import numpy as np - expected = np_discout_cumsum(torch_to_np(x), discount) - assert np.array_equal(expected, torch_to_np(returns)) + pad = len(x) - 1 + # minibatch of 1, with 1 channel + filter = filter.reshape(1, 1, -1) + returns = F.conv1d(x.reshape(1, 1, -1), filter, stride=1, padding=pad) + returns = returns[0, 0, pad:] return returns @@ -372,6 +372,19 @@ def product_of_gaussians(mus, sigmas_squared): return mu, sigma_squared +def as_tensor(data): + """Convert a list to a PyTorch tensor + + Args: + data (list): Data to convert to tensor + + Returns: + torch.Tensor: A float tensor + + """ + return torch.as_tensor(data, dtype=torch.float32, device=global_device()) + + # pylint: disable=W0223 class NonLinearity(nn.Module): """Wrapper class for non linear function or module. 
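The rewritten `discount_cumsum` above builds a `[1, d, d^2, ...]` kernel and applies a left-padded 1-D convolution. As a sanity check of that approach, here is a self-contained sketch (not the patched garage function; the helper names are illustrative) that reproduces the same reshaping to a `(1, 1, T)` minibatch and compares the result against the direct recursive definition:

```python
import torch
import torch.nn.functional as F


def discount_cumsum_conv(x, discount):
    """Return y with y[i] = sum over j >= i of discount**(j - i) * x[j]."""
    weights = torch.full((len(x), ), discount, dtype=x.dtype)
    weights[0] = 1.0
    kernel = torch.cumprod(weights, dim=0)  # [1, d, d**2, ...]
    pad = len(x) - 1
    # minibatch of 1, with 1 channel, as in the patched function
    kernel = kernel.reshape(1, 1, -1)
    out = F.conv1d(x.reshape(1, 1, -1), kernel, stride=1, padding=pad)
    return out[0, 0, pad:]  # drop the acausal left half


def discount_cumsum_reference(x, discount):
    """Direct recursion, used only to check the convolution."""
    out = torch.zeros_like(x)
    running = torch.zeros((), dtype=x.dtype)
    for i in reversed(range(len(x))):
        running = x[i] + discount * running
        out[i] = running
    return out


x = torch.tensor([1.0, 2.0, 3.0, 4.0])
assert torch.allclose(discount_cumsum_conv(x, 0.99),
                      discount_cumsum_reference(x, 0.99))
```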
diff --git a/src/garage/torch/algos/maml_ppo.py b/src/garage/torch/algos/maml_ppo.py index 93e4e76145..b7627247fa 100644 --- a/src/garage/torch/algos/maml_ppo.py +++ b/src/garage/torch/algos/maml_ppo.py @@ -4,7 +4,7 @@ from garage import _Default from garage.torch.algos import PPO from garage.torch.algos.maml import MAML -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class MAMLPPO(MAML): @@ -70,10 +70,10 @@ def __init__(self, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = PPO(env.spec, policy, diff --git a/src/garage/torch/algos/maml_trpo.py b/src/garage/torch/algos/maml_trpo.py index b4236b4cba..f19a29a817 100644 --- a/src/garage/torch/algos/maml_trpo.py +++ b/src/garage/torch/algos/maml_trpo.py @@ -5,7 +5,7 @@ from garage.torch.algos import VPG from garage.torch.algos.maml import MAML from garage.torch.optimizers import (ConjugateGradientOptimizer, - OptimizerWrapper) + MinibatchOptimizer) class MAMLTRPO(MAML): @@ -71,10 +71,10 @@ def __init__(self, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = VPG(env.spec, policy, diff --git a/src/garage/torch/algos/maml_vpg.py b/src/garage/torch/algos/maml_vpg.py index cf32d8e6d5..ffb61a2e0f 100644 --- a/src/garage/torch/algos/maml_vpg.py +++ b/src/garage/torch/algos/maml_vpg.py @@ -4,7 +4,7 @@ from garage import _Default from garage.torch.algos import VPG from garage.torch.algos.maml import MAML -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class MAMLVPG(MAML): @@ -66,10 +66,10 @@ def __init__(self, num_grad_updates=1, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = VPG(env.spec, policy, diff --git a/src/garage/torch/algos/ppo.py b/src/garage/torch/algos/ppo.py index 73668ac810..f3fc115daa 100644 --- a/src/garage/torch/algos/ppo.py +++ b/src/garage/torch/algos/ppo.py @@ -2,7 +2,7 @@ import torch from garage.torch.algos import VPG -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class PPO(VPG): @@ -14,9 +14,9 @@ class PPO(VPG): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. 
lr_clip_range (float): The limit on the likelihood ratio between policies. @@ -63,13 +63,13 @@ def __init__(self, entropy_method='no_entropy'): if policy_optimizer is None: - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), policy, max_optimization_epochs=10, minibatch_size=64) if vf_optimizer is None: - vf_optimizer = OptimizerWrapper( + vf_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, diff --git a/src/garage/torch/algos/trpo.py b/src/garage/torch/algos/trpo.py index ba17af59b4..1e30f9ee25 100644 --- a/src/garage/torch/algos/trpo.py +++ b/src/garage/torch/algos/trpo.py @@ -3,7 +3,7 @@ from garage.torch.algos import VPG from garage.torch.optimizers import (ConjugateGradientOptimizer, - OptimizerWrapper) + MinibatchOptimizer) class TRPO(VPG): @@ -15,9 +15,9 @@ class TRPO(VPG): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. num_train_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. @@ -61,11 +61,11 @@ def __init__(self, entropy_method='no_entropy'): if policy_optimizer is None: - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (ConjugateGradientOptimizer, dict(max_constraint_value=0.01)), policy) if vf_optimizer is None: - vf_optimizer = OptimizerWrapper( + vf_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, @@ -116,7 +116,8 @@ def _compute_objective(self, advantages, obs, actions, rewards): return surrogate - def _train_policy(self, obs, actions, rewards, advantages): + def _train_policy(self, observations, actions, rewards, advantages, + lengths): r"""Train the policy. Args: @@ -128,17 +129,19 @@ def _train_policy(self, obs, actions, rewards, advantages): with shape :math:`(N, )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of policy loss (float). 
""" - self._policy_optimizer.zero_grad() - loss = self._compute_loss_with_adv(obs, actions, rewards, advantages) - loss.backward() - self._policy_optimizer.step( - f_loss=lambda: self._compute_loss_with_adv(obs, actions, rewards, - advantages), - f_constraint=lambda: self._compute_kl_constraint(obs)) - - return loss + data = { + 'observations': observations, + 'actions': actions, + 'rewards': rewards, + 'advantages': advantages, + 'lengths': lengths + } + f_constraint = lambda: self._compute_kl_constraint(observations) + return self._policy_optimizer.step(data, self._loss_function, + f_constraint) diff --git a/src/garage/torch/algos/vpg.py b/src/garage/torch/algos/vpg.py index 5903bb88cd..3ad212bda6 100644 --- a/src/garage/torch/algos/vpg.py +++ b/src/garage/torch/algos/vpg.py @@ -10,11 +10,11 @@ from garage import log_performance from garage.np import discount_cumsum from garage.np.algos import RLAlgorithm -from garage.torch import (compute_advantages, filter_valids, ObservationBatch, - ObservationOrder) +from garage.torch import (as_tensor, compute_advantages, filter_valids, + global_device, ObservationBatch, ObservationOrder) from garage.torch._functions import (discount_cumsum, pad_packed_tensor, split_packed_tensor) -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class VPG(RLAlgorithm): @@ -28,9 +28,9 @@ class VPG(RLAlgorithm): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. steps_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. @@ -44,6 +44,8 @@ class VPG(RLAlgorithm): standardized before shifting. policy_ent_coeff (float): The coefficient of the policy entropy. Setting it to zero would mean no entropy regularization. + use_neg_logli_entropy (bool): Whether to estimate the entropy as the + negative log likelihood of the action. use_softplus_entropy (bool): Whether to estimate the softmax distribution of the entropy to prevent the entropy from being negative. 
@@ -70,8 +72,9 @@ def __init__( center_adv=True, positive_adv=False, policy_ent_coeff=0.0, + use_neg_logli_entropy=True, use_softplus_entropy=False, - stop_entropy_gradient=False, + stop_entropy_gradient=True, entropy_method='no_entropy', recurrent=None, ): @@ -86,6 +89,7 @@ def __init__( self._policy_ent_coeff = policy_ent_coeff self._use_softplus_entropy = use_softplus_entropy self._stop_entropy_gradient = stop_entropy_gradient + self._use_neg_logli_entropy = use_neg_logli_entropy self._entropy_method = entropy_method self._steps_per_epoch = steps_per_epoch self._env_spec = env_spec @@ -96,17 +100,18 @@ def __init__( stop_entropy_gradient, policy_ent_coeff) self._episode_reward_mean = collections.deque(maxlen=100) - self._sampler = sampler + self.sampler = sampler if policy_optimizer: self._policy_optimizer = policy_optimizer else: - self._policy_optimizer = OptimizerWrapper(torch.optim.Adam, policy) + self._policy_optimizer = MinibatchOptimizer( + torch.optim.Adam, policy) if vf_optimizer: self._vf_optimizer = vf_optimizer else: - self._vf_optimizer = OptimizerWrapper(torch.optim.Adam, - value_function) + self._vf_optimizer = MinibatchOptimizer(torch.optim.Adam, + value_function) self._old_policy = copy.deepcopy(self.policy) self._recurrent = recurrent @@ -146,8 +151,8 @@ def _train_once(self, eps): """ # Conver to torch and compute returns, etc. - lengths = eps.lengths - obs = ObservationBatch(eps.observations, + lengths = torch.from_numpy(eps.lengths).to(global_device()) + obs = ObservationBatch(as_tensor(eps.observations), order=ObservationOrder.EPISODES, lengths=lengths) actions = torch.Tensor(eps.actions) @@ -165,9 +170,9 @@ def _train_once(self, eps): # Log before training with torch.no_grad(): - policy_loss_before = self._compute_loss_with_adv( - obs, actions, rewards, advantages) - vf_loss_before = self._value_function.compute_loss(obs, returns) + policy_loss_before = self._loss_function(obs, actions, rewards, + advantages, lengths) + vf_loss_before = self._value_function.loss_function(obs, returns) with tabular.prefix(self.policy.name): tabular.record('/LossBefore', policy_loss_before.item()) @@ -181,14 +186,14 @@ def _train_once(self, eps): # Save the current policy state and train self._old_policy.load_state_dict(self.policy.state_dict()) - self._train_policy(obs, actions, rewards, returns, advantages) - self._train_vf(obs, actions, rewards, returns, advantages) + self._train_policy(obs, actions, rewards, returns, advantages, lengths) + self._train_value_function(obs, returns, lengths) # Log after training with torch.no_grad(): - policy_loss_after = self._compute_loss_with_adv( - obs, actions, rewards, advantages) + policy_loss_after = self._loss_function(obs, actions, rewards, + advantages, lengths) with tabular.prefix(self.policy.name): tabular.record('/LossAfter', policy_loss_after.item()) tabular.record('/dLoss', @@ -198,10 +203,9 @@ def _train_once(self, eps): '/EntropyAfter', self._compute_policy_entropy(obs, actions).mean().item()) + vf_loss_after = self._value_function.loss_function(obs, returns) with tabular.prefix(self._value_function.name): - tabular.record( - '/LossAfter', - self._value_function.compute_loss(obs, returns).item()) + tabular.record('/vfLossAfter', vf_loss_after.item()) tabular.record('/dLoss', vf_loss_before.item() - vf_loss_after.item()) @@ -221,7 +225,7 @@ def train(self, trainer): for epoch in trainer.step_epochs(): for _ in range(self._steps_per_epoch): - trainer.step_path = self._sampler.obtain_episodes(epoch) + trainer.step_path = 
trainer.obtain_episodes(epoch) self._train_once(trainer.step_path) last_return = np.mean( log_performance(epoch, @@ -229,31 +233,12 @@ def train(self, trainer): discount=self._discount)) return last_return - def _train(self, obs, actions, rewards, returns, advs): - r"""Train the policy and value function with minibatch. - - Args: - obs (torch.Tensor): Observation from the environment with shape - :math:`(N, O*)`. - actions (torch.Tensor): Actions fed to the environment with shape - :math:`(N, A*)`. - rewards (torch.Tensor): Acquired rewards with shape :math:`(N, )`. - returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. - advs (torch.Tensor): Advantage value at each step with shape - :math:`(N, )`. - - """ - for dataset in self._policy_optimizer.get_minibatch( - obs, actions, rewards, advs): - self._train_policy(*dataset) - for dataset in self._vf_optimizer.get_minibatch(obs, returns): - self._train_value_function(*dataset) - - def _train_policy(self, obs, actions, rewards, returns, advantages): + def _train_policy(self, observations, actions, rewards, returns, + advantages, lengths): r"""Train the policy. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N, A*)`. @@ -262,42 +247,49 @@ def _train_policy(self, obs, actions, rewards, returns, advantages): returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of policy loss (float). """ - self._policy_optimizer.zero_grad() - loss = self._compute_loss_with_adv(obs, actions, rewards, advantages) - loss.backward() - self._policy_optimizer.step() - - return loss - - def _train_value_function(self, obs, returns): + data = { + 'observations': observations, + 'actions': actions, + 'rewards': rewards, + 'advantages': advantages, + 'lengths': lengths + } + return self._policy_optimizer.step(data, self._loss_function) + + def _train_value_function(self, observations, returns, lengths): r"""Train the value function. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of value function loss (float). """ - self._vf_optimizer.zero_grad() - loss = self._value_function.compute_loss(obs, returns) - loss.backward() - self._vf_optimizer.step() - - return loss + data = { + 'observations': observations, + 'returns': returns, + 'lengths': lengths + } + return self._vf_optimizer.step(data, + self._value_function.loss_function) def _compute_loss(self, obs, actions, rewards, lengths, baselines): r"""Compute mean value of loss. + Note that this function is private, but used by MAML. 
+ Notes: P is the maximum episode length (self.max_episode_length) Args: @@ -321,14 +313,19 @@ rewards_flat = torch.cat(filter_valids(rewards, lengths)) advantages_flat = self._compute_advantage(rewards, lengths, baselines) - return self._compute_loss_with_adv(obs_flat, actions_flat, - rewards_flat, advantages_flat) + return self._loss_function(obs_flat, actions_flat, rewards_flat, + advantages_flat, lengths) - def _compute_loss_with_adv(self, obs, actions, rewards, advantages): + def _loss_function(self, + observations, + actions, + rewards, + advantages, + lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N \dot [T], A*)`. @@ -336,15 +333,19 @@ with shape :math:`(N \dot [T], )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N \dot [T], )`. + lengths (torch.Tensor or None): Lengths of episodes, if operating + on full episodes. Returns: torch.Tensor: Calculated negative mean scalar value of objective. """ - objectives = self._compute_objective(advantages, obs, actions, rewards) + objectives = self._compute_objective(advantages, observations, actions, + rewards) if self._entropy_regularzied: - policy_entropies = self._compute_policy_entropy(obs) + policy_entropies = self._compute_policy_entropy( + observations, actions) objectives += self._policy_ent_coeff * policy_entropies return -objectives.mean() @@ -371,7 +372,7 @@ self.max_episode_length, padded_baselines, padded_rewards) - advantages = torch.cat(filter_valids(advantages, lengths)) + advantages = torch.cat(filter_valids(padded_advantages, lengths)) if self._center_adv: means = advantages.mean() @@ -410,25 +411,27 @@ return kl_constraint.mean() - def _compute_policy_entropy(self, obs): + def _compute_policy_entropy(self, obs, actions): r"""Compute entropy value of probability distribution. Notes: P is the maximum episode length (self.max_episode_length) Args: obs (torch.Tensor): Observation from the environment with shape :math:`(N, P, O*)`. + actions (torch.Tensor): Actions fed to the environment + with shape :math:`(N \dot [T], A*)`. Returns: torch.Tensor: Calculated entropy values given observation with shape :math:`(N, P)`.
""" - if self._stop_entropy_gradient: - with torch.no_grad(): + with torch.set_grad_enabled(not self._stop_entropy_gradient): + if self._use_neg_logli_entropy: + policy_entropy = -self.policy(obs)[0].log_prob(actions) + else: policy_entropy = self.policy(obs)[0].entropy() - else: - policy_entropy = self.policy(obs)[0].entropy() # This prevents entropy from becoming negative for small policy std if self._use_softplus_entropy: diff --git a/src/garage/torch/optimizers/__init__.py b/src/garage/torch/optimizers/__init__.py index cba847617d..bf9dec1f12 100644 --- a/src/garage/torch/optimizers/__init__.py +++ b/src/garage/torch/optimizers/__init__.py @@ -2,10 +2,9 @@ from garage.torch.optimizers.conjugate_gradient_optimizer import ( ConjugateGradientOptimizer) from garage.torch.optimizers.differentiable_sgd import DifferentiableSGD +from garage.torch.optimizers.episode_batch_optimizer import ( + EpisodeBatchOptimizer) +from garage.torch.optimizers.minibatch_optimizer import MinibatchOptimizer from garage.torch.optimizers.optimizer import Optimizer -from garage.torch.optimizers.optimizer_wrapper import OptimizerWrapper -__all__ = [ - 'Optimizer', 'OptimizerWrapper', 'ConjugateGradientOptimizer', - 'DifferentiableSGD' -] +__all__ = ['Optimizer', 'ConjugateGradientOptimizer', 'DifferentiableSGD'] diff --git a/src/garage/torch/optimizers/conjugate_gradient_optimizer.py b/src/garage/torch/optimizers/conjugate_gradient_optimizer.py index 489587e672..0157d2c908 100644 --- a/src/garage/torch/optimizers/conjugate_gradient_optimizer.py +++ b/src/garage/torch/optimizers/conjugate_gradient_optimizer.py @@ -138,12 +138,12 @@ def __init__(self, self._hvp_reg_coeff = hvp_reg_coeff self._accept_violation = accept_violation - def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ + def step(self, loss_function, constraint_function): # pylint: disable=arguments-differ """Take an optimization step. Args: - f_loss (callable): Function to compute the loss. - f_constraint (callable): Function to compute the constraint value. + loss_function (callable): Function to compute the loss. + constraint_function (callable): Function to compute the constraint value. """ # Collect trainable parameters and gradients @@ -157,7 +157,7 @@ def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ flat_loss_grads = torch.cat(grads) # Build Hessian-vector-product function - f_Ax = _build_hessian_vector_product(f_constraint, params, + f_Ax = _build_hessian_vector_product(constraint_function, params, self._hvp_reg_coeff) # Compute step direction @@ -177,8 +177,8 @@ def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ descent_step = step_size * step_dir # Update parameters using backtracking line search - self._backtracking_line_search(params, descent_step, f_loss, - f_constraint) + self._backtracking_line_search(params, descent_step, loss_function, + constraint_function) @property def state(self): diff --git a/src/garage/torch/optimizers/optimizer_wrapper.py b/src/garage/torch/optimizers/optimizer_wrapper.py deleted file mode 100644 index 9f69ce565d..0000000000 --- a/src/garage/torch/optimizers/optimizer_wrapper.py +++ /dev/null @@ -1,63 +0,0 @@ -"""A PyTorch optimizer wrapper that compute loss and optimize module.""" -from garage import make_optimizer -from garage.np.optimizers import BatchDataset - - -class OptimizerWrapper: - """A wrapper class to handle torch.optim.optimizer. - - Args: - optimizer (Union[type, tuple[type, dict]]): Type of optimizer - for policy. 
This can be an optimizer type such as - `torch.optim.Adam` or a tuple of type and dictionary, where - dictionary contains arguments to initialize the optimizer. - e.g. `(torch.optim.Adam, {'lr' : 1e-3})` - Sample strategy to be used when sampling a new task. - module (torch.nn.Module): Module to be optimized. - max_optimization_epochs (int): Maximum number of epochs for update. - minibatch_size (int): Batch size for optimization. - - """ - - def __init__(self, - optimizer, - module, - max_optimization_epochs=1, - minibatch_size=None): - self._optimizer = make_optimizer(optimizer, module=module) - self._max_optimization_epochs = max_optimization_epochs - self._minibatch_size = minibatch_size - - def get_minibatch(self, *inputs): - r"""Yields a batch of inputs. - - Notes: P is the size of minibatch (self._minibatch_size) - - Args: - *inputs (list[torch.Tensor]): A list of inputs. Each input has - shape :math:`(N \dot [T], *)`. - - Yields: - list[torch.Tensor]: A list batch of inputs. Each batch has shape - :math:`(P, *)`. - - """ - batch_dataset = BatchDataset(inputs, self._minibatch_size) - - for _ in range(self._max_optimization_epochs): - for dataset in batch_dataset.iterate(): - yield dataset - - def zero_grad(self): - r"""Clears the gradients of all optimized :class:`torch.Tensor` s.""" - self._optimizer.zero_grad() - - def step(self, **closure): - """Performs a single optimization step. - - Arguments: - **closure (callable, optional): A closure that reevaluates the - model and returns the loss. - - """ - self._optimizer.step(**closure) diff --git a/src/garage/torch/value_functions/gaussian_mlp_value_function.py b/src/garage/torch/value_functions/gaussian_mlp_value_function.py index d340fef2a9..7f0670841c 100644 --- a/src/garage/torch/value_functions/gaussian_mlp_value_function.py +++ b/src/garage/torch/value_functions/gaussian_mlp_value_function.py @@ -78,11 +78,11 @@ def __init__(self, std_parameterization='exp', layer_normalization=layer_normalization) - def compute_loss(self, obs, returns): + def loss_function(self, observations, returns, lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. @@ -91,7 +91,7 @@ def compute_loss(self, obs, returns): objective (float). """ - dist = self.module(obs) + dist = self.module(observations) ll = dist.log_prob(returns.reshape(-1, 1)) loss = -ll.mean() return loss diff --git a/src/garage/torch/value_functions/value_function.py b/src/garage/torch/value_functions/value_function.py index 1cc533f33d..7aa54e7968 100644 --- a/src/garage/torch/value_functions/value_function.py +++ b/src/garage/torch/value_functions/value_function.py @@ -20,11 +20,11 @@ def __init__(self, env_spec, name): self.name = name @abc.abstractmethod - def compute_loss(self, obs, returns): + def loss_function(self, observations, returns, lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. 
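The algorithm call sites above now hand `MinibatchOptimizer.step()` a dict of tensors plus a loss callable, but `minibatch_optimizer.py` itself is not included in this diff. The sketch below is a hypothetical reconstruction of that interface, inferred only from the call pattern in `vpg.py` and `trpo.py`: the class name, the handling of the `(type, kwargs)` tuple, and the keyword-argument dispatch are assumptions, the per-episode `lengths` entry is sliced naively, and the constraint callable used by the TRPO policy optimizer is ignored.

```python
class MinibatchOptimizerSketch:
    """Illustrative stand-in for the step() interface used above (assumed)."""

    def __init__(self, optimizer, module, max_optimization_epochs=1,
                 minibatch_size=None):
        # The real class presumably goes through garage.make_optimizer; this
        # sketch only handles the (type, kwargs) tuple and bare type forms.
        if isinstance(optimizer, tuple):
            opt_type, opt_kwargs = optimizer
        else:
            opt_type, opt_kwargs = optimizer, {}
        self._optimizer = opt_type(module.parameters(), **opt_kwargs)
        self._max_optimization_epochs = max_optimization_epochs
        self._minibatch_size = minibatch_size

    def step(self, data, loss_function, constraint_function=None):
        """Minimize loss_function over minibatches drawn from data.

        Assumes data is a dict of tensors whose keys match the keyword
        arguments of loss_function, as at the call sites above.
        """
        del constraint_function  # constrained case not sketched here
        batch_size = len(next(iter(data.values())))
        minibatch_size = self._minibatch_size or batch_size
        loss = None
        for _ in range(self._max_optimization_epochs):
            for start in range(0, batch_size, minibatch_size):
                sl = slice(start, start + minibatch_size)
                minibatch = {key: value[sl] for key, value in data.items()}
                self._optimizer.zero_grad()
                loss = loss_function(**minibatch)
                loss.backward()
                self._optimizer.step()
        return loss
```

Under these assumptions, a call like `vf_optimizer.step({'observations': obs, 'returns': returns, 'lengths': lengths}, value_function.loss_function)` runs `max_optimization_epochs` passes of the wrapped optimizer over the batch.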
diff --git a/src/garage/trainer.py b/src/garage/trainer.py index 5cdcc79d64..05657406f5 100644 --- a/src/garage/trainer.py +++ b/src/garage/trainer.py @@ -157,7 +157,7 @@ def setup(self, algo, env): self._env = env self._seed = get_seed() - self._sampler = self._algo.sampler + self._sampler = getattr(self._algo, 'sampler', None) self._has_setup = True diff --git a/tests/garage/torch/test_functions.py b/tests/garage/torch/test_functions.py index 8e12f31fad..11db456706 100644 --- a/tests/garage/torch/test_functions.py +++ b/tests/garage/torch/test_functions.py @@ -5,15 +5,11 @@ import torch import torch.nn.functional as F -from garage.torch import (compute_advantages, - dict_np_to_torch, - flatten_to_single_vector, - global_device, - pad_to_last, - product_of_gaussians, - set_gpu_mode, - torch_to_np, - TransposeImage) +from garage.np import discount_cumsum as np_discount_cumsum +from garage.torch import (compute_advantages, dict_np_to_torch, + discount_cumsum, flatten_to_single_vector, + global_device, pad_to_last, product_of_gaussians, + set_gpu_mode, torch_to_np, TransposeImage) import garage.torch._functions as tu from tests.fixtures import TfGraphTestCase @@ -96,6 +92,15 @@ def test_transpose_image(): assert (original_env.observation_space.shape[2] == transposed_env.observation_space.shape[0]) +def test_discount_cumsum(): + discount = 0.99 + x = torch.tensor([9.3217, 9.3003, 9.3406, 9.2251, 9.0715, 9.0134, 8.9026, + 8.6619]) + returns = discount_cumsum(x, discount) + expected = np_discount_cumsum(torch_to_np(x), discount) + assert returns.shape == (len(x), ) + assert np.allclose(expected, torch_to_np(returns)) class TestTorchAlgoUtils(TfGraphTestCase): """Test class for torch algo utility functions."""
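For reference, the `_train_once` hunk in `vpg.py` above now moves episode data onto the global torch device with `as_tensor` before wrapping it in an `ObservationBatch`. A minimal sketch of that conversion, assuming garage with this patch applied and using stand-in arrays rather than a real `EpisodeBatch`:

```python
import numpy as np
import torch

from garage.torch import (as_tensor, global_device, ObservationBatch,
                          ObservationOrder)

# Two fake episodes of lengths 3 and 2, flattened into 5 timesteps of
# 4-dimensional observations.
lengths_np = np.array([3, 2])
observations_np = np.random.randn(5, 4).astype(np.float32)

lengths = torch.from_numpy(lengths_np).to(global_device())
obs = ObservationBatch(as_tensor(observations_np),
                       order=ObservationOrder.EPISODES,
                       lengths=lengths)
# ObservationBatch validates that the flattened batch size matches
# sum(lengths) (see the _dtypes.py hunk above).
assert obs.shape[0] == int(lengths.sum())
```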