From df3a137c6bcceb68621b66bc10624c928fd8ae7a Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" Date: Wed, 16 Dec 2020 15:50:51 -0800 Subject: [PATCH] Torch VPG rework --- .../experiments/algos/ppo_garage_pytorch.py | 18 +- src/garage/torch/__init__.py | 6 +- src/garage/torch/_dtypes.py | 4 +- src/garage/torch/_functions.py | 55 ++++ src/garage/torch/algos/maml_ppo.py | 8 +- src/garage/torch/algos/maml_trpo.py | 8 +- src/garage/torch/algos/maml_vpg.py | 8 +- src/garage/torch/algos/ppo.py | 10 +- src/garage/torch/algos/td3.py | 2 +- src/garage/torch/algos/trpo.py | 34 +- src/garage/torch/algos/vpg.py | 309 +++++++++--------- src/garage/torch/optimizers/__init__.py | 9 +- .../conjugate_gradient_optimizer.py | 12 +- .../torch/optimizers/optimizer_wrapper.py | 63 ---- .../gaussian_mlp_value_function.py | 6 +- .../torch/value_functions/value_function.py | 4 +- tests/garage/torch/test_functions.py | 10 + 17 files changed, 289 insertions(+), 277 deletions(-) delete mode 100644 src/garage/torch/optimizers/optimizer_wrapper.py diff --git a/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py b/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py index dc205c3562..666afa1732 100644 --- a/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py +++ b/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py @@ -6,7 +6,7 @@ from garage.experiment import deterministic from garage.sampler import RaySampler from garage.torch.algos import PPO as PyTorch_PPO -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer from garage.torch.policies import GaussianMLPPolicy as PyTorch_GMP from garage.torch.value_functions import GaussianMLPValueFunction from garage.trainer import Trainer @@ -45,15 +45,15 @@ def ppo_garage_pytorch(ctxt, env_id, seed): hidden_nonlinearity=torch.tanh, output_nonlinearity=None) - policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), - policy, - max_optimization_epochs=10, - minibatch_size=64) + policy_optimizer = MinibatchOptimizer((torch.optim.Adam, dict(lr=2.5e-4)), + policy, + max_optimization_epochs=10, + minibatch_size=64) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), - value_function, - max_optimization_epochs=10, - minibatch_size=64) + vf_optimizer = MinibatchOptimizer((torch.optim.Adam, dict(lr=2.5e-4)), + value_function, + max_optimization_epochs=10, + minibatch_size=64) sampler = RaySampler(agents=policy, envs=env, diff --git a/src/garage/torch/__init__.py b/src/garage/torch/__init__.py index 71b119b7dd..8ac1bf783e 100644 --- a/src/garage/torch/__init__.py +++ b/src/garage/torch/__init__.py @@ -3,8 +3,9 @@ from garage.torch._dtypes import (ObservationBatch, ObservationOrder, ShuffledOptimizationNotSupported, observation_batch_to_packed_sequence) -from garage.torch._functions import (as_torch_dict, compute_advantages, - expand_var, filter_valids, flatten_batch, +from garage.torch._functions import (as_tensor, as_torch_dict, + compute_advantages, expand_var, + filter_valids, flatten_batch, flatten_to_single_vector, global_device, NonLinearity, np_to_torch, output_height_2d, output_width_2d, @@ -18,6 +19,7 @@ __all__ = [ 'NonLinearity', 'as_torch_dict', + 'as_tensor', 'compute_advantages', 'expand_var', 'filter_valids', diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py index f0be030ce4..3421bf8df3 100644 --- a/src/garage/torch/_dtypes.py +++ b/src/garage/torch/_dtypes.py @@ -79,12 +79,12 @@ def 
__new__(cls, observations, order, lengths=None): f'lengths has dtype {self.lengths.dtype}, but must have ' f'an integer dtype') total_size = sum(self.lengths) - if self.observations.shape[0] != total_size: + if self.shape[0] != total_size: raise ValueError( f'observations has batch size ' f'{self.observations.shape[0]}, but must have batch ' f'size {total_size} to match lengths') - assert self.observations.shape[0] == total_size + assert self.shape[0] == total_size elif self.lengths is not None: raise ValueError( f'lengths has value {self.lengths}, but must be None ' diff --git a/src/garage/torch/_functions.py b/src/garage/torch/_functions.py index 2c834ebfa6..1319c20da0 100644 --- a/src/garage/torch/_functions.py +++ b/src/garage/torch/_functions.py @@ -106,6 +106,48 @@ def compute_advantages(discount, gae_lambda, max_episode_length, baselines, return advantages +def discount_cumsum(x, discount): + discount_x = torch.full((len(x), ), + discount, + dtype=torch.float, + device=x.device) + discount_x[0] = 1.0 + filter = torch.cumprod(discount_x, dim=0) + pad = len(x) - 1 + # minibatch of 1, with 1 channel + filter = filter.reshape(1, 1, -1) + returns = F.conv1d(x.reshape(1, 1, -1), filter, stride=1, padding=pad) + returns = returns[0, 0, pad:] + return returns + + +def split_packed_tensor(t, lengths): + """Split a tensor using a sequence of (start, stop) tuples.""" + start = 0 + for length in lengths: + stop = start + length + yield t[start:stop] + start = stop + + +def pad_packed_tensor(t, lengths, max_length=None): + if max_length is None: + max_length = max(lengths) + if max(lengths) > max_length: + raise ValueError(f'packed tensor contains a sequence of length ' + f'{max(lengths)}, but was asked to pad to ' + f'length {max_length}') + out = torch.zeros(( + len(lengths), + max_length, + ) + t.shape[1:], + dtype=t.dtype, + device=t.device) + for i, seq in enumerate(split_packed_tensor(t, lengths)): + out[i][:len(seq)] = seq + return out + + def pad_to_last(nums, total_length, axis=-1, val=0): """Pad val to last in nums in given axis. @@ -383,6 +425,19 @@ def state_dict_to(state_dict, device): return state_dict +def as_tensor(data): + """Convert a list to a PyTorch tensor + + Args: + data (list): Data to convert to tensor + + Returns: + torch.Tensor: A float tensor + + """ + return torch.as_tensor(data, dtype=torch.float32, device=global_device()) + + # pylint: disable=W0223 class NonLinearity(nn.Module): """Wrapper class for non linear function or module. 
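The torch discount_cumsum helper added above computes per-episode discounted returns with a single 1-D convolution: the kernel holds the powers of the discount factor, and "full" padding of len(x) - 1 makes output position i equal to sum_k discount**k * x[i + k]; since the kernel is built on x.device, the computation stays on the GPU when the rewards do. The following standalone sketch (not part of the patch) mirrors that trick and checks it against a plain reverse-order running sum:

import torch
import torch.nn.functional as F


def conv_discount_cumsum(x, discount):
    # kernel[k] = discount**k, so with "full" padding the output at
    # position i is sum_k discount**k * x[i + k].
    kernel = torch.full((len(x), ), discount, dtype=torch.float)
    kernel[0] = 1.0
    kernel = torch.cumprod(kernel, dim=0).reshape(1, 1, -1)
    pad = len(x) - 1
    out = F.conv1d(x.reshape(1, 1, -1), kernel, stride=1, padding=pad)
    return out[0, 0, pad:]


def loop_discount_cumsum(x, discount):
    # Reference implementation: reverse-order running sum.
    out = torch.zeros_like(x)
    running = torch.zeros(())
    for i in reversed(range(len(x))):
        running = x[i] + discount * running
        out[i] = running
    return out


rewards = torch.tensor([1.0, 0.0, 0.0, 2.0])
assert torch.allclose(conv_discount_cumsum(rewards, 0.99),
                      loop_discount_cumsum(rewards, 0.99))
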
diff --git a/src/garage/torch/algos/maml_ppo.py b/src/garage/torch/algos/maml_ppo.py index 93e4e76145..b7627247fa 100644 --- a/src/garage/torch/algos/maml_ppo.py +++ b/src/garage/torch/algos/maml_ppo.py @@ -4,7 +4,7 @@ from garage import _Default from garage.torch.algos import PPO from garage.torch.algos.maml import MAML -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class MAMLPPO(MAML): @@ -70,10 +70,10 @@ def __init__(self, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = PPO(env.spec, policy, diff --git a/src/garage/torch/algos/maml_trpo.py b/src/garage/torch/algos/maml_trpo.py index b4236b4cba..f19a29a817 100644 --- a/src/garage/torch/algos/maml_trpo.py +++ b/src/garage/torch/algos/maml_trpo.py @@ -5,7 +5,7 @@ from garage.torch.algos import VPG from garage.torch.algos.maml import MAML from garage.torch.optimizers import (ConjugateGradientOptimizer, - OptimizerWrapper) + MinibatchOptimizer) class MAMLTRPO(MAML): @@ -71,10 +71,10 @@ def __init__(self, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = VPG(env.spec, policy, diff --git a/src/garage/torch/algos/maml_vpg.py b/src/garage/torch/algos/maml_vpg.py index cf32d8e6d5..ffb61a2e0f 100644 --- a/src/garage/torch/algos/maml_vpg.py +++ b/src/garage/torch/algos/maml_vpg.py @@ -4,7 +4,7 @@ from garage import _Default from garage.torch.algos import VPG from garage.torch.algos.maml import MAML -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class MAMLVPG(MAML): @@ -66,10 +66,10 @@ def __init__(self, num_grad_updates=1, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = VPG(env.spec, policy, diff --git a/src/garage/torch/algos/ppo.py b/src/garage/torch/algos/ppo.py index 73668ac810..f3fc115daa 100644 --- a/src/garage/torch/algos/ppo.py +++ b/src/garage/torch/algos/ppo.py @@ -2,7 +2,7 @@ import torch from garage.torch.algos import VPG -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class PPO(VPG): @@ -14,9 +14,9 @@ class PPO(VPG): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. 
lr_clip_range (float): The limit on the likelihood ratio between policies. @@ -63,13 +63,13 @@ def __init__(self, entropy_method='no_entropy'): if policy_optimizer is None: - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), policy, max_optimization_epochs=10, minibatch_size=64) if vf_optimizer is None: - vf_optimizer = OptimizerWrapper( + vf_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, diff --git a/src/garage/torch/algos/td3.py b/src/garage/torch/algos/td3.py index edb10083ff..ca833f41bb 100644 --- a/src/garage/torch/algos/td3.py +++ b/src/garage/torch/algos/td3.py @@ -84,9 +84,9 @@ def __init__( replay_buffer, sampler, *, # Everything after this is numbers. - max_episode_length_eval=None, grad_steps_per_env_step, exploration_policy, + max_episode_length_eval=None, uniform_random_policy=None, max_action=None, target_update_tau=0.005, diff --git a/src/garage/torch/algos/trpo.py b/src/garage/torch/algos/trpo.py index c2becfc1c9..a9fe5939aa 100644 --- a/src/garage/torch/algos/trpo.py +++ b/src/garage/torch/algos/trpo.py @@ -4,7 +4,7 @@ from garage.torch._functions import zero_optim_grads from garage.torch.algos import VPG from garage.torch.optimizers import (ConjugateGradientOptimizer, - OptimizerWrapper) + MinibatchOptimizer) class TRPO(VPG): @@ -16,9 +16,9 @@ class TRPO(VPG): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. num_train_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. @@ -62,11 +62,11 @@ def __init__(self, entropy_method='no_entropy'): if policy_optimizer is None: - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (ConjugateGradientOptimizer, dict(max_constraint_value=0.01)), policy) if vf_optimizer is None: - vf_optimizer = OptimizerWrapper( + vf_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, @@ -117,7 +117,8 @@ def _compute_objective(self, advantages, obs, actions, rewards): return surrogate - def _train_policy(self, obs, actions, rewards, advantages): + def _train_policy(self, observations, actions, rewards, advantages, + lengths): r"""Train the policy. Args: @@ -129,18 +130,19 @@ def _train_policy(self, obs, actions, rewards, advantages): with shape :math:`(N, )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of policy loss (float). 
""" - # pylint: disable=protected-access - zero_optim_grads(self._policy_optimizer._optimizer) - loss = self._compute_loss_with_adv(obs, actions, rewards, advantages) - loss.backward() - self._policy_optimizer.step( - f_loss=lambda: self._compute_loss_with_adv(obs, actions, rewards, - advantages), - f_constraint=lambda: self._compute_kl_constraint(obs)) - - return loss + data = { + 'observations': observations, + 'actions': actions, + 'rewards': rewards, + 'advantages': advantages, + 'lengths': lengths + } + f_constraint = lambda: self._compute_kl_constraint(observations) + return self._policy_optimizer.step(data, self._loss_function, + f_constraint) diff --git a/src/garage/torch/algos/vpg.py b/src/garage/torch/algos/vpg.py index 42a75444fb..3586b322f4 100644 --- a/src/garage/torch/algos/vpg.py +++ b/src/garage/torch/algos/vpg.py @@ -10,9 +10,11 @@ from garage import log_performance from garage.np import discount_cumsum from garage.np.algos import RLAlgorithm -from garage.torch import compute_advantages, filter_valids -from garage.torch._functions import np_to_torch, zero_optim_grads -from garage.torch.optimizers import OptimizerWrapper +from garage.torch import (as_tensor, compute_advantages, filter_valids, + global_device, ObservationBatch, ObservationOrder) +from garage.torch._functions import (np_to_torch, pad_packed_tensor, + split_packed_tensor) +from garage.torch.optimizers import MinibatchOptimizer class VPG(RLAlgorithm): @@ -26,11 +28,11 @@ class VPG(RLAlgorithm): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. - num_train_per_epoch (int): Number of train_once calls per epoch. + steps_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. gae_lambda (float): Lambda used for generalized advantage estimation. @@ -42,6 +44,8 @@ class VPG(RLAlgorithm): standardized before shifting. policy_ent_coeff (float): The coefficient of the policy entropy. Setting it to zero would mean no entropy regularization. + use_neg_logli_entropy (bool): Whether to estimate the entropy as the + negative log likelihood of the action. use_softplus_entropy (bool): Whether to estimate the softmax distribution of the entropy to prevent the entropy from being negative. 
@@ -62,15 +66,17 @@ def __init__( sampler, policy_optimizer=None, vf_optimizer=None, - num_train_per_epoch=1, + steps_per_epoch=1, discount=0.99, gae_lambda=1, center_adv=True, positive_adv=False, policy_ent_coeff=0.0, + use_neg_logli_entropy=True, use_softplus_entropy=False, - stop_entropy_gradient=False, + stop_entropy_gradient=True, entropy_method='no_entropy', + recurrent=None, ): self._discount = discount self.policy = policy @@ -83,8 +89,9 @@ def __init__( self._policy_ent_coeff = policy_ent_coeff self._use_softplus_entropy = use_softplus_entropy self._stop_entropy_gradient = stop_entropy_gradient + self._use_neg_logli_entropy = use_neg_logli_entropy self._entropy_method = entropy_method - self._n_samples = num_train_per_epoch + self._steps_per_epoch = steps_per_epoch self._env_spec = env_spec self._maximum_entropy = (entropy_method == 'max') @@ -93,19 +100,21 @@ def __init__( stop_entropy_gradient, policy_ent_coeff) self._episode_reward_mean = collections.deque(maxlen=100) - self._sampler = sampler + self.sampler = sampler if policy_optimizer: self._policy_optimizer = policy_optimizer else: - self._policy_optimizer = OptimizerWrapper(torch.optim.Adam, policy) + self._policy_optimizer = MinibatchOptimizer( + torch.optim.Adam, policy) if vf_optimizer: self._vf_optimizer = vf_optimizer else: - self._vf_optimizer = OptimizerWrapper(torch.optim.Adam, - value_function) + self._vf_optimizer = MinibatchOptimizer(torch.optim.Adam, + value_function) self._old_policy = copy.deepcopy(self.policy) + self._recurrent = recurrent @staticmethod def _check_entropy_configuration(entropy_method, center_adv, @@ -134,77 +143,71 @@ def discount(self): """ return self._discount - def _train_once(self, itr, eps): + def _train_once(self, eps): """Train the algorithm once. Args: - itr (int): Iteration number. eps (EpisodeBatch): A batch of collected paths. - Returns: - numpy.float64: Calculated mean value of undiscounted returns. - """ - obs = np_to_torch(eps.padded_observations) - rewards = np_to_torch(eps.padded_rewards) - returns = np_to_torch( - np.stack([ - discount_cumsum(reward, self.discount) - for reward in eps.padded_rewards - ])) - valids = eps.lengths - with torch.no_grad(): - baselines = self._value_function(obs) - + # Conver to torch and compute returns, etc. 
+ lengths = torch.from_numpy(eps.lengths).to(global_device()) + obs = ObservationBatch(as_tensor(eps.observations), + order=ObservationOrder.EPISODES, + lengths=lengths) + actions = torch.Tensor(eps.actions) + rewards = torch.Tensor(eps.rewards) + policy_entropies = self._compute_policy_entropy(obs, actions) if self._maximum_entropy: - policy_entropies = self._compute_policy_entropy(obs) rewards += self._policy_ent_coeff * policy_entropies + returns = torch.hstack([ + discount_cumsum(r, self._discount) + for r in split_packed_tensor(rewards, lengths) + ]) + with torch.no_grad(): + baselines = self._value_function(obs) + advantages = self._compute_advantage(rewards, lengths, baselines) - obs_flat = np_to_torch(eps.observations) - actions_flat = np_to_torch(eps.actions) - rewards_flat = np_to_torch(eps.rewards) - returns_flat = torch.cat(filter_valids(returns, valids)) - advs_flat = self._compute_advantage(rewards, valids, baselines) - + # Log before training with torch.no_grad(): - policy_loss_before = self._compute_loss_with_adv( - obs_flat, actions_flat, rewards_flat, advs_flat) - vf_loss_before = self._value_function.compute_loss( - obs_flat, returns_flat) - kl_before = self._compute_kl_constraint(obs) + policy_loss_before = self._loss_function(obs, actions, rewards, + advantages, lengths) + vf_loss_before = self._value_function.loss_function(obs, returns) - self._train(obs_flat, actions_flat, rewards_flat, returns_flat, - advs_flat) + with tabular.prefix(self.policy.name): + tabular.record('/LossBefore', policy_loss_before.item()) + tabular.record('/KLBefore', + self._compute_kl_constraint(obs).item()) + tabular.record('/EntropyBefore', + policy_entropies.mean().item()) - with torch.no_grad(): - policy_loss_after = self._compute_loss_with_adv( - obs_flat, actions_flat, rewards_flat, advs_flat) - vf_loss_after = self._value_function.compute_loss( - obs_flat, returns_flat) - kl_after = self._compute_kl_constraint(obs) - policy_entropy = self._compute_policy_entropy(obs) - - with tabular.prefix(self.policy.name): - tabular.record('/LossBefore', policy_loss_before.item()) - tabular.record('/LossAfter', policy_loss_after.item()) - tabular.record('/dLoss', - (policy_loss_before - policy_loss_after).item()) - tabular.record('/KLBefore', kl_before.item()) - tabular.record('/KL', kl_after.item()) - tabular.record('/Entropy', policy_entropy.mean().item()) - - with tabular.prefix(self._value_function.name): - tabular.record('/LossBefore', vf_loss_before.item()) - tabular.record('/LossAfter', vf_loss_after.item()) - tabular.record('/dLoss', - vf_loss_before.item() - vf_loss_after.item()) + with tabular.prefix(self._value_function.name): + tabular.record('/LossBefore', vf_loss_before.item()) + # Save the current policy state and train self._old_policy.load_state_dict(self.policy.state_dict()) + self._train_policy(obs, actions, rewards, returns, advantages, lengths) + self._train_value_function(obs, returns, lengths) + + # Log after training + with torch.no_grad(): - undiscounted_returns = log_performance(itr, - eps, - discount=self._discount) - return np.mean(undiscounted_returns) + policy_loss_after = self._loss_function(obs, actions, rewards, + advantages, lengths) + with tabular.prefix(self.policy.name): + tabular.record('/LossAfter', policy_loss_after.item()) + tabular.record('/dLoss', + (policy_loss_before - policy_loss_after).item()) + tabular.record('/KL', self._compute_kl_constraint(obs).item()) + tabular.record( + '/EntropyAfter', + self._compute_policy_entropy(obs, actions).mean().item()) 
+ + vf_loss_after = self._value_function.loss_function(obs, returns) + with tabular.prefix(self._value_function.name): + tabular.record('/vfLossAfter', vf_loss_after.item()) + tabular.record('/dLoss', + vf_loss_before.item() - vf_loss_after.item()) def train(self, trainer): """Obtain samplers and start actual training for each epoch. @@ -220,84 +223,73 @@ def train(self, trainer): """ last_return = None - for _ in trainer.step_epochs(): - for _ in range(self._n_samples): - eps = trainer.obtain_episodes(trainer.step_itr) - last_return = self._train_once(trainer.step_itr, eps) - trainer.step_itr += 1 - + for epoch in trainer.step_epochs(): + for _ in range(self._steps_per_epoch): + trainer.step_path = trainer.obtain_episodes(epoch) + self._train_once(trainer.step_path) + last_return = np.mean( + log_performance(epoch, + trainer.step_path, + discount=self._discount)) return last_return - def _train(self, obs, actions, rewards, returns, advs): - r"""Train the policy and value function with minibatch. - - Args: - obs (torch.Tensor): Observation from the environment with shape - :math:`(N, O*)`. - actions (torch.Tensor): Actions fed to the environment with shape - :math:`(N, A*)`. - rewards (torch.Tensor): Acquired rewards with shape :math:`(N, )`. - returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. - advs (torch.Tensor): Advantage value at each step with shape - :math:`(N, )`. - - """ - for dataset in self._policy_optimizer.get_minibatch( - obs, actions, rewards, advs): - self._train_policy(*dataset) - for dataset in self._vf_optimizer.get_minibatch(obs, returns): - self._train_value_function(*dataset) - - def _train_policy(self, obs, actions, rewards, advantages): + def _train_policy(self, observations, actions, rewards, returns, + advantages, lengths): r"""Train the policy. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N, A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N, )`. + returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of policy loss (float). """ - # pylint: disable=protected-access - zero_optim_grads(self._policy_optimizer._optimizer) - loss = self._compute_loss_with_adv(obs, actions, rewards, advantages) - loss.backward() - self._policy_optimizer.step() - - return loss - - def _train_value_function(self, obs, returns): + data = { + 'observations': observations, + 'actions': actions, + 'rewards': rewards, + 'advantages': advantages, + 'lengths': lengths + } + return self._policy_optimizer.step(data, self._loss_function) + + def _train_value_function(self, observations, returns, lengths): r"""Train the value function. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of value function loss (float). 
""" - # pylint: disable=protected-access - zero_optim_grads(self._vf_optimizer._optimizer) - loss = self._value_function.compute_loss(obs, returns) - loss.backward() - self._vf_optimizer.step() - - return loss - - def _compute_loss(self, obs, actions, rewards, valids, baselines): + data = { + 'observations': observations, + 'returns': returns, + 'lengths': lengths + } + return self._vf_optimizer.step(data, + self._value_function.loss_function) + + def _compute_loss(self, obs, actions, rewards, lengths, baselines): r"""Compute mean value of loss. + Note that this function is private, but used by MAML. + Notes: P is the maximum episode length (self.max_episode_length) Args: @@ -307,7 +299,7 @@ def _compute_loss(self, obs, actions, rewards, valids, baselines): with shape :math:`(N, P, A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N, P)`. - valids (list[int]): Numbers of valid steps in each episode + lengths (list[int]): Numbers of valid steps in each episode baselines (torch.Tensor): Value function estimation at each step with shape :math:`(N, P)`. @@ -316,19 +308,24 @@ def _compute_loss(self, obs, actions, rewards, valids, baselines): objective (float). """ - obs_flat = torch.cat(filter_valids(obs, valids)) - actions_flat = torch.cat(filter_valids(actions, valids)) - rewards_flat = torch.cat(filter_valids(rewards, valids)) - advantages_flat = self._compute_advantage(rewards, valids, baselines) - - return self._compute_loss_with_adv(obs_flat, actions_flat, - rewards_flat, advantages_flat) - - def _compute_loss_with_adv(self, obs, actions, rewards, advantages): + obs_flat = torch.cat(filter_valids(obs, lengths)) + actions_flat = torch.cat(filter_valids(actions, lengths)) + rewards_flat = torch.cat(filter_valids(rewards, lengths)) + advantages_flat = self._compute_advantage(rewards, lengths, baselines) + + return self._loss_function(obs_flat, actions_flat, rewards_flat, + advantages_flat, lengths) + + def _loss_function(self, + observations, + actions, + rewards, + advantages, + lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N \dot [T], A*)`. @@ -336,50 +333,56 @@ def _compute_loss_with_adv(self, obs, actions, rewards, advantages): with shape :math:`(N \dot [T], )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N \dot [T], )`. + lengths (torch.Tensor or None): Lengths of episodes, if operating + on full episodes. Returns: torch.Tensor: Calculated negative mean scalar value of objective. """ - objectives = self._compute_objective(advantages, obs, actions, rewards) + objectives = self._compute_objective(advantages, observations, actions, + rewards) if self._entropy_regularzied: - policy_entropies = self._compute_policy_entropy(obs) + policy_entropies = self._compute_policy_entropy( + observations, actions) objectives += self._policy_ent_coeff * policy_entropies return -objectives.mean() - def _compute_advantage(self, rewards, valids, baselines): + def _compute_advantage(self, rewards, lengths, baselines): r"""Compute mean value of loss. - Notes: P is the maximum episode length (self.max_episode_length) - Args: - rewards (torch.Tensor): Acquired rewards - with shape :math:`(N, P)`. 
- valids (list[int]): Numbers of valid steps in each episode - baselines (torch.Tensor): Value function estimation at each step - with shape :math:`(N, P)`. + rewards (torch.Tensor): Packed acquired rewards + with shape :math:`(N \bullet [T])`. + lengths (list[int]): Numbers of valid steps in each episode + baselines (torch.Tensor): Packed value function estimation of + returns with shape :math:`(N \bullet [T])`. Returns: torch.Tensor: Calculated advantage values given rewards and baselines with shape :math:`(N \dot [T], )`. """ - advantages = compute_advantages(self._discount, self._gae_lambda, - self.max_episode_length, baselines, - rewards) - advantage_flat = torch.cat(filter_valids(advantages, valids)) + padded_rewards = pad_packed_tensor(rewards, lengths) + padded_baselines = pad_packed_tensor(baselines, lengths) + padded_advantages = compute_advantages(self._discount, + self._gae_lambda, + self.max_episode_length, + padded_baselines, + padded_rewards) + advantages = torch.cat(filter_valids(padded_advantages, lengths)) if self._center_adv: - means = advantage_flat.mean() - variance = advantage_flat.var() - advantage_flat = (advantage_flat - means) / (variance + 1e-8) + means = advantages.mean() + variance = advantages.var() + advantages = (advantages - means) / (variance + 1e-8) if self._positive_adv: - advantage_flat -= advantage_flat.min() + advantages -= advantages.min() - return advantage_flat + return advantages def _compute_kl_constraint(self, obs): r"""Compute KL divergence. @@ -408,25 +411,27 @@ def _compute_kl_constraint(self, obs): return kl_constraint.mean() - def _compute_policy_entropy(self, obs): + def _compute_policy_entropy(self, obs, actions): r"""Compute entropy value of probability distribution. Notes: P is the maximum episode length (self.max_episode_length) Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, P, O*)`. + actions (torch.Tensor): Actions fed to the environment + with shape :math:`(N \dot [T], A*)`. Returns: torch.Tensor: Calculated entropy values given observation with shape :math:`(N, P)`. 
""" - if self._stop_entropy_gradient: - with torch.no_grad(): + with torch.set_grad_enabled(not self._stop_entropy_gradient): + if self._use_neg_logli_entropy: + policy_entropy = -self.policy(obs)[0].log_prob(actions) + else: policy_entropy = self.policy(obs)[0].entropy() - else: - policy_entropy = self.policy(obs)[0].entropy() # This prevents entropy from becoming negative for small policy std if self._use_softplus_entropy: diff --git a/src/garage/torch/optimizers/__init__.py b/src/garage/torch/optimizers/__init__.py index bc21022af4..bf9dec1f12 100644 --- a/src/garage/torch/optimizers/__init__.py +++ b/src/garage/torch/optimizers/__init__.py @@ -2,8 +2,9 @@ from garage.torch.optimizers.conjugate_gradient_optimizer import ( ConjugateGradientOptimizer) from garage.torch.optimizers.differentiable_sgd import DifferentiableSGD -from garage.torch.optimizers.optimizer_wrapper import OptimizerWrapper +from garage.torch.optimizers.episode_batch_optimizer import ( + EpisodeBatchOptimizer) +from garage.torch.optimizers.minibatch_optimizer import MinibatchOptimizer +from garage.torch.optimizers.optimizer import Optimizer -__all__ = [ - 'OptimizerWrapper', 'ConjugateGradientOptimizer', 'DifferentiableSGD' -] +__all__ = ['Optimizer', 'ConjugateGradientOptimizer', 'DifferentiableSGD'] diff --git a/src/garage/torch/optimizers/conjugate_gradient_optimizer.py b/src/garage/torch/optimizers/conjugate_gradient_optimizer.py index 489587e672..0157d2c908 100644 --- a/src/garage/torch/optimizers/conjugate_gradient_optimizer.py +++ b/src/garage/torch/optimizers/conjugate_gradient_optimizer.py @@ -138,12 +138,12 @@ def __init__(self, self._hvp_reg_coeff = hvp_reg_coeff self._accept_violation = accept_violation - def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ + def step(self, loss_function, constraint_function): # pylint: disable=arguments-differ """Take an optimization step. Args: - f_loss (callable): Function to compute the loss. - f_constraint (callable): Function to compute the constraint value. + loss_function (callable): Function to compute the loss. + constraint_function (callable): Function to compute the constraint value. """ # Collect trainable parameters and gradients @@ -157,7 +157,7 @@ def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ flat_loss_grads = torch.cat(grads) # Build Hessian-vector-product function - f_Ax = _build_hessian_vector_product(f_constraint, params, + f_Ax = _build_hessian_vector_product(constraint_function, params, self._hvp_reg_coeff) # Compute step direction @@ -177,8 +177,8 @@ def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ descent_step = step_size * step_dir # Update parameters using backtracking line search - self._backtracking_line_search(params, descent_step, f_loss, - f_constraint) + self._backtracking_line_search(params, descent_step, loss_function, + constraint_function) @property def state(self): diff --git a/src/garage/torch/optimizers/optimizer_wrapper.py b/src/garage/torch/optimizers/optimizer_wrapper.py deleted file mode 100644 index 9f69ce565d..0000000000 --- a/src/garage/torch/optimizers/optimizer_wrapper.py +++ /dev/null @@ -1,63 +0,0 @@ -"""A PyTorch optimizer wrapper that compute loss and optimize module.""" -from garage import make_optimizer -from garage.np.optimizers import BatchDataset - - -class OptimizerWrapper: - """A wrapper class to handle torch.optim.optimizer. - - Args: - optimizer (Union[type, tuple[type, dict]]): Type of optimizer - for policy. 
This can be an optimizer type such as - `torch.optim.Adam` or a tuple of type and dictionary, where - dictionary contains arguments to initialize the optimizer. - e.g. `(torch.optim.Adam, {'lr' : 1e-3})` - Sample strategy to be used when sampling a new task. - module (torch.nn.Module): Module to be optimized. - max_optimization_epochs (int): Maximum number of epochs for update. - minibatch_size (int): Batch size for optimization. - - """ - - def __init__(self, - optimizer, - module, - max_optimization_epochs=1, - minibatch_size=None): - self._optimizer = make_optimizer(optimizer, module=module) - self._max_optimization_epochs = max_optimization_epochs - self._minibatch_size = minibatch_size - - def get_minibatch(self, *inputs): - r"""Yields a batch of inputs. - - Notes: P is the size of minibatch (self._minibatch_size) - - Args: - *inputs (list[torch.Tensor]): A list of inputs. Each input has - shape :math:`(N \dot [T], *)`. - - Yields: - list[torch.Tensor]: A list batch of inputs. Each batch has shape - :math:`(P, *)`. - - """ - batch_dataset = BatchDataset(inputs, self._minibatch_size) - - for _ in range(self._max_optimization_epochs): - for dataset in batch_dataset.iterate(): - yield dataset - - def zero_grad(self): - r"""Clears the gradients of all optimized :class:`torch.Tensor` s.""" - self._optimizer.zero_grad() - - def step(self, **closure): - """Performs a single optimization step. - - Arguments: - **closure (callable, optional): A closure that reevaluates the - model and returns the loss. - - """ - self._optimizer.step(**closure) diff --git a/src/garage/torch/value_functions/gaussian_mlp_value_function.py b/src/garage/torch/value_functions/gaussian_mlp_value_function.py index d340fef2a9..7f0670841c 100644 --- a/src/garage/torch/value_functions/gaussian_mlp_value_function.py +++ b/src/garage/torch/value_functions/gaussian_mlp_value_function.py @@ -78,11 +78,11 @@ def __init__(self, std_parameterization='exp', layer_normalization=layer_normalization) - def compute_loss(self, obs, returns): + def loss_function(self, observations, returns, lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. @@ -91,7 +91,7 @@ def compute_loss(self, obs, returns): objective (float). """ - dist = self.module(obs) + dist = self.module(observations) ll = dist.log_prob(returns.reshape(-1, 1)) loss = -ll.mean() return loss diff --git a/src/garage/torch/value_functions/value_function.py b/src/garage/torch/value_functions/value_function.py index 1cc533f33d..7aa54e7968 100644 --- a/src/garage/torch/value_functions/value_function.py +++ b/src/garage/torch/value_functions/value_function.py @@ -20,11 +20,11 @@ def __init__(self, env_spec, name): self.name = name @abc.abstractmethod - def compute_loss(self, obs, returns): + def loss_function(self, observations, returns, lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. 
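The _train_policy and _train_value_function rewrites above call self._policy_optimizer.step(data, loss_function) (TRPO additionally passes a constraint function), and build the optimizer from an (optimizer class, kwargs) tuple plus the module to optimize. The minibatch_optimizer.py, episode_batch_optimizer.py, and optimizer.py modules imported by optimizers/__init__.py are not included in this diff, so the class below is only a full-batch stand-in for that implied contract; its name and behavior are assumptions, not the real MinibatchOptimizer:

import torch


class FullBatchOptimizerSketch:
    """Minimal stand-in matching how vpg.py and trpo.py call their optimizer."""

    def __init__(self, optimizer, module, max_optimization_epochs=1,
                 minibatch_size=None):
        if isinstance(optimizer, tuple):
            opt_cls, opt_kwargs = optimizer
        else:
            opt_cls, opt_kwargs = optimizer, {}
        self._optimizer = opt_cls(module.parameters(), **opt_kwargs)
        self._max_optimization_epochs = max_optimization_epochs
        self._minibatch_size = minibatch_size  # ignored in this sketch

    def step(self, data, loss_function, constraint_function=None):
        # data maps argument names ('observations', 'actions', ...) to packed
        # tensors; the loss function accepts them as keyword arguments.
        del constraint_function  # only trust-region optimizers need it
        loss = None
        for _ in range(self._max_optimization_epochs):
            self._optimizer.zero_grad()
            loss = loss_function(**data)
            loss.backward()
            self._optimizer.step()
        return loss

A real MinibatchOptimizer would presumably also split the data into shuffled minibatches, while an EpisodeBatchOptimizer would have to keep whole episodes (and their lengths) together, which is likely why ObservationBatch and ShuffledOptimizationNotSupported exist in _dtypes.py.
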
diff --git a/tests/garage/torch/test_functions.py b/tests/garage/torch/test_functions.py
index b89b0c0042..0235a68f54 100644
--- a/tests/garage/torch/test_functions.py
+++ b/tests/garage/torch/test_functions.py
@@ -10,6 +10,8 @@
 from garage.envs import GymEnv, normalize
 from garage.experiment.deterministic import set_seed
+from garage.np import discount_cumsum as np_discount_cumsum
+from garage.torch._functions import discount_cumsum
 from garage.torch import (as_torch_dict, compute_advantages,
                           flatten_to_single_vector, global_device,
                           pad_to_last, product_of_gaussians, set_gpu_mode,
                           state_dict_to,
@@ -129,6 +131,16 @@ def test_state_dict_to():
     assert np.all(
         [moved_state_dict[key].is_cuda for key in moved_state_dict.keys()])
 
 
+def test_discount_cumsum():
+    discount = 0.99
+    x = torch.tensor([9.3217, 9.3003, 9.3406, 9.2251, 9.0715, 9.0134, 8.9026,
+                      8.6619])
+    returns = discount_cumsum(x, discount)
+    expected = np_discount_cumsum(torch_to_np(x), discount)
+    assert returns.shape == (len(x), )
+    assert np.allclose(expected, torch_to_np(returns))
+
+
 class TestTorchAlgoUtils(TfGraphTestCase):
     """Test class for torch algo utility functions."""
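Throughout the rework, per-step data stays "packed" (episodes concatenated along the batch dimension), and the episode lengths are used to move to a padded (N, max_length, ...) layout only where compute_advantages needs it. A small self-contained sketch (separate from the patch) mirroring the split_packed_tensor / pad_packed_tensor helpers added in _functions.py:

import torch


def split_packed(t, lengths):
    # Yield one slice per episode from a packed (concatenated) tensor.
    start = 0
    for length in lengths:
        yield t[start:start + length]
        start += length


def pad_packed(t, lengths, max_length=None):
    # Stack per-episode slices into a zero-padded (N, max_length, ...) tensor.
    if max_length is None:
        max_length = max(lengths)
    out = torch.zeros((len(lengths), max_length) + t.shape[1:],
                      dtype=t.dtype, device=t.device)
    for i, seq in enumerate(split_packed(t, lengths)):
        out[i, :len(seq)] = seq
    return out


rewards = torch.arange(5.)  # two episodes, packed: [0, 1, 2] and [3, 4]
padded = pad_packed(rewards, lengths=[3, 2])
assert padded.shape == (2, 3)
assert padded.tolist() == [[0.0, 1.0, 2.0], [3.0, 4.0, 0.0]]
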