From d45045000eefdeebe2a02294709b92906de47678 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" Date: Tue, 1 Dec 2020 06:41:19 -0800 Subject: [PATCH 1/6] Add garage.torch.ObservationBatch --- src/garage/torch/__init__.py | 7 ++ src/garage/torch/_dtypes.py | 126 +++++++++++++++++++++++++++++++++++ tests/garage/test_dtypes.py | 2 + 3 files changed, 135 insertions(+) create mode 100644 src/garage/torch/_dtypes.py diff --git a/src/garage/torch/__init__.py b/src/garage/torch/__init__.py index f2add664be..2a439112b2 100644 --- a/src/garage/torch/__init__.py +++ b/src/garage/torch/__init__.py @@ -1,5 +1,8 @@ """PyTorch-backed modules and algorithms.""" # yapf: disable +from garage.torch._dtypes import (ObservationBatch, ObservationOrder, + ShuffledOptimizationNotSupported, + observation_batch_to_packed_sequence) from garage.torch._functions import (as_torch_dict, compute_advantages, expand_var, filter_valids, flatten_batch, flatten_to_single_vector, global_device, @@ -21,12 +24,16 @@ 'flatten_to_single_vector', 'global_device', 'np_to_torch', + 'ObservationBatch', + 'observation_batch_to_packed_sequence', + 'ObservationOrder', 'output_height_2d', 'output_width_2d', 'pad_to_last', 'prefer_gpu', 'product_of_gaussians', 'set_gpu_mode', + 'ShuffledOptimizationNotSupported', 'soft_update_model', 'state_dict_to', 'torch_to_np', diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py new file mode 100644 index 0000000000..dec76c2c7d --- /dev/null +++ b/src/garage/torch/_dtypes.py @@ -0,0 +1,126 @@ +"""Data structures used in garage.torch.""" +from dataclasses import dataclass +import enum + +import torch +from torch import nn + + +class ShuffledOptimizationNotSupported(ValueError): + """Raised by recurrent policies if they're passed a shuffled batch.""" + + +class ObservationOrder(enum.IntEnum): + """Defines the order of observations in an ObservationBatch. + + See :class:`ObservationBatch` for detailed documentation. + + """ + # Tensor contains a batch of "most recent" observations. + # This ordering is typcially used when performing rollouts, and it is + # expected that stateful policies maintain there own state when using this + # ordering. + LAST = 0 + # Tensor contains observations with timesteps from potentially different + # episodes in a shuffled order. Recurrent policies should raise + # ShuffledOptimizationNotSupported if they encounter this ordering. + SHUFFLED = 1 + # Tensor contains all observations for a batch of episodes, in order. + EPISODES = 2 + + +@dataclass(init=False) +class ObservationBatch(torch.Tensor): + r"""The (differentiable) input to all pytorch policies. + + Args: + observations (torch.Tensor): A torch tensor containing flattened + observations in a batch. Stateless policies should always operate + on this input. This input is passed to the super-constructor. + Shape depends on the order: + * If `order == ROLLOUT`, has shape :math:`(V, O)` (where V is the + vectorization level). + * If `order == SHUFFLED`, has shape :math:`(B, O)` (where B is the + mini-batch size). + * If order == EPISODES, has shape :math:`(N \bullet [T], O)` + (where N is the number of episodes, and T is the episode + lengths). + order (ObservationOrder): The order of observations in this batch. If + this is set to EPISODES, lengths must not be None. + lengths (torch.Tensor or None): Integer tensor containing the lengths + of each episode. Only has a value if `order == EPISODES`. 
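For illustration, a minimal sketch of the intended usage of the two common orders with the API this patch adds (values are placeholders; lengths is passed explicitly in both cases):

import torch

from garage.torch import (ObservationBatch, ObservationOrder,
                          observation_batch_to_packed_sequence)

# Two episodes of lengths 3 and 2, flat observation dim 4, packed together.
episodes = ObservationBatch(torch.zeros(5, 4),
                            order=ObservationOrder.EPISODES,
                            lengths=torch.tensor([3, 2]))
packed = observation_batch_to_packed_sequence(episodes)  # for recurrent policies

# During rollouts, a vectorized sampler passes the latest observation per env.
latest = ObservationBatch(torch.zeros(2, 4),
                          order=ObservationOrder.LAST,
                          lengths=None)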
+ """ + + order: ObservationOrder + lengths: torch.Tensor = None + + def __init__(self, observations, order, lengths): + """Check that lengths is consistent with the rest of the fields. + + Raises: + ValueError: If lengths is not consistent with another field. + + """ + super().__init__(observations) + self.order = order + self.lengths = lengths + if self.order == ObservationOrder.EPISODES: + if self.lengths is None: + raise ValueError( + 'lengths is None, but must be a torch.Tensor when ' + 'order == ObservationOrder.EPISODES') + assert self.lengths is not None + if self.lengths.dtype not in (torch.uint8, torch.int8, torch.int16, + torch.int32, torch.int64): + raise ValueError( + f'lengths has dtype {self.lengths.dtype}, but must have ' + f'an integer dtype') + total_size = sum(self.lengths) + if self.observations.shape[0] != total_size: + raise ValueError( + f'observations has batch size ' + f'{self.observations.shape[0]}, but must have batch ' + f'size {total_size} to match lengths') + assert self.observations.shape[0] == total_size + elif self.lengths is not None: + raise ValueError( + f'lengths has value {self.lengths}, but must be None ' + f'when order == {self.order}') + + +def observation_batch_to_packed_sequence(observations): + """Turn ObservationBatch into a torch.nn.utils.rnn.PackedSequence. + + This function is not a method on ObservationBatch so that it can be called + on a observation Tensor that is not an ObservationBatch. This simplifies + the implementation of recurrent policies. + + Args: + observations (torch.Tensor or ObservationBatch): Observations to + convert to PackedSequence. + + Raises: + ShuffledOptimizationNotSupported: If called with an input that is not + an ObservationBatch or when `order != EPISODES` + + Returns: + torch.nn.utils.rnn.PackedSequence: The sequence of flattened + observations. + + """ + if not isinstance(observations, ObservationBatch): + raise ShuffledOptimizationNotSupported( + f'observations should be an ObservationBatch, but was of ' + f'type {type(observations)!r} instead.') + if observations.order != ObservationOrder.EPISODES: + raise ShuffledOptimizationNotSupported( + f'order has value {observations.order} but must have order ' + f'{ObservationOrder.EPISODES} to use to_packed_sequence') + sequence = [] + start = 0 + for length in observations.lengths: + stop = start + length + sequence.append(observations.observations[start:stop]) + start = stop + pack_sequence = nn.utils.rnn.pack_sequence + return pack_sequence(sequence, enforce_sorted=False) diff --git a/tests/garage/test_dtypes.py b/tests/garage/test_dtypes.py index e5d23b3af6..4e2bcb9031 100644 --- a/tests/garage/test_dtypes.py +++ b/tests/garage/test_dtypes.py @@ -6,6 +6,7 @@ # yapf: disable from garage import (EnvSpec, EnvStep, EpisodeBatch, StepType, TimeStep, TimeStepBatch) +from garage._dtypes import check_timestep_batch # yapf: enable @@ -77,6 +78,7 @@ def test_new_eps(eps_data): assert t.episode_infos_by_episode is eps_data['episode_infos'] assert (t.episode_infos['task_one_hot'][0].shape == eps_data['episode_infos']['task_one_hot'][0].shape) + check_timestep_batch(t, np.ndarray) def test_lengths_shape_mismatch_eps(eps_data): From b540f212ab8a0086caa4064e617d2ef068a28d30 Mon Sep 17 00:00:00 2001 From: "K.R. 
Zentner" Date: Tue, 1 Dec 2020 06:47:58 -0800 Subject: [PATCH 2/6] Use ObservationBatch in StochasticPolicy --- src/garage/torch/__init__.py | 4 +++- src/garage/torch/_dtypes.py | 2 +- src/garage/torch/policies/stochastic_policy.py | 12 ++++++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/garage/torch/__init__.py b/src/garage/torch/__init__.py index 2a439112b2..71b119b7dd 100644 --- a/src/garage/torch/__init__.py +++ b/src/garage/torch/__init__.py @@ -11,7 +11,8 @@ pad_to_last, prefer_gpu, product_of_gaussians, set_gpu_mode, soft_update_model, state_dict_to, - torch_to_np, update_module_params) + torch_to_np, update_module_params, + list_to_tensor) # yapf: enable __all__ = [ @@ -23,6 +24,7 @@ 'flatten_batch', 'flatten_to_single_vector', 'global_device', + 'list_to_tensor', 'np_to_torch', 'ObservationBatch', 'observation_batch_to_packed_sequence', diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py index dec76c2c7d..a1c33d3e0b 100644 --- a/src/garage/torch/_dtypes.py +++ b/src/garage/torch/_dtypes.py @@ -54,7 +54,7 @@ class ObservationBatch(torch.Tensor): order: ObservationOrder lengths: torch.Tensor = None - def __init__(self, observations, order, lengths): + def __init__(self, observations, order, lengths=None): """Check that lengths is consistent with the rest of the fields. Raises: diff --git a/src/garage/torch/policies/stochastic_policy.py b/src/garage/torch/policies/stochastic_policy.py index 84a60ca7a8..6d6eafdfbd 100644 --- a/src/garage/torch/policies/stochastic_policy.py +++ b/src/garage/torch/policies/stochastic_policy.py @@ -5,7 +5,8 @@ import numpy as np import torch -from garage.torch._functions import list_to_tensor, np_to_torch +from garage.torch import (list_to_tensor, np_to_torch, ObservationBatch, + ObservationOrder) from garage.torch.policies.policy import Policy @@ -92,6 +93,8 @@ def get_actions(self, observations): if isinstance(self._env_spec.observation_space, akro.Image): observations /= 255.0 # scale image + observations = ObservationBatch(observations, + order=ObservationOrder.LAST) dist, info = self.forward(observations) return dist.sample().cpu().numpy(), { k: v.detach().cpu().numpy() @@ -105,7 +108,12 @@ def forward(self, observations): Args: observations (torch.Tensor): Batch of observations on default - torch device. + torch device. Stateful policies may require this input to be a + garage.torch.ObservationBatch. + + Raises: + ShuffledOptimizationNotSupported: If this policy is a stateful + policy and the required an ObservationBatch. Returns: torch.distributions.Distribution: Batch distribution of actions. From 9985301ed22bc389f1b7aa06a7f95565c11f6eca Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" Date: Sun, 13 Dec 2020 14:04:10 -0800 Subject: [PATCH 3/6] Make garage.torch.ObservationBatch constructable --- src/garage/torch/_dtypes.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py index a1c33d3e0b..f0be030ce4 100644 --- a/src/garage/torch/_dtypes.py +++ b/src/garage/torch/_dtypes.py @@ -54,14 +54,17 @@ class ObservationBatch(torch.Tensor): order: ObservationOrder lengths: torch.Tensor = None - def __init__(self, observations, order, lengths=None): + def __new__(cls, observations, order, lengths=None): """Check that lengths is consistent with the rest of the fields. Raises: ValueError: If lengths is not consistent with another field. + Returns: + ObservationBatch: A new observation batch. 
+ """ - super().__init__(observations) + self = super().__new__(cls, observations) self.order = order self.lengths = lengths if self.order == ObservationOrder.EPISODES: @@ -86,6 +89,7 @@ def __init__(self, observations, order, lengths=None): raise ValueError( f'lengths has value {self.lengths}, but must be None ' f'when order == {self.order}') + return self def observation_batch_to_packed_sequence(observations): From 19e4dbb7e7c048b5fe28b8156927bbe5875e9b97 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" Date: Sun, 13 Dec 2020 14:10:08 -0800 Subject: [PATCH 4/6] Implement garage.torch.GaussianLSTMPolicy --- src/garage/torch/policies/__init__.py | 2 + .../torch/policies/gaussian_lstm_policy.py | 228 ++++++++++++++++++ .../policies/test_gaussian_lstm_policy.py | 67 +++++ 3 files changed, 297 insertions(+) create mode 100644 src/garage/torch/policies/gaussian_lstm_policy.py create mode 100644 tests/garage/torch/policies/test_gaussian_lstm_policy.py diff --git a/src/garage/torch/policies/__init__.py b/src/garage/torch/policies/__init__.py index c50d46bc2f..194078680d 100644 --- a/src/garage/torch/policies/__init__.py +++ b/src/garage/torch/policies/__init__.py @@ -7,6 +7,7 @@ from garage.torch.policies.discrete_cnn_policy import DiscreteCNNPolicy from garage.torch.policies.discrete_qf_argmax_policy import ( DiscreteQFArgmaxPolicy) +from garage.torch.policies.gaussian_lstm_policy import GaussianLSTMPolicy from garage.torch.policies.gaussian_mlp_policy import GaussianMLPPolicy from garage.torch.policies.policy import Policy from garage.torch.policies.tanh_gaussian_mlp_policy import ( @@ -21,4 +22,5 @@ 'Policy', 'TanhGaussianMLPPolicy', 'ContextConditionedPolicy', + 'GaussianLSTMPolicy', ] diff --git a/src/garage/torch/policies/gaussian_lstm_policy.py b/src/garage/torch/policies/gaussian_lstm_policy.py new file mode 100644 index 0000000000..d028673cb1 --- /dev/null +++ b/src/garage/torch/policies/gaussian_lstm_policy.py @@ -0,0 +1,228 @@ +"""GaussianLSTMPolicy.""" +import torch +from torch import nn +from torch.distributions import Normal + +from garage.torch import (observation_batch_to_packed_sequence, + ObservationBatch, ObservationOrder, + ShuffledOptimizationNotSupported) +from garage.torch.modules import GaussianMLPModule +from garage.torch.policies.stochastic_policy import StochasticPolicy + + +class GaussianLSTMPolicy(StochasticPolicy): + """LSTM whose outputs are fed into a Normal distribution.. + + A policy that contains a LSTM to make prediction based on a gaussian + distribution. + + Args: + env_spec (EnvSpec): Environment specification. + name (str): Name of policy. + hidden_size (int): Hidden dimension for LSTM cell for mean. + num_layers (int): Number of recurrent layers. + hidden_mlp_nonlinearity (Callable): Activation function for + intermediate dense layer(s). It should return a torch.Tensor. Set + it to None to maintain a linear activation. + hidden_mlp_sizes (list[int]): Output dimension of dense layer(s) for + the MLP for mean. For example, (32, 32) means the MLP consists + of two hidden layers, each with 32 hidden units. + hidden_mlp_w_init (Callable): Initializer function for the weight + of intermediate dense layer(s). Should modify a torch.Tensor. + hidden_mlp_b_init (Callable): Initializer function for the bias + of intermediate dense layer(s). Should modify a torch.Tensor. + output_nonlinearity (callable): Activation function for output dense + layer. It should return a torch.Tensor. Set it to None to + maintain a linear activation. 
+ output_w_init (Callable): Initializer function for the weight + of output dense layer(s). Should modify a torch.Tensor. + output_b_init (Callable): Initializer function for the bias + of output dense layer(s). Should modify a torch.Tensor. + recurrent_w_init (Callable): Initializer function for the weight + of recurrent layer(s). Should modify a torch.Tensor. + hidden_state_init (Callable): Initializer function for the + initial hidden state. Should modify a torch.Tensor. + hidden_state_init_trainable (bool): Bool for whether the initial + hidden state is trainable. + cell_state_init (Callable): Initializer function for the + initial cell state. Should modify a torch.Tensor. + cell_state_init_trainable (bool): Bool for whether the initial + cell state is trainable. + learn_std (bool): Is std trainable. + init_std (float): Initial value for std. + min_std (float): Minimum value for std. + max_std (float): Maximum value for std. + std_parameterization (str): How the std should be parametrized. There + are two options: + - exp: the logarithm of the std will be stored, and applied a + exponential transformation + - softplus: the std will be computed as log(1+exp(x)) + layer_normalization (bool): Bool for using layer normalization or not. + std_parameterization (str): How the std should be parametrized. There + are two options: + - exp: the logarithm of the std will be stored, and applied a + exponential transformation. + - softplus: the std will be computed as log(1+exp(x)). + normal_distribution_cls (torch.distribution): normal distribution class + to be constructed and returned by a call to forward. By default, is + `torch.distributions.Normal`. + + """ + + def __init__(self, + env_spec, + *, + name='GaussianLSTMPolicy', + hidden_size=32, + num_layers=2, + cell_state_init_trainable=True, + cell_state_init=None, + hidden_state_init=None, + hidden_state_init_trainable=True, + recurrent_w_init=None, + hidden_mlp_sizes=(32, ), + hidden_mlp_w_init=nn.init.xavier_uniform_, + hidden_mlp_b_init=nn.init.zeros_, + hidden_mlp_nonlinearity=torch.tanh, + output_nonlinearity=None, + output_w_init=nn.init.xavier_uniform_, + output_b_init=nn.init.zeros_, + learn_std=True, + init_std=1.0, + min_std=1e-6, + max_std=None, + std_parameterization='exp', + layer_normalization=False, + normal_distribution_cls=Normal): + super().__init__(env_spec, name) + + if std_parameterization not in ('exp', 'softplus'): + raise NotImplementedError + + self._obs_dim = env_spec.observation_space.flat_dim + self._action_dim = env_spec.action_space.flat_dim + self._hidden_size = hidden_size + self._num_layers = num_layers + h0 = torch.zeros(num_layers, hidden_size) + c0 = torch.zeros(num_layers, hidden_size) + if cell_state_init is not None: + hidden_state_init(h0) + cell_state_init(c0) + if cell_state_init_trainable: + self._c0 = nn.Parameter(c0) + else: + self._c0 = c0 + self.register_buffer('_c0', self._c0) + if hidden_state_init_trainable: + self._h0 = nn.Parameter(h0) + else: + self._h0 = h0 + self.register_buffer('_h0', self._h0) + self._rnn = nn.LSTM(input_size=self._obs_dim, + hidden_size=hidden_size, + batch_first=False, + num_layers=num_layers) + if recurrent_w_init is not None: + recurrent_w_init(self._rnn) + self._mlp = GaussianMLPModule( + input_dim=hidden_size, + output_dim=self._action_dim, + hidden_sizes=hidden_mlp_sizes, + hidden_nonlinearity=hidden_mlp_nonlinearity, + hidden_w_init=hidden_mlp_w_init, + hidden_b_init=hidden_mlp_b_init, + output_nonlinearity=output_nonlinearity, + output_w_init=output_w_init, + 
output_b_init=output_b_init, + learn_std=learn_std, + init_std=init_std, + min_std=min_std, + max_std=max_std, + std_parameterization=std_parameterization, + layer_normalization=layer_normalization, + normal_distribution_cls=normal_distribution_cls) + self._state = None + + def _new_state(self, n_envs): + """Compute a new state for running n_envs in parallel. + + Args: + n_envs (int): Number of observations and actions each call to + get_actions(). + + Returns: + tuple[torch.Tensor, torch.Tensor]: Tensor containing h0 and c0 with + "batch_dim" repeated n_envs times. + + """ + h0 = self._h0.unsqueeze(1).expand( + (self._num_layers, n_envs, self._hidden_size)) + c0 = self._c0.unsqueeze(1).expand( + (self._num_layers, n_envs, self._hidden_size)) + return h0, c0 + + def reset(self, do_resets=None): + """Reset the policy. + + This is effective only to recurrent policies. + + do_resets is an array of boolean indicating + which internal states to be reset. The length of do_resets should be + equal to the length of inputs, i.e. batch size. + + Args: + do_resets (numpy.ndarray): Bool array indicating which states + to be reset. + + """ + if do_resets is None: + do_resets = [True] + h0, c0 = self._new_state(len(do_resets)) + if all(do_resets): + self._state = (h0, c0) + for i, do_reset in enumerate(do_resets): + if do_reset: + # Reset all layer's state + self._state[0][:, i] = h0[:, i] + self._state[1][:, i] = c0[:, i] + + def forward(self, observations): + """Compute the action distributions from the observations. + + Args: + observations (torch.Tensor): Batch of observations on default + torch device. + + Raises: + ValueError: If observations is not consistent with reset(). + ShuffledOptimizationNotSupported: If passed a shuffled + ObservationBatch or a tensor that is not an ObservationBatch. + + Returns: + torch.distributions.Distribution: Batch distribution of actions. + dict[str, torch.Tensor]: Additional agent_info, as torch Tensors + + """ + if not isinstance(observations, ObservationBatch): + raise ShuffledOptimizationNotSupported( + f'observations are of type {type(observations)!r}, but should ' + f'be an ObservationBatch') + if observations.order == ObservationOrder.LAST: + if self._state is None: + raise ValueError('get_action() called before reset()') + if self._state[0].shape[1] != len(observations): + raise ValueError(f'observations has length ' + f'{len(observations)} but should have length ' + f'{len(self._state[0])} to match the length ' + f'of do_resets in reset()') + # Add sequence dimension. + rnn_out, self._state = self._rnn(observations.unsqueeze(0), + self._state) + else: + sequence = observation_batch_to_packed_sequence(observations) + n_episodes = len(observations.lengths) + start = self._new_state(n_episodes) + rnn_out, _ = self._rnn(sequence, start) + # Remove sequence dimension. 
+ dist = self._mlp(rnn_out.squeeze(0)) + return (dist, dict(mean=dist.mean, log_std=(dist.variance**.5).log())) diff --git a/tests/garage/torch/policies/test_gaussian_lstm_policy.py b/tests/garage/torch/policies/test_gaussian_lstm_policy.py new file mode 100644 index 0000000000..989bdf617f --- /dev/null +++ b/tests/garage/torch/policies/test_gaussian_lstm_policy.py @@ -0,0 +1,67 @@ +import pickle + +import numpy as np +import pytest + +from garage.envs import GymEnv +from garage.torch.policies import GaussianLSTMPolicy + +# yapf: disable +from tests.fixtures.envs.dummy import DummyBoxEnv, DummyDictEnv + +# yapf: enable + + +def test_get_action_dict_space(): + env = GymEnv(DummyDictEnv(obs_space_type='box', act_space_type='box')) + policy = GaussianLSTMPolicy(env_spec=env.spec, hidden_size=4) + policy.reset(do_resets=None) + obs = env.reset()[0] + + action, _ = policy.get_action(obs) + assert env.action_space.contains(action) + + policy.reset(do_resets=[True, True]) + + actions, _ = policy.get_actions([obs, obs]) + for action in actions: + assert env.action_space.contains(action) + + +# yapf: disable +@pytest.mark.parametrize('obs_dim, action_dim, hidden_size', [ + ((1, ), (1, ), 4), + ((2, ), (2, ), 4), + ((1, 1), (1, ), 4), + ((2, 2), (2, ), 4) +]) +# yapf: enable +def test_get_action(obs_dim, action_dim, hidden_size): + env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)) + policy = GaussianLSTMPolicy(env_spec=env.spec, hidden_size=hidden_size) + + policy.reset() + obs = env.reset()[0] + + action, _ = policy.get_action(obs.flatten()) + assert env.action_space.contains(action) + + actions, _ = policy.get_actions([obs.flatten()]) + for action in actions: + assert env.action_space.contains(action) + + +# pylint: disable=no-member +def test_is_pickleable(): + env = GymEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, ))) + policy = GaussianLSTMPolicy(env_spec=env.spec) + policy.reset() + obs = env.reset()[0] + _, info = policy.get_action(obs) + + p = pickle.dumps(policy) + policy_pickled = pickle.loads(p) + policy_pickled.reset() + _, info2 = policy_pickled.get_action(obs) + assert np.array_equal(info['mean'], info2['mean']) + assert np.array_equal(info['log_std'], info2['log_std']) From df3a137c6bcceb68621b66bc10624c928fd8ae7a Mon Sep 17 00:00:00 2001 From: "K.R. 
Zentner" Date: Wed, 16 Dec 2020 15:50:51 -0800 Subject: [PATCH 5/6] Torch VPG rework --- .../experiments/algos/ppo_garage_pytorch.py | 18 +- src/garage/torch/__init__.py | 6 +- src/garage/torch/_dtypes.py | 4 +- src/garage/torch/_functions.py | 55 ++++ src/garage/torch/algos/maml_ppo.py | 8 +- src/garage/torch/algos/maml_trpo.py | 8 +- src/garage/torch/algos/maml_vpg.py | 8 +- src/garage/torch/algos/ppo.py | 10 +- src/garage/torch/algos/td3.py | 2 +- src/garage/torch/algos/trpo.py | 34 +- src/garage/torch/algos/vpg.py | 309 +++++++++--------- src/garage/torch/optimizers/__init__.py | 9 +- .../conjugate_gradient_optimizer.py | 12 +- .../torch/optimizers/optimizer_wrapper.py | 63 ---- .../gaussian_mlp_value_function.py | 6 +- .../torch/value_functions/value_function.py | 4 +- tests/garage/torch/test_functions.py | 10 + 17 files changed, 289 insertions(+), 277 deletions(-) delete mode 100644 src/garage/torch/optimizers/optimizer_wrapper.py diff --git a/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py b/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py index dc205c3562..666afa1732 100644 --- a/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py +++ b/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py @@ -6,7 +6,7 @@ from garage.experiment import deterministic from garage.sampler import RaySampler from garage.torch.algos import PPO as PyTorch_PPO -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer from garage.torch.policies import GaussianMLPPolicy as PyTorch_GMP from garage.torch.value_functions import GaussianMLPValueFunction from garage.trainer import Trainer @@ -45,15 +45,15 @@ def ppo_garage_pytorch(ctxt, env_id, seed): hidden_nonlinearity=torch.tanh, output_nonlinearity=None) - policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), - policy, - max_optimization_epochs=10, - minibatch_size=64) + policy_optimizer = MinibatchOptimizer((torch.optim.Adam, dict(lr=2.5e-4)), + policy, + max_optimization_epochs=10, + minibatch_size=64) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), - value_function, - max_optimization_epochs=10, - minibatch_size=64) + vf_optimizer = MinibatchOptimizer((torch.optim.Adam, dict(lr=2.5e-4)), + value_function, + max_optimization_epochs=10, + minibatch_size=64) sampler = RaySampler(agents=policy, envs=env, diff --git a/src/garage/torch/__init__.py b/src/garage/torch/__init__.py index 71b119b7dd..8ac1bf783e 100644 --- a/src/garage/torch/__init__.py +++ b/src/garage/torch/__init__.py @@ -3,8 +3,9 @@ from garage.torch._dtypes import (ObservationBatch, ObservationOrder, ShuffledOptimizationNotSupported, observation_batch_to_packed_sequence) -from garage.torch._functions import (as_torch_dict, compute_advantages, - expand_var, filter_valids, flatten_batch, +from garage.torch._functions import (as_tensor, as_torch_dict, + compute_advantages, expand_var, + filter_valids, flatten_batch, flatten_to_single_vector, global_device, NonLinearity, np_to_torch, output_height_2d, output_width_2d, @@ -18,6 +19,7 @@ __all__ = [ 'NonLinearity', 'as_torch_dict', + 'as_tensor', 'compute_advantages', 'expand_var', 'filter_valids', diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py index f0be030ce4..3421bf8df3 100644 --- a/src/garage/torch/_dtypes.py +++ b/src/garage/torch/_dtypes.py @@ -79,12 +79,12 @@ def __new__(cls, observations, order, lengths=None): f'lengths has dtype 
{self.lengths.dtype}, but must have ' f'an integer dtype') total_size = sum(self.lengths) - if self.observations.shape[0] != total_size: + if self.shape[0] != total_size: raise ValueError( f'observations has batch size ' f'{self.observations.shape[0]}, but must have batch ' f'size {total_size} to match lengths') - assert self.observations.shape[0] == total_size + assert self.shape[0] == total_size elif self.lengths is not None: raise ValueError( f'lengths has value {self.lengths}, but must be None ' diff --git a/src/garage/torch/_functions.py b/src/garage/torch/_functions.py index 2c834ebfa6..1319c20da0 100644 --- a/src/garage/torch/_functions.py +++ b/src/garage/torch/_functions.py @@ -106,6 +106,48 @@ def compute_advantages(discount, gae_lambda, max_episode_length, baselines, return advantages +def discount_cumsum(x, discount): + discount_x = torch.full((len(x), ), + discount, + dtype=torch.float, + device=x.device) + discount_x[0] = 1.0 + filter = torch.cumprod(discount_x, dim=0) + pad = len(x) - 1 + # minibatch of 1, with 1 channel + filter = filter.reshape(1, 1, -1) + returns = F.conv1d(x.reshape(1, 1, -1), filter, stride=1, padding=pad) + returns = returns[0, 0, pad:] + return returns + + +def split_packed_tensor(t, lengths): + """Split a tensor using a sequence of (start, stop) tuples.""" + start = 0 + for length in lengths: + stop = start + length + yield t[start:stop] + start = stop + + +def pad_packed_tensor(t, lengths, max_length=None): + if max_length is None: + max_length = max(lengths) + if max(lengths) > max_length: + raise ValueError(f'packed tensor contains a sequence of length ' + f'{max(lengths)}, but was asked to pad to ' + f'length {max_length}') + out = torch.zeros(( + len(lengths), + max_length, + ) + t.shape[1:], + dtype=t.dtype, + device=t.device) + for i, seq in enumerate(split_packed_tensor(t, lengths)): + out[i][:len(seq)] = seq + return out + + def pad_to_last(nums, total_length, axis=-1, val=0): """Pad val to last in nums in given axis. @@ -383,6 +425,19 @@ def state_dict_to(state_dict, device): return state_dict +def as_tensor(data): + """Convert a list to a PyTorch tensor + + Args: + data (list): Data to convert to tensor + + Returns: + torch.Tensor: A float tensor + + """ + return torch.as_tensor(data, dtype=torch.float32, device=global_device()) + + # pylint: disable=W0223 class NonLinearity(nn.Module): """Wrapper class for non linear function or module. 
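To make the new helpers concrete, a small sketch of how the packed-tensor utilities and the torch discount_cumsum added above fit together (imports from garage.torch._functions mirror how vpg.py uses them; the numbers are placeholders):

import torch

from garage.torch import as_tensor
from garage.torch._functions import (discount_cumsum, pad_packed_tensor,
                                     split_packed_tensor)

lengths = [2, 3]
# Two episodes' rewards packed into one flat tensor of shape (5,).
rewards = as_tensor([1., 1., 1., 1., 1.])
returns = torch.cat([discount_cumsum(r, 0.9)
                     for r in split_packed_tensor(rewards, lengths)])
# returns is [1.9, 1.0, 2.71, 1.9, 1.0] -- per-episode discounted returns.
padded = pad_packed_tensor(rewards, lengths)  # shape (2, 3), zero-padded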
diff --git a/src/garage/torch/algos/maml_ppo.py b/src/garage/torch/algos/maml_ppo.py index 93e4e76145..b7627247fa 100644 --- a/src/garage/torch/algos/maml_ppo.py +++ b/src/garage/torch/algos/maml_ppo.py @@ -4,7 +4,7 @@ from garage import _Default from garage.torch.algos import PPO from garage.torch.algos.maml import MAML -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class MAMLPPO(MAML): @@ -70,10 +70,10 @@ def __init__(self, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = PPO(env.spec, policy, diff --git a/src/garage/torch/algos/maml_trpo.py b/src/garage/torch/algos/maml_trpo.py index b4236b4cba..f19a29a817 100644 --- a/src/garage/torch/algos/maml_trpo.py +++ b/src/garage/torch/algos/maml_trpo.py @@ -5,7 +5,7 @@ from garage.torch.algos import VPG from garage.torch.algos.maml import MAML from garage.torch.optimizers import (ConjugateGradientOptimizer, - OptimizerWrapper) + MinibatchOptimizer) class MAMLTRPO(MAML): @@ -71,10 +71,10 @@ def __init__(self, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = VPG(env.spec, policy, diff --git a/src/garage/torch/algos/maml_vpg.py b/src/garage/torch/algos/maml_vpg.py index cf32d8e6d5..ffb61a2e0f 100644 --- a/src/garage/torch/algos/maml_vpg.py +++ b/src/garage/torch/algos/maml_vpg.py @@ -4,7 +4,7 @@ from garage import _Default from garage.torch.algos import VPG from garage.torch.algos.maml import MAML -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class MAMLVPG(MAML): @@ -66,10 +66,10 @@ def __init__(self, num_grad_updates=1, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = VPG(env.spec, policy, diff --git a/src/garage/torch/algos/ppo.py b/src/garage/torch/algos/ppo.py index 73668ac810..f3fc115daa 100644 --- a/src/garage/torch/algos/ppo.py +++ b/src/garage/torch/algos/ppo.py @@ -2,7 +2,7 @@ import torch from garage.torch.algos import VPG -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class PPO(VPG): @@ -14,9 +14,9 @@ class PPO(VPG): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. 
lr_clip_range (float): The limit on the likelihood ratio between policies. @@ -63,13 +63,13 @@ def __init__(self, entropy_method='no_entropy'): if policy_optimizer is None: - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), policy, max_optimization_epochs=10, minibatch_size=64) if vf_optimizer is None: - vf_optimizer = OptimizerWrapper( + vf_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, diff --git a/src/garage/torch/algos/td3.py b/src/garage/torch/algos/td3.py index edb10083ff..ca833f41bb 100644 --- a/src/garage/torch/algos/td3.py +++ b/src/garage/torch/algos/td3.py @@ -84,9 +84,9 @@ def __init__( replay_buffer, sampler, *, # Everything after this is numbers. - max_episode_length_eval=None, grad_steps_per_env_step, exploration_policy, + max_episode_length_eval=None, uniform_random_policy=None, max_action=None, target_update_tau=0.005, diff --git a/src/garage/torch/algos/trpo.py b/src/garage/torch/algos/trpo.py index c2becfc1c9..a9fe5939aa 100644 --- a/src/garage/torch/algos/trpo.py +++ b/src/garage/torch/algos/trpo.py @@ -4,7 +4,7 @@ from garage.torch._functions import zero_optim_grads from garage.torch.algos import VPG from garage.torch.optimizers import (ConjugateGradientOptimizer, - OptimizerWrapper) + MinibatchOptimizer) class TRPO(VPG): @@ -16,9 +16,9 @@ class TRPO(VPG): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. num_train_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. @@ -62,11 +62,11 @@ def __init__(self, entropy_method='no_entropy'): if policy_optimizer is None: - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (ConjugateGradientOptimizer, dict(max_constraint_value=0.01)), policy) if vf_optimizer is None: - vf_optimizer = OptimizerWrapper( + vf_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, @@ -117,7 +117,8 @@ def _compute_objective(self, advantages, obs, actions, rewards): return surrogate - def _train_policy(self, obs, actions, rewards, advantages): + def _train_policy(self, observations, actions, rewards, advantages, + lengths): r"""Train the policy. Args: @@ -129,18 +130,19 @@ def _train_policy(self, obs, actions, rewards, advantages): with shape :math:`(N, )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of policy loss (float). 
""" - # pylint: disable=protected-access - zero_optim_grads(self._policy_optimizer._optimizer) - loss = self._compute_loss_with_adv(obs, actions, rewards, advantages) - loss.backward() - self._policy_optimizer.step( - f_loss=lambda: self._compute_loss_with_adv(obs, actions, rewards, - advantages), - f_constraint=lambda: self._compute_kl_constraint(obs)) - - return loss + data = { + 'observations': observations, + 'actions': actions, + 'rewards': rewards, + 'advantages': advantages, + 'lengths': lengths + } + f_constraint = lambda: self._compute_kl_constraint(observations) + return self._policy_optimizer.step(data, self._loss_function, + f_constraint) diff --git a/src/garage/torch/algos/vpg.py b/src/garage/torch/algos/vpg.py index 42a75444fb..3586b322f4 100644 --- a/src/garage/torch/algos/vpg.py +++ b/src/garage/torch/algos/vpg.py @@ -10,9 +10,11 @@ from garage import log_performance from garage.np import discount_cumsum from garage.np.algos import RLAlgorithm -from garage.torch import compute_advantages, filter_valids -from garage.torch._functions import np_to_torch, zero_optim_grads -from garage.torch.optimizers import OptimizerWrapper +from garage.torch import (as_tensor, compute_advantages, filter_valids, + global_device, ObservationBatch, ObservationOrder) +from garage.torch._functions import (np_to_torch, pad_packed_tensor, + split_packed_tensor) +from garage.torch.optimizers import MinibatchOptimizer class VPG(RLAlgorithm): @@ -26,11 +28,11 @@ class VPG(RLAlgorithm): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. - num_train_per_epoch (int): Number of train_once calls per epoch. + steps_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. gae_lambda (float): Lambda used for generalized advantage estimation. @@ -42,6 +44,8 @@ class VPG(RLAlgorithm): standardized before shifting. policy_ent_coeff (float): The coefficient of the policy entropy. Setting it to zero would mean no entropy regularization. + use_neg_logli_entropy (bool): Whether to estimate the entropy as the + negative log likelihood of the action. use_softplus_entropy (bool): Whether to estimate the softmax distribution of the entropy to prevent the entropy from being negative. 
@@ -62,15 +66,17 @@ def __init__( sampler, policy_optimizer=None, vf_optimizer=None, - num_train_per_epoch=1, + steps_per_epoch=1, discount=0.99, gae_lambda=1, center_adv=True, positive_adv=False, policy_ent_coeff=0.0, + use_neg_logli_entropy=True, use_softplus_entropy=False, - stop_entropy_gradient=False, + stop_entropy_gradient=True, entropy_method='no_entropy', + recurrent=None, ): self._discount = discount self.policy = policy @@ -83,8 +89,9 @@ def __init__( self._policy_ent_coeff = policy_ent_coeff self._use_softplus_entropy = use_softplus_entropy self._stop_entropy_gradient = stop_entropy_gradient + self._use_neg_logli_entropy = use_neg_logli_entropy self._entropy_method = entropy_method - self._n_samples = num_train_per_epoch + self._steps_per_epoch = steps_per_epoch self._env_spec = env_spec self._maximum_entropy = (entropy_method == 'max') @@ -93,19 +100,21 @@ def __init__( stop_entropy_gradient, policy_ent_coeff) self._episode_reward_mean = collections.deque(maxlen=100) - self._sampler = sampler + self.sampler = sampler if policy_optimizer: self._policy_optimizer = policy_optimizer else: - self._policy_optimizer = OptimizerWrapper(torch.optim.Adam, policy) + self._policy_optimizer = MinibatchOptimizer( + torch.optim.Adam, policy) if vf_optimizer: self._vf_optimizer = vf_optimizer else: - self._vf_optimizer = OptimizerWrapper(torch.optim.Adam, - value_function) + self._vf_optimizer = MinibatchOptimizer(torch.optim.Adam, + value_function) self._old_policy = copy.deepcopy(self.policy) + self._recurrent = recurrent @staticmethod def _check_entropy_configuration(entropy_method, center_adv, @@ -134,77 +143,71 @@ def discount(self): """ return self._discount - def _train_once(self, itr, eps): + def _train_once(self, eps): """Train the algorithm once. Args: - itr (int): Iteration number. eps (EpisodeBatch): A batch of collected paths. - Returns: - numpy.float64: Calculated mean value of undiscounted returns. - """ - obs = np_to_torch(eps.padded_observations) - rewards = np_to_torch(eps.padded_rewards) - returns = np_to_torch( - np.stack([ - discount_cumsum(reward, self.discount) - for reward in eps.padded_rewards - ])) - valids = eps.lengths - with torch.no_grad(): - baselines = self._value_function(obs) - + # Conver to torch and compute returns, etc. 
+ lengths = torch.from_numpy(eps.lengths).to(global_device()) + obs = ObservationBatch(as_tensor(eps.observations), + order=ObservationOrder.EPISODES, + lengths=lengths) + actions = torch.Tensor(eps.actions) + rewards = torch.Tensor(eps.rewards) + policy_entropies = self._compute_policy_entropy(obs, actions) if self._maximum_entropy: - policy_entropies = self._compute_policy_entropy(obs) rewards += self._policy_ent_coeff * policy_entropies + returns = torch.hstack([ + discount_cumsum(r, self._discount) + for r in split_packed_tensor(rewards, lengths) + ]) + with torch.no_grad(): + baselines = self._value_function(obs) + advantages = self._compute_advantage(rewards, lengths, baselines) - obs_flat = np_to_torch(eps.observations) - actions_flat = np_to_torch(eps.actions) - rewards_flat = np_to_torch(eps.rewards) - returns_flat = torch.cat(filter_valids(returns, valids)) - advs_flat = self._compute_advantage(rewards, valids, baselines) - + # Log before training with torch.no_grad(): - policy_loss_before = self._compute_loss_with_adv( - obs_flat, actions_flat, rewards_flat, advs_flat) - vf_loss_before = self._value_function.compute_loss( - obs_flat, returns_flat) - kl_before = self._compute_kl_constraint(obs) + policy_loss_before = self._loss_function(obs, actions, rewards, + advantages, lengths) + vf_loss_before = self._value_function.loss_function(obs, returns) - self._train(obs_flat, actions_flat, rewards_flat, returns_flat, - advs_flat) + with tabular.prefix(self.policy.name): + tabular.record('/LossBefore', policy_loss_before.item()) + tabular.record('/KLBefore', + self._compute_kl_constraint(obs).item()) + tabular.record('/EntropyBefore', + policy_entropies.mean().item()) - with torch.no_grad(): - policy_loss_after = self._compute_loss_with_adv( - obs_flat, actions_flat, rewards_flat, advs_flat) - vf_loss_after = self._value_function.compute_loss( - obs_flat, returns_flat) - kl_after = self._compute_kl_constraint(obs) - policy_entropy = self._compute_policy_entropy(obs) - - with tabular.prefix(self.policy.name): - tabular.record('/LossBefore', policy_loss_before.item()) - tabular.record('/LossAfter', policy_loss_after.item()) - tabular.record('/dLoss', - (policy_loss_before - policy_loss_after).item()) - tabular.record('/KLBefore', kl_before.item()) - tabular.record('/KL', kl_after.item()) - tabular.record('/Entropy', policy_entropy.mean().item()) - - with tabular.prefix(self._value_function.name): - tabular.record('/LossBefore', vf_loss_before.item()) - tabular.record('/LossAfter', vf_loss_after.item()) - tabular.record('/dLoss', - vf_loss_before.item() - vf_loss_after.item()) + with tabular.prefix(self._value_function.name): + tabular.record('/LossBefore', vf_loss_before.item()) + # Save the current policy state and train self._old_policy.load_state_dict(self.policy.state_dict()) + self._train_policy(obs, actions, rewards, returns, advantages, lengths) + self._train_value_function(obs, returns, lengths) + + # Log after training + with torch.no_grad(): - undiscounted_returns = log_performance(itr, - eps, - discount=self._discount) - return np.mean(undiscounted_returns) + policy_loss_after = self._loss_function(obs, actions, rewards, + advantages, lengths) + with tabular.prefix(self.policy.name): + tabular.record('/LossAfter', policy_loss_after.item()) + tabular.record('/dLoss', + (policy_loss_before - policy_loss_after).item()) + tabular.record('/KL', self._compute_kl_constraint(obs).item()) + tabular.record( + '/EntropyAfter', + self._compute_policy_entropy(obs, actions).mean().item()) 
+ + vf_loss_after = self._value_function.loss_function(obs, returns) + with tabular.prefix(self._value_function.name): + tabular.record('/vfLossAfter', vf_loss_after.item()) + tabular.record('/dLoss', + vf_loss_before.item() - vf_loss_after.item()) def train(self, trainer): """Obtain samplers and start actual training for each epoch. @@ -220,84 +223,73 @@ def train(self, trainer): """ last_return = None - for _ in trainer.step_epochs(): - for _ in range(self._n_samples): - eps = trainer.obtain_episodes(trainer.step_itr) - last_return = self._train_once(trainer.step_itr, eps) - trainer.step_itr += 1 - + for epoch in trainer.step_epochs(): + for _ in range(self._steps_per_epoch): + trainer.step_path = trainer.obtain_episodes(epoch) + self._train_once(trainer.step_path) + last_return = np.mean( + log_performance(epoch, + trainer.step_path, + discount=self._discount)) return last_return - def _train(self, obs, actions, rewards, returns, advs): - r"""Train the policy and value function with minibatch. - - Args: - obs (torch.Tensor): Observation from the environment with shape - :math:`(N, O*)`. - actions (torch.Tensor): Actions fed to the environment with shape - :math:`(N, A*)`. - rewards (torch.Tensor): Acquired rewards with shape :math:`(N, )`. - returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. - advs (torch.Tensor): Advantage value at each step with shape - :math:`(N, )`. - - """ - for dataset in self._policy_optimizer.get_minibatch( - obs, actions, rewards, advs): - self._train_policy(*dataset) - for dataset in self._vf_optimizer.get_minibatch(obs, returns): - self._train_value_function(*dataset) - - def _train_policy(self, obs, actions, rewards, advantages): + def _train_policy(self, observations, actions, rewards, returns, + advantages, lengths): r"""Train the policy. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N, A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N, )`. + returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of policy loss (float). """ - # pylint: disable=protected-access - zero_optim_grads(self._policy_optimizer._optimizer) - loss = self._compute_loss_with_adv(obs, actions, rewards, advantages) - loss.backward() - self._policy_optimizer.step() - - return loss - - def _train_value_function(self, obs, returns): + data = { + 'observations': observations, + 'actions': actions, + 'rewards': rewards, + 'advantages': advantages, + 'lengths': lengths + } + return self._policy_optimizer.step(data, self._loss_function) + + def _train_value_function(self, observations, returns, lengths): r"""Train the value function. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of value function loss (float). 
""" - # pylint: disable=protected-access - zero_optim_grads(self._vf_optimizer._optimizer) - loss = self._value_function.compute_loss(obs, returns) - loss.backward() - self._vf_optimizer.step() - - return loss - - def _compute_loss(self, obs, actions, rewards, valids, baselines): + data = { + 'observations': observations, + 'returns': returns, + 'lengths': lengths + } + return self._vf_optimizer.step(data, + self._value_function.loss_function) + + def _compute_loss(self, obs, actions, rewards, lengths, baselines): r"""Compute mean value of loss. + Note that this function is private, but used by MAML. + Notes: P is the maximum episode length (self.max_episode_length) Args: @@ -307,7 +299,7 @@ def _compute_loss(self, obs, actions, rewards, valids, baselines): with shape :math:`(N, P, A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N, P)`. - valids (list[int]): Numbers of valid steps in each episode + lengths (list[int]): Numbers of valid steps in each episode baselines (torch.Tensor): Value function estimation at each step with shape :math:`(N, P)`. @@ -316,19 +308,24 @@ def _compute_loss(self, obs, actions, rewards, valids, baselines): objective (float). """ - obs_flat = torch.cat(filter_valids(obs, valids)) - actions_flat = torch.cat(filter_valids(actions, valids)) - rewards_flat = torch.cat(filter_valids(rewards, valids)) - advantages_flat = self._compute_advantage(rewards, valids, baselines) - - return self._compute_loss_with_adv(obs_flat, actions_flat, - rewards_flat, advantages_flat) - - def _compute_loss_with_adv(self, obs, actions, rewards, advantages): + obs_flat = torch.cat(filter_valids(obs, lengths)) + actions_flat = torch.cat(filter_valids(actions, lengths)) + rewards_flat = torch.cat(filter_valids(rewards, lengths)) + advantages_flat = self._compute_advantage(rewards, lengths, baselines) + + return self._loss_function(obs_flat, actions_flat, rewards_flat, + advantages_flat, lengths) + + def _loss_function(self, + observations, + actions, + rewards, + advantages, + lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N \dot [T], A*)`. @@ -336,50 +333,56 @@ def _compute_loss_with_adv(self, obs, actions, rewards, advantages): with shape :math:`(N \dot [T], )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N \dot [T], )`. + lengths (torch.Tensor or None): Lengths of episodes, if operating + on full episodes. Returns: torch.Tensor: Calculated negative mean scalar value of objective. """ - objectives = self._compute_objective(advantages, obs, actions, rewards) + objectives = self._compute_objective(advantages, observations, actions, + rewards) if self._entropy_regularzied: - policy_entropies = self._compute_policy_entropy(obs) + policy_entropies = self._compute_policy_entropy( + observations, actions) objectives += self._policy_ent_coeff * policy_entropies return -objectives.mean() - def _compute_advantage(self, rewards, valids, baselines): + def _compute_advantage(self, rewards, lengths, baselines): r"""Compute mean value of loss. - Notes: P is the maximum episode length (self.max_episode_length) - Args: - rewards (torch.Tensor): Acquired rewards - with shape :math:`(N, P)`. 
- valids (list[int]): Numbers of valid steps in each episode - baselines (torch.Tensor): Value function estimation at each step - with shape :math:`(N, P)`. + rewards (torch.Tensor): Packed acquired rewards + with shape :math:`(N \bullet [T])`. + lengths (list[int]): Numbers of valid steps in each episode + baselines (torch.Tensor): Packed value function estimation of + returns with shape :math:`(N \bullet [T])`. Returns: torch.Tensor: Calculated advantage values given rewards and baselines with shape :math:`(N \dot [T], )`. """ - advantages = compute_advantages(self._discount, self._gae_lambda, - self.max_episode_length, baselines, - rewards) - advantage_flat = torch.cat(filter_valids(advantages, valids)) + padded_rewards = pad_packed_tensor(rewards, lengths) + padded_baselines = pad_packed_tensor(baselines, lengths) + padded_advantages = compute_advantages(self._discount, + self._gae_lambda, + self.max_episode_length, + padded_baselines, + padded_rewards) + advantages = torch.cat(filter_valids(padded_advantages, lengths)) if self._center_adv: - means = advantage_flat.mean() - variance = advantage_flat.var() - advantage_flat = (advantage_flat - means) / (variance + 1e-8) + means = advantages.mean() + variance = advantages.var() + advantages = (advantages - means) / (variance + 1e-8) if self._positive_adv: - advantage_flat -= advantage_flat.min() + advantages -= advantages.min() - return advantage_flat + return advantages def _compute_kl_constraint(self, obs): r"""Compute KL divergence. @@ -408,25 +411,27 @@ def _compute_kl_constraint(self, obs): return kl_constraint.mean() - def _compute_policy_entropy(self, obs): + def _compute_policy_entropy(self, obs, actions): r"""Compute entropy value of probability distribution. Notes: P is the maximum episode length (self.max_episode_length) Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, P, O*)`. + actions (torch.Tensor): Actions fed to the environment + with shape :math:`(N \dot [T], A*)`. Returns: torch.Tensor: Calculated entropy values given observation with shape :math:`(N, P)`. 
""" - if self._stop_entropy_gradient: - with torch.no_grad(): + with torch.set_grad_enabled(not self._stop_entropy_gradient): + if self._use_neg_logli_entropy: + policy_entropy = -self.policy(obs)[0].log_prob(actions) + else: policy_entropy = self.policy(obs)[0].entropy() - else: - policy_entropy = self.policy(obs)[0].entropy() # This prevents entropy from becoming negative for small policy std if self._use_softplus_entropy: diff --git a/src/garage/torch/optimizers/__init__.py b/src/garage/torch/optimizers/__init__.py index bc21022af4..bf9dec1f12 100644 --- a/src/garage/torch/optimizers/__init__.py +++ b/src/garage/torch/optimizers/__init__.py @@ -2,8 +2,9 @@ from garage.torch.optimizers.conjugate_gradient_optimizer import ( ConjugateGradientOptimizer) from garage.torch.optimizers.differentiable_sgd import DifferentiableSGD -from garage.torch.optimizers.optimizer_wrapper import OptimizerWrapper +from garage.torch.optimizers.episode_batch_optimizer import ( + EpisodeBatchOptimizer) +from garage.torch.optimizers.minibatch_optimizer import MinibatchOptimizer +from garage.torch.optimizers.optimizer import Optimizer -__all__ = [ - 'OptimizerWrapper', 'ConjugateGradientOptimizer', 'DifferentiableSGD' -] +__all__ = ['Optimizer', 'ConjugateGradientOptimizer', 'DifferentiableSGD'] diff --git a/src/garage/torch/optimizers/conjugate_gradient_optimizer.py b/src/garage/torch/optimizers/conjugate_gradient_optimizer.py index 489587e672..0157d2c908 100644 --- a/src/garage/torch/optimizers/conjugate_gradient_optimizer.py +++ b/src/garage/torch/optimizers/conjugate_gradient_optimizer.py @@ -138,12 +138,12 @@ def __init__(self, self._hvp_reg_coeff = hvp_reg_coeff self._accept_violation = accept_violation - def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ + def step(self, loss_function, constraint_function): # pylint: disable=arguments-differ """Take an optimization step. Args: - f_loss (callable): Function to compute the loss. - f_constraint (callable): Function to compute the constraint value. + loss_function (callable): Function to compute the loss. + constraint_function (callable): Function to compute the constraint value. """ # Collect trainable parameters and gradients @@ -157,7 +157,7 @@ def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ flat_loss_grads = torch.cat(grads) # Build Hessian-vector-product function - f_Ax = _build_hessian_vector_product(f_constraint, params, + f_Ax = _build_hessian_vector_product(constraint_function, params, self._hvp_reg_coeff) # Compute step direction @@ -177,8 +177,8 @@ def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ descent_step = step_size * step_dir # Update parameters using backtracking line search - self._backtracking_line_search(params, descent_step, f_loss, - f_constraint) + self._backtracking_line_search(params, descent_step, loss_function, + constraint_function) @property def state(self): diff --git a/src/garage/torch/optimizers/optimizer_wrapper.py b/src/garage/torch/optimizers/optimizer_wrapper.py deleted file mode 100644 index 9f69ce565d..0000000000 --- a/src/garage/torch/optimizers/optimizer_wrapper.py +++ /dev/null @@ -1,63 +0,0 @@ -"""A PyTorch optimizer wrapper that compute loss and optimize module.""" -from garage import make_optimizer -from garage.np.optimizers import BatchDataset - - -class OptimizerWrapper: - """A wrapper class to handle torch.optim.optimizer. - - Args: - optimizer (Union[type, tuple[type, dict]]): Type of optimizer - for policy. 
This can be an optimizer type such as - `torch.optim.Adam` or a tuple of type and dictionary, where - dictionary contains arguments to initialize the optimizer. - e.g. `(torch.optim.Adam, {'lr' : 1e-3})` - Sample strategy to be used when sampling a new task. - module (torch.nn.Module): Module to be optimized. - max_optimization_epochs (int): Maximum number of epochs for update. - minibatch_size (int): Batch size for optimization. - - """ - - def __init__(self, - optimizer, - module, - max_optimization_epochs=1, - minibatch_size=None): - self._optimizer = make_optimizer(optimizer, module=module) - self._max_optimization_epochs = max_optimization_epochs - self._minibatch_size = minibatch_size - - def get_minibatch(self, *inputs): - r"""Yields a batch of inputs. - - Notes: P is the size of minibatch (self._minibatch_size) - - Args: - *inputs (list[torch.Tensor]): A list of inputs. Each input has - shape :math:`(N \dot [T], *)`. - - Yields: - list[torch.Tensor]: A list batch of inputs. Each batch has shape - :math:`(P, *)`. - - """ - batch_dataset = BatchDataset(inputs, self._minibatch_size) - - for _ in range(self._max_optimization_epochs): - for dataset in batch_dataset.iterate(): - yield dataset - - def zero_grad(self): - r"""Clears the gradients of all optimized :class:`torch.Tensor` s.""" - self._optimizer.zero_grad() - - def step(self, **closure): - """Performs a single optimization step. - - Arguments: - **closure (callable, optional): A closure that reevaluates the - model and returns the loss. - - """ - self._optimizer.step(**closure) diff --git a/src/garage/torch/value_functions/gaussian_mlp_value_function.py b/src/garage/torch/value_functions/gaussian_mlp_value_function.py index d340fef2a9..7f0670841c 100644 --- a/src/garage/torch/value_functions/gaussian_mlp_value_function.py +++ b/src/garage/torch/value_functions/gaussian_mlp_value_function.py @@ -78,11 +78,11 @@ def __init__(self, std_parameterization='exp', layer_normalization=layer_normalization) - def compute_loss(self, obs, returns): + def loss_function(self, observations, returns, lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. @@ -91,7 +91,7 @@ def compute_loss(self, obs, returns): objective (float). """ - dist = self.module(obs) + dist = self.module(observations) ll = dist.log_prob(returns.reshape(-1, 1)) loss = -ll.mean() return loss diff --git a/src/garage/torch/value_functions/value_function.py b/src/garage/torch/value_functions/value_function.py index 1cc533f33d..7aa54e7968 100644 --- a/src/garage/torch/value_functions/value_function.py +++ b/src/garage/torch/value_functions/value_function.py @@ -20,11 +20,11 @@ def __init__(self, env_spec, name): self.name = name @abc.abstractmethod - def compute_loss(self, obs, returns): + def loss_function(self, observations, returns, lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. 
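To make the renamed interface concrete, a short sketch of how a caller is expected to drive loss_function through the reworked optimizer, mirroring _train_value_function above (vf_optimizer is assumed to be a MinibatchOptimizer wrapping value_function):

data = {
    'observations': observations,
    'returns': returns,
    'lengths': lengths,
}
vf_loss = vf_optimizer.step(data, value_function.loss_function)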
diff --git a/tests/garage/torch/test_functions.py b/tests/garage/torch/test_functions.py index b89b0c0042..0235a68f54 100644 --- a/tests/garage/torch/test_functions.py +++ b/tests/garage/torch/test_functions.py @@ -10,6 +10,7 @@ from garage.envs import GymEnv, normalize from garage.experiment.deterministic import set_seed +from garage.np import discount_cumsum as np_discout_cumsum from garage.torch import (as_torch_dict, compute_advantages, flatten_to_single_vector, global_device, pad_to_last, product_of_gaussians, set_gpu_mode, state_dict_to, @@ -129,6 +130,15 @@ def test_state_dict_to(): assert np.all( [moved_state_dict[key].is_cuda for key in moved_state_dict.keys()]) +def test_discount_cumsum(): + discount = 0.99 + x = tensor([9.3217, 9.3003, 9.3406, 9.2251, 9.0715, 9.0134, 8.9026, + 8.6619]) + returns = discount_cumsum(x, discount) + expected = np_discout_cumsum(torch_to_np(x), discount) + assert returns.shape == (len(x), ) + assert np.allclose(expected, torch_to_np(returns) + class TestTorchAlgoUtils(TfGraphTestCase): """Test class for torch algo utility functions.""" From bca7b682598cd598ec427bbfbe9f04b47ad630f6 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" Date: Wed, 16 Dec 2020 15:50:51 -0800 Subject: [PATCH 6/6] Torch VPG rework WIP torch optimizer refactor WIP torch optimizer refactor WIP --- src/garage/examples/torch/vpg_pendulum.py | 8 +- src/garage/torch/_dtypes.py | 42 ++++++- src/garage/torch/_functions.py | 13 ++- src/garage/torch/algos/vpg.py | 31 +++-- src/garage/torch/optimizers/__init__.py | 11 +- .../optimizers/episode_batch_optimizer.py | 110 ++++++++++++++++++ .../torch/optimizers/minibatch_optimizer.py | 87 ++++++++++++++ src/garage/torch/optimizers/optimizer.py | 43 +++++++ .../optimizers/single_batch_optimizer.py | 45 +++++++ tests/garage/torch/test_functions.py | 7 +- 10 files changed, 371 insertions(+), 26 deletions(-) create mode 100644 src/garage/torch/optimizers/episode_batch_optimizer.py create mode 100644 src/garage/torch/optimizers/minibatch_optimizer.py create mode 100644 src/garage/torch/optimizers/optimizer.py create mode 100644 src/garage/torch/optimizers/single_batch_optimizer.py diff --git a/src/garage/examples/torch/vpg_pendulum.py b/src/garage/examples/torch/vpg_pendulum.py index 59e4e008a6..5ca60a23b9 100755 --- a/src/garage/examples/torch/vpg_pendulum.py +++ b/src/garage/examples/torch/vpg_pendulum.py @@ -11,7 +11,7 @@ from garage import wrap_experiment from garage.envs import GymEnv from garage.experiment.deterministic import set_seed -from garage.sampler import RaySampler +from garage.sampler import LocalSampler, RaySampler from garage.torch.algos import VPG from garage.torch.policies import GaussianMLPPolicy from garage.torch.value_functions import GaussianMLPValueFunction @@ -44,9 +44,9 @@ def vpg_pendulum(ctxt=None, seed=1): hidden_nonlinearity=torch.tanh, output_nonlinearity=None) - sampler = RaySampler(agents=policy, - envs=env, - max_episode_length=env.spec.max_episode_length) + sampler = LocalSampler(agents=policy, + envs=env, + max_episode_length=env.spec.max_episode_length) algo = VPG(env_spec=env.spec, policy=policy, diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py index 3421bf8df3..aa4254d473 100644 --- a/src/garage/torch/_dtypes.py +++ b/src/garage/torch/_dtypes.py @@ -29,7 +29,7 @@ class ObservationOrder(enum.IntEnum): EPISODES = 2 -@dataclass(init=False) +@dataclass(init=False, eq=False) class ObservationBatch(torch.Tensor): r"""The (differentiable) input to all pytorch policies. 
@@ -91,6 +91,24 @@ def __new__(cls, observations, order, lengths=None): f'when order == {self.order}') return self + def __repr__(self): + return f'{type(self).__name__}({super().__repr__()}, order={self.order!r}, lengths={self.lengths!r})' + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + # print(f"func: {func.__name__}, args: {args!r}, kwargs: {kwargs!r}") + if kwargs is None: + kwargs = {} + result = super().__torch_function__(func, types, args, kwargs) + # Fixup ObservationBatch instances returned from methods. + # In the future this might preserve order for some methods + if isinstance(result, ObservationBatch): + if not hasattr(result, 'order'): + result.order = ObservationOrder.SHUFFLED + if not hasattr(result, 'lengths'): + result.lengths = None + return result + def observation_batch_to_packed_sequence(observations): """Turn ObservationBatch into a torch.nn.utils.rnn.PackedSequence. @@ -128,3 +146,25 @@ def observation_batch_to_packed_sequence(observations): start = stop pack_sequence = nn.utils.rnn.pack_sequence return pack_sequence(sequence, enforce_sorted=False) + + +def is_policy_recurrent(policy, env_spec): + """Check if a torch policy is recurrent. + + Args: + policy (garage.torch.Policy): Policy that might be recurrent. + + Returns: + bool: If policy is recurrent. + + """ + try: + policy.forward( + as_tensor([ + env_spec.observation_space.sample(), + env_spec.observation_space.sample() + ])) + except ShuffledOptimizationNotSupported: + return True + else: + return False diff --git a/src/garage/torch/_functions.py b/src/garage/torch/_functions.py index 1319c20da0..8b864cb29b 100644 --- a/src/garage/torch/_functions.py +++ b/src/garage/torch/_functions.py @@ -111,13 +111,14 @@ def discount_cumsum(x, discount): discount, dtype=torch.float, device=x.device) - discount_x[0] = 1.0 + # discount_x[0] = 1.0 filter = torch.cumprod(discount_x, dim=0) - pad = len(x) - 1 - # minibatch of 1, with 1 channel - filter = filter.reshape(1, 1, -1) - returns = F.conv1d(x.reshape(1, 1, -1), filter, stride=1, padding=pad) - returns = returns[0, 0, pad:] + returns = F.conv1d(x, filter, stride=1) + assert returns.shape == (len(x), ) + from garage.np import discount_cumsum as np_discout_cumsum + import numpy as np + expected = np_discout_cumsum(torch_to_np(x), discount) + assert np.array_equal(expected, torch_to_np(returns)) return returns diff --git a/src/garage/torch/algos/vpg.py b/src/garage/torch/algos/vpg.py index 3586b322f4..be68d3110c 100644 --- a/src/garage/torch/algos/vpg.py +++ b/src/garage/torch/algos/vpg.py @@ -8,13 +8,12 @@ import torch.nn.functional as F from garage import log_performance -from garage.np import discount_cumsum from garage.np.algos import RLAlgorithm from garage.torch import (as_tensor, compute_advantages, filter_valids, global_device, ObservationBatch, ObservationOrder) -from garage.torch._functions import (np_to_torch, pad_packed_tensor, +from garage.torch._functions import (discount_cumsum, pad_packed_tensor, split_packed_tensor) -from garage.torch.optimizers import MinibatchOptimizer +from garage.torch.optimizers import MinibatchOptimizer, SingleBatchOptimizer class VPG(RLAlgorithm): @@ -28,9 +27,9 @@ class VPG(RLAlgorithm): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer + policy_optimizer (garage.torch.optimizer.Optimizer): Optimizer for policy. 
- vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for + vf_optimizer (garage.torch.optimizer.Optimizer): Optimizer for value function. steps_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. @@ -100,21 +99,29 @@ def __init__( stop_entropy_gradient, policy_ent_coeff) self._episode_reward_mean = collections.deque(maxlen=100) - self.sampler = sampler + self._sampler = sampler + if recurrent is None: + recurrent = is_policy_recurrent(policy) + self._recurrent = recurrent if policy_optimizer: self._policy_optimizer = policy_optimizer + elif self._recurrent: + self._policy_optimizer = EpisodeBatchOptimizer( + torch.optim.Adam, policy) else: - self._policy_optimizer = MinibatchOptimizer( + self._policy_optimizer = SingleBatchOptimizer( torch.optim.Adam, policy) if vf_optimizer: self._vf_optimizer = vf_optimizer + elif self._recurrent: + self._vf_optimizer = EpisodeBatchOptimizer(torch.optim.Adam, + value_function) else: self._vf_optimizer = MinibatchOptimizer(torch.optim.Adam, value_function) self._old_policy = copy.deepcopy(self.policy) - self._recurrent = recurrent @staticmethod def _check_entropy_configuration(entropy_method, center_adv, @@ -225,7 +232,7 @@ def train(self, trainer): for epoch in trainer.step_epochs(): for _ in range(self._steps_per_epoch): - trainer.step_path = trainer.obtain_episodes(epoch) + trainer.step_path = self._sampler.obtain_episodes(epoch) self._train_once(trainer.step_path) last_return = np.mean( log_performance(epoch, @@ -258,8 +265,9 @@ def _train_policy(self, observations, actions, rewards, returns, 'actions': actions, 'rewards': rewards, 'advantages': advantages, - 'lengths': lengths } + if not isinstance(self._policy_optimizer, MinibatchOptimizer): + data['lengths'] = lengths return self._policy_optimizer.step(data, self._loss_function) def _train_value_function(self, observations, returns, lengths): @@ -348,7 +356,8 @@ def _loss_function(self, observations, actions) objectives += self._policy_ent_coeff * policy_entropies - return -objectives.mean() + loss = -objectives.mean() + return loss def _compute_advantage(self, rewards, lengths, baselines): r"""Compute mean value of loss. 
diff --git a/src/garage/torch/optimizers/__init__.py b/src/garage/torch/optimizers/__init__.py index bf9dec1f12..5fcfe21392 100644 --- a/src/garage/torch/optimizers/__init__.py +++ b/src/garage/torch/optimizers/__init__.py @@ -1,4 +1,5 @@ """PyTorch optimizers.""" +# yapf: disable from garage.torch.optimizers.conjugate_gradient_optimizer import ( ConjugateGradientOptimizer) from garage.torch.optimizers.differentiable_sgd import DifferentiableSGD @@ -6,5 +7,13 @@ EpisodeBatchOptimizer) from garage.torch.optimizers.minibatch_optimizer import MinibatchOptimizer from garage.torch.optimizers.optimizer import Optimizer +from garage.torch.optimizers.single_batch_optimizer import SingleBatchOptimizer -__all__ = ['Optimizer', 'ConjugateGradientOptimizer', 'DifferentiableSGD'] +__all__ = [ + 'ConjugateGradientOptimizer', + 'DifferentiableSGD', + 'EpisodeBatchOptimizer', + 'MinibatchOptimizer', + 'Optimizer', + 'SingleBatchOptimizer', +] diff --git a/src/garage/torch/optimizers/episode_batch_optimizer.py b/src/garage/torch/optimizers/episode_batch_optimizer.py new file mode 100644 index 0000000000..2a3ba7c8f8 --- /dev/null +++ b/src/garage/torch/optimizers/episode_batch_optimizer.py @@ -0,0 +1,110 @@ +"""Optimizer that runs a torch optimizer on full episodes.""" +import click +import numpy as np + +from garage import make_optimizer +from garage.torch import (as_tensor, ObservationBatch, ObservationOrder, + split_packed_tensor) +from garage.torch.optimizers.optimizer import Optimizer + + +class EpisodeBatchOptimizer(Optimizer): + """Optimizer that runs a torch optimizer on full episodes. + + Args: + optimizer (Union[type, tuple[type, dict]]): Type of optimizer + for policy. This can be an optimizer type such as + `torch.optim.Adam` or a tuple of type and dictionary, where + dictionary contains arguments to initialize the optimizer. + e.g. `(torch.optim.Adam, {'lr' : 1e-3})` + Sample strategy to be used when sampling a new task. + module (torch.nn.Module): Module to be optimized. + max_optimization_epochs (int): Maximum number of epochs for update. + minibatch_size (int): Batch size for optimization. + + """ + + def __init__(self, + optimizer, + module, + max_optimization_epochs=1000, + minibatch_size=32): + super().__init__(module) + self._optimizer = make_optimizer(optimizer, module=module) + self._max_optimization_epochs = max_optimization_epochs + self._minibatch_size = minibatch_size + + def _minibatches(self, data_by_episode, lengths): + r"""Yields a batch of inputs. + + Notes: P is the size of minibatch (self._minibatch_size) + + Args: + data_by_episode (dict[str, list[torch.Tensor]]): Dictionary of + data, where each data array has been split by episode. + lengths (list[int]): Length of each episode in data. + + Yields: + dict[str, torch.Tensor]: Batch of inputs to pass to loss function. 
+
+        """
+        episode_indices = np.arange(len(lengths))
+        i = 0
+        with click.progressbar(range(self._max_optimization_epochs),
+                               label='Optimizing') as pbar:
+            for _ in pbar:
+                batch_size = 0
+                batch = {k: [] for k in data_by_episode.keys()}
+                batch_lengths = []
+                while sum(batch_lengths) < self._minibatch_size:
+                    if i == 0:
+                        np.random.shuffle(episode_indices)
+                    for k, v in data_by_episode.items():
+                        batch[k].append(v[episode_indices[i]])
+                    batch_lengths.append(lengths[episode_indices[i]])
+                    i = (i + 1) % len(lengths)
+                batch = {k: as_tensor(v) for (k, v) in batch.items()}
+                batch['observations'] = ObservationBatch(
+                    batch['observations'], ObservationOrder.EPISODES,
+                    batch_lengths)
+                batch['lengths'] = as_tensor(batch_lengths)
+                yield batch
+
+    def step(self, data, loss_function):
+        """Use `data` to minimize `loss_function`.
+
+        Note that data may be operated on in optimizer specific ways, and
+        loss_function may be called multiple times.
+
+        Args:
+            data (dict[str, torch.Tensor]): Data to feed into the loss
+                function. May be operated on before feeding. Must contain the
+                key 'lengths'.
+            loss_function (dict[str, torch.Tensor] -> torch.Tensor):
+                Differentiable loss function to optimize.
+
+        Returns:
+            float: Average value of loss_function over data.
+
+        """
+        if 'observations' not in data:
+            raise ValueError('observations must be in data for '
+                             'EpisodeBatchOptimizer')
+        try:
+            lengths = data['lengths']
+        except KeyError:
+            try:
+                lengths = data['observations'].lengths
+            except AttributeError:
+                raise ValueError('EpisodeBatchOptimizer must have lengths in '
+                                 'data or observations must be an '
+                                 'ObservationBatch')
+        data_by_episode = {
+            k: split_packed_tensor(v, lengths)
+            for (k, v) in data.items() if k != 'lengths'
+        }
+        for batch in self._minibatches(data_by_episode, lengths):
+            self._optimizer.zero_grad()
+            loss = loss_function(**batch)
+            loss.backward()
+            self._optimizer.step()
diff --git a/src/garage/torch/optimizers/minibatch_optimizer.py b/src/garage/torch/optimizers/minibatch_optimizer.py
new file mode 100644
index 0000000000..430de561fa
--- /dev/null
+++ b/src/garage/torch/optimizers/minibatch_optimizer.py
@@ -0,0 +1,87 @@
+"""A garage optimizer that optimizes using minibatches."""
+import click
+import numpy as np
+
+from garage import make_optimizer
+from garage.torch.optimizers.optimizer import Optimizer
+
+
+class MinibatchOptimizer(Optimizer):
+    """Optimizer that runs a torch.optim.Optimizer on minibatches.
+
+    Args:
+        optimizer (Union[type, tuple[type, dict]]): Type of optimizer
+            for policy. This can be an optimizer type such as
+            `torch.optim.Adam` or a tuple of type and dictionary, where
+            dictionary contains arguments to initialize the optimizer.
+            e.g. `(torch.optim.Adam, {'lr' : 1e-3})`
+            Sample strategy to be used when sampling a new task.
+        module (torch.nn.Module): Module to be optimized.
+        max_optimization_epochs (int): Maximum number of times to iterate
+            through all samples.
+        minibatch_size (int): Batch size for optimization. If a single large
+            batch is desired, consider using SingleBatchOptimizer instead.
+
+    """
+
+    def __init__(self,
+                 optimizer,
+                 module,
+                 max_optimization_epochs=1,
+                 minibatch_size=32):
+        super().__init__(module)
+        self._optimizer = make_optimizer(optimizer, module=module)
+        self._max_optimization_epochs = max_optimization_epochs
+        self._minibatch_size = minibatch_size
+
+    def _minibatches(self, n_samples, data):
+        r"""Yields a batch of inputs.
+
+        Notes: P is the size of minibatch (self._minibatch_size)
+
+        Args:
+            n_samples (int): Total number of samples in data.
+ data (dict[str, torch.Tensor]): Data to sample into batches. Each + tensor has shape :math:`(N \dot [T], *)`. + + Yields: + dict[str, torch.Tensor]: Batch of inputs to pass to loss function. + + """ + assert n_samples == len(next(iter(data.values()))) + with click.progressbar(range(self._max_optimization_epochs), + label='Optimizing') as pbar: + for _ in pbar: + all_indices = np.arange(n_samples) + np.random.shuffle(all_indices) + split = np.array_split( + all_indices, np.ceil(n_samples / self._minibatch_size)) + for minibatch_indices in split: + yield {k: v[minibatch_indices] for (k, v) in data.items()} + + def step(self, data, loss_function): + """Use `data` to minimize `loss_function`. + + Note that data may be operated on in optimizer specific ways, and + loss_function may be called multiple times. + + Args: + data (dict[str, torch.Tensor]): Data to feed into the loss + function. May be operated on before feeding. + loss_function (dict[str, torch.Tensor] -> torch.Tensor): + Differentiable loss function to optimize. + + Returns: + float: Average value of loss_function over data. + + """ + if 'lengths' in data: + del data['lengths'] + n_samples = [len(v) for v in data.values()] + assert all(n == n_samples[0] for n in n_samples) + + for i, batch in enumerate(self._minibatches(n_samples[0], data)): + self._optimizer.zero_grad() + loss = loss_function(**batch) + loss.backward() + self._optimizer.step() diff --git a/src/garage/torch/optimizers/optimizer.py b/src/garage/torch/optimizers/optimizer.py new file mode 100644 index 0000000000..6bc303e0b4 --- /dev/null +++ b/src/garage/torch/optimizers/optimizer.py @@ -0,0 +1,43 @@ +import abc + + +class Optimizer(metaclass=abc.ABCMeta): + """Base class of optimizers in garage.torch. + + This class exists and differs from torch.optim.Optimizer for a few reasons: + - Several optimizers (DifferentiableSGD, ConjugateGradientOptimizer) + need access to the module they're optimizing as a whole, not just to + their parameters. + - The torch Optimizer class was not designed to be inherited from, and + sometimes breaks base classes in difficult to detect ways. + - The torch Optimizer API is large and not very easy to implement. If + the whole API is needed, one of garage's Optimizer wrappers can be + used instead. + - We want our optimizer API to handle mini-batching, since it makes the + relationship between PPO and TRPO simpler to implement and explain. + + Args: + module (torch.nn.Module): The neural network to optimize. + + """ + + def __init__(self, module): + self._module = module + + @abc.abstractmethod + def step(self, data, loss_function): + """Use `data` to minimize `loss_function`. + + Note that data may be operated on in optimizer specific ways, and + loss_function may be called multiple times. + + Args: + data (dict[str, torch.Tensor]): Data to feed into the loss + function. May be operated on before feeding. + loss_function (dict[str, torch.Tensor] -> torch.Tensor): + Differentiable loss function to optimize. + + Returns: + float: Average value of loss_function over data. 
+ + """ diff --git a/src/garage/torch/optimizers/single_batch_optimizer.py b/src/garage/torch/optimizers/single_batch_optimizer.py new file mode 100644 index 0000000000..b88465bc0b --- /dev/null +++ b/src/garage/torch/optimizers/single_batch_optimizer.py @@ -0,0 +1,45 @@ +"""A garage optimizer that optimizes using a single large batch of SGD.""" +import numpy as np + +from garage import make_optimizer +from garage.torch.optimizers.optimizer import Optimizer + + +class SingleBatchOptimizer(Optimizer): + """Optimizer that runs a torch.optim.Optimizer a single batch. + + Args: + optimizer (Union[type, tuple[type, dict]]): Type of optimizer + for policy. This can be an optimizer type such as + `torch.optim.Adam` or a tuple of type and dictionary, where + dictionary contains arguments to initialize the optimizer. + e.g. `(torch.optim.Adam, {'lr' : 1e-3})` + Sample strategy to be used when sampling a new task. + module (torch.nn.Module): Module to be optimized. + + """ + + def __init__(self, optimizer, module): + super().__init__(module) + self._optimizer = make_optimizer(optimizer, module=module) + + def step(self, data, loss_function): + """Use `data` to minimize `loss_function`. + + Note that data may be operated on in optimizer specific ways, and + loss_function may be called multiple times. + + Args: + data (dict[str, torch.Tensor]): Data to feed into the loss + function. May be operated on before feeding. + loss_function (dict[str, torch.Tensor] -> torch.Tensor): + Differentiable loss function to optimize. + + Returns: + float: Average value of loss_function over data. + + """ + self._optimizer.zero_grad() + loss = loss_function(**data) + loss.backward() + self._optimizer.step() diff --git a/tests/garage/torch/test_functions.py b/tests/garage/torch/test_functions.py index 0235a68f54..34d1d76165 100644 --- a/tests/garage/torch/test_functions.py +++ b/tests/garage/torch/test_functions.py @@ -10,6 +10,7 @@ from garage.envs import GymEnv, normalize from garage.experiment.deterministic import set_seed + from garage.np import discount_cumsum as np_discout_cumsum from garage.torch import (as_torch_dict, compute_advantages, flatten_to_single_vector, global_device, pad_to_last, @@ -130,14 +131,14 @@ def test_state_dict_to(): assert np.all( [moved_state_dict[key].is_cuda for key in moved_state_dict.keys()]) + def test_discount_cumsum(): discount = 0.99 - x = tensor([9.3217, 9.3003, 9.3406, 9.2251, 9.0715, 9.0134, 8.9026, - 8.6619]) + x = torch.tensor([5., 10, 20, 100, 0.5, 0.5, 0.5, 0.5, 1000]) returns = discount_cumsum(x, discount) expected = np_discout_cumsum(torch_to_np(x), discount) assert returns.shape == (len(x), ) - assert np.allclose(expected, torch_to_np(returns) + assert np.allclose(expected, torch_to_np(returns)) class TestTorchAlgoUtils(TfGraphTestCase):
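To make the new optimizer API concrete, here is a minimal usage sketch, assuming
the patches above are applied. The tiny value network, the learning rate, the
batch sizes, and the synthetic data are illustrative stand-ins only; the calling
convention itself (a data dict passed to `step()`, whose keys become the keyword
arguments of the loss function) follows the VPG changes above. A recurrent
module would use `EpisodeBatchOptimizer` instead, which additionally expects a
`lengths` entry or an `ObservationBatch` with `order == ObservationOrder.EPISODES`.

    import torch

    from garage.torch.optimizers import MinibatchOptimizer

    # Illustrative stand-in for a garage value function; any torch.nn.Module
    # with matching input/output shapes works here.
    value_function = torch.nn.Sequential(torch.nn.Linear(4, 32),
                                         torch.nn.Tanh(),
                                         torch.nn.Linear(32, 1))

    # Optimizer spec uses the (type, kwargs) tuple convention accepted by
    # make_optimizer; the learning rate is an arbitrary example value.
    optimizer = MinibatchOptimizer((torch.optim.Adam, {'lr': 1e-3}),
                                   value_function,
                                   max_optimization_epochs=4,
                                   minibatch_size=64)


    def loss_function(observations, returns):
        # Keys of the data dict below arrive here as keyword arguments.
        error = value_function(observations).squeeze(-1) - returns
        return (error**2).mean()


    # Synthetic stand-in data; VPG builds the equivalent dict from sampled
    # episodes (observations, returns, advantages, ...).
    data = {
        'observations': torch.randn(256, 4),
        'returns': torch.randn(256),
    }
    optimizer.step(data, loss_function)

`SingleBatchOptimizer` accepts the same `step(data, loss_function)` call but
consumes the whole batch in one gradient step, which is how VPG optimizes a
non-recurrent policy by default in this series.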