From d45045000eefdeebe2a02294709b92906de47678 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" Date: Tue, 1 Dec 2020 06:41:19 -0800 Subject: [PATCH 1/6] Add garage.torch.ObservationBatch --- src/garage/torch/__init__.py | 7 ++ src/garage/torch/_dtypes.py | 126 +++++++++++++++++++++++++++++++++++ tests/garage/test_dtypes.py | 2 + 3 files changed, 135 insertions(+) create mode 100644 src/garage/torch/_dtypes.py diff --git a/src/garage/torch/__init__.py b/src/garage/torch/__init__.py index f2add664be..2a439112b2 100644 --- a/src/garage/torch/__init__.py +++ b/src/garage/torch/__init__.py @@ -1,5 +1,8 @@ """PyTorch-backed modules and algorithms.""" # yapf: disable +from garage.torch._dtypes import (ObservationBatch, ObservationOrder, + ShuffledOptimizationNotSupported, + observation_batch_to_packed_sequence) from garage.torch._functions import (as_torch_dict, compute_advantages, expand_var, filter_valids, flatten_batch, flatten_to_single_vector, global_device, @@ -21,12 +24,16 @@ 'flatten_to_single_vector', 'global_device', 'np_to_torch', + 'ObservationBatch', + 'observation_batch_to_packed_sequence', + 'ObservationOrder', 'output_height_2d', 'output_width_2d', 'pad_to_last', 'prefer_gpu', 'product_of_gaussians', 'set_gpu_mode', + 'ShuffledOptimizationNotSupported', 'soft_update_model', 'state_dict_to', 'torch_to_np', diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py new file mode 100644 index 0000000000..dec76c2c7d --- /dev/null +++ b/src/garage/torch/_dtypes.py @@ -0,0 +1,126 @@ +"""Data structures used in garage.torch.""" +from dataclasses import dataclass +import enum + +import torch +from torch import nn + + +class ShuffledOptimizationNotSupported(ValueError): + """Raised by recurrent policies if they're passed a shuffled batch.""" + + +class ObservationOrder(enum.IntEnum): + """Defines the order of observations in an ObservationBatch. + + See :class:`ObservationBatch` for detailed documentation. + + """ + # Tensor contains a batch of "most recent" observations. + # This ordering is typcially used when performing rollouts, and it is + # expected that stateful policies maintain there own state when using this + # ordering. + LAST = 0 + # Tensor contains observations with timesteps from potentially different + # episodes in a shuffled order. Recurrent policies should raise + # ShuffledOptimizationNotSupported if they encounter this ordering. + SHUFFLED = 1 + # Tensor contains all observations for a batch of episodes, in order. + EPISODES = 2 + + +@dataclass(init=False) +class ObservationBatch(torch.Tensor): + r"""The (differentiable) input to all pytorch policies. + + Args: + observations (torch.Tensor): A torch tensor containing flattened + observations in a batch. Stateless policies should always operate + on this input. This input is passed to the super-constructor. + Shape depends on the order: + * If `order == ROLLOUT`, has shape :math:`(V, O)` (where V is the + vectorization level). + * If `order == SHUFFLED`, has shape :math:`(B, O)` (where B is the + mini-batch size). + * If order == EPISODES, has shape :math:`(N \bullet [T], O)` + (where N is the number of episodes, and T is the episode + lengths). + order (ObservationOrder): The order of observations in this batch. If + this is set to EPISODES, lengths must not be None. + lengths (torch.Tensor or None): Integer tensor containing the lengths + of each episode. Only has a value if `order == EPISODES`. 
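For illustration, a minimal sketch of the intended usage of the two common orders with the API this patch adds (values are placeholders; lengths is passed explicitly in both cases):

import torch

from garage.torch import (ObservationBatch, ObservationOrder,
                          observation_batch_to_packed_sequence)

# Two episodes of lengths 3 and 2, flat observation dim 4, packed together.
episodes = ObservationBatch(torch.zeros(5, 4),
                            order=ObservationOrder.EPISODES,
                            lengths=torch.tensor([3, 2]))
packed = observation_batch_to_packed_sequence(episodes)  # for recurrent policies

# During rollouts, a vectorized sampler passes the latest observation per env.
latest = ObservationBatch(torch.zeros(2, 4),
                          order=ObservationOrder.LAST,
                          lengths=None)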
+ """ + + order: ObservationOrder + lengths: torch.Tensor = None + + def __init__(self, observations, order, lengths): + """Check that lengths is consistent with the rest of the fields. + + Raises: + ValueError: If lengths is not consistent with another field. + + """ + super().__init__(observations) + self.order = order + self.lengths = lengths + if self.order == ObservationOrder.EPISODES: + if self.lengths is None: + raise ValueError( + 'lengths is None, but must be a torch.Tensor when ' + 'order == ObservationOrder.EPISODES') + assert self.lengths is not None + if self.lengths.dtype not in (torch.uint8, torch.int8, torch.int16, + torch.int32, torch.int64): + raise ValueError( + f'lengths has dtype {self.lengths.dtype}, but must have ' + f'an integer dtype') + total_size = sum(self.lengths) + if self.observations.shape[0] != total_size: + raise ValueError( + f'observations has batch size ' + f'{self.observations.shape[0]}, but must have batch ' + f'size {total_size} to match lengths') + assert self.observations.shape[0] == total_size + elif self.lengths is not None: + raise ValueError( + f'lengths has value {self.lengths}, but must be None ' + f'when order == {self.order}') + + +def observation_batch_to_packed_sequence(observations): + """Turn ObservationBatch into a torch.nn.utils.rnn.PackedSequence. + + This function is not a method on ObservationBatch so that it can be called + on a observation Tensor that is not an ObservationBatch. This simplifies + the implementation of recurrent policies. + + Args: + observations (torch.Tensor or ObservationBatch): Observations to + convert to PackedSequence. + + Raises: + ShuffledOptimizationNotSupported: If called with an input that is not + an ObservationBatch or when `order != EPISODES` + + Returns: + torch.nn.utils.rnn.PackedSequence: The sequence of flattened + observations. + + """ + if not isinstance(observations, ObservationBatch): + raise ShuffledOptimizationNotSupported( + f'observations should be an ObservationBatch, but was of ' + f'type {type(observations)!r} instead.') + if observations.order != ObservationOrder.EPISODES: + raise ShuffledOptimizationNotSupported( + f'order has value {observations.order} but must have order ' + f'{ObservationOrder.EPISODES} to use to_packed_sequence') + sequence = [] + start = 0 + for length in observations.lengths: + stop = start + length + sequence.append(observations.observations[start:stop]) + start = stop + pack_sequence = nn.utils.rnn.pack_sequence + return pack_sequence(sequence, enforce_sorted=False) diff --git a/tests/garage/test_dtypes.py b/tests/garage/test_dtypes.py index e5d23b3af6..4e2bcb9031 100644 --- a/tests/garage/test_dtypes.py +++ b/tests/garage/test_dtypes.py @@ -6,6 +6,7 @@ # yapf: disable from garage import (EnvSpec, EnvStep, EpisodeBatch, StepType, TimeStep, TimeStepBatch) +from garage._dtypes import check_timestep_batch # yapf: enable @@ -77,6 +78,7 @@ def test_new_eps(eps_data): assert t.episode_infos_by_episode is eps_data['episode_infos'] assert (t.episode_infos['task_one_hot'][0].shape == eps_data['episode_infos']['task_one_hot'][0].shape) + check_timestep_batch(t, np.ndarray) def test_lengths_shape_mismatch_eps(eps_data): From b540f212ab8a0086caa4064e617d2ef068a28d30 Mon Sep 17 00:00:00 2001 From: "K.R. 
Zentner" Date: Tue, 1 Dec 2020 06:47:58 -0800 Subject: [PATCH 2/6] Use ObservationBatch in StochasticPolicy --- src/garage/torch/__init__.py | 4 +++- src/garage/torch/_dtypes.py | 2 +- src/garage/torch/policies/stochastic_policy.py | 12 ++++++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/garage/torch/__init__.py b/src/garage/torch/__init__.py index 2a439112b2..71b119b7dd 100644 --- a/src/garage/torch/__init__.py +++ b/src/garage/torch/__init__.py @@ -11,7 +11,8 @@ pad_to_last, prefer_gpu, product_of_gaussians, set_gpu_mode, soft_update_model, state_dict_to, - torch_to_np, update_module_params) + torch_to_np, update_module_params, + list_to_tensor) # yapf: enable __all__ = [ @@ -23,6 +24,7 @@ 'flatten_batch', 'flatten_to_single_vector', 'global_device', + 'list_to_tensor', 'np_to_torch', 'ObservationBatch', 'observation_batch_to_packed_sequence', diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py index dec76c2c7d..a1c33d3e0b 100644 --- a/src/garage/torch/_dtypes.py +++ b/src/garage/torch/_dtypes.py @@ -54,7 +54,7 @@ class ObservationBatch(torch.Tensor): order: ObservationOrder lengths: torch.Tensor = None - def __init__(self, observations, order, lengths): + def __init__(self, observations, order, lengths=None): """Check that lengths is consistent with the rest of the fields. Raises: diff --git a/src/garage/torch/policies/stochastic_policy.py b/src/garage/torch/policies/stochastic_policy.py index 84a60ca7a8..6d6eafdfbd 100644 --- a/src/garage/torch/policies/stochastic_policy.py +++ b/src/garage/torch/policies/stochastic_policy.py @@ -5,7 +5,8 @@ import numpy as np import torch -from garage.torch._functions import list_to_tensor, np_to_torch +from garage.torch import (list_to_tensor, np_to_torch, ObservationBatch, + ObservationOrder) from garage.torch.policies.policy import Policy @@ -92,6 +93,8 @@ def get_actions(self, observations): if isinstance(self._env_spec.observation_space, akro.Image): observations /= 255.0 # scale image + observations = ObservationBatch(observations, + order=ObservationOrder.LAST) dist, info = self.forward(observations) return dist.sample().cpu().numpy(), { k: v.detach().cpu().numpy() @@ -105,7 +108,12 @@ def forward(self, observations): Args: observations (torch.Tensor): Batch of observations on default - torch device. + torch device. Stateful policies may require this input to be a + garage.torch.ObservationBatch. + + Raises: + ShuffledOptimizationNotSupported: If this policy is a stateful + policy and the required an ObservationBatch. Returns: torch.distributions.Distribution: Batch distribution of actions. From 9985301ed22bc389f1b7aa06a7f95565c11f6eca Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" Date: Sun, 13 Dec 2020 14:04:10 -0800 Subject: [PATCH 3/6] Make garage.torch.ObservationBatch constructable --- src/garage/torch/_dtypes.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py index a1c33d3e0b..f0be030ce4 100644 --- a/src/garage/torch/_dtypes.py +++ b/src/garage/torch/_dtypes.py @@ -54,14 +54,17 @@ class ObservationBatch(torch.Tensor): order: ObservationOrder lengths: torch.Tensor = None - def __init__(self, observations, order, lengths=None): + def __new__(cls, observations, order, lengths=None): """Check that lengths is consistent with the rest of the fields. Raises: ValueError: If lengths is not consistent with another field. + Returns: + ObservationBatch: A new observation batch. 
+ """ - super().__init__(observations) + self = super().__new__(cls, observations) self.order = order self.lengths = lengths if self.order == ObservationOrder.EPISODES: @@ -86,6 +89,7 @@ def __init__(self, observations, order, lengths=None): raise ValueError( f'lengths has value {self.lengths}, but must be None ' f'when order == {self.order}') + return self def observation_batch_to_packed_sequence(observations): From 19e4dbb7e7c048b5fe28b8156927bbe5875e9b97 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" Date: Sun, 13 Dec 2020 14:10:08 -0800 Subject: [PATCH 4/6] Implement garage.torch.GaussianLSTMPolicy --- src/garage/torch/policies/__init__.py | 2 + .../torch/policies/gaussian_lstm_policy.py | 228 ++++++++++++++++++ .../policies/test_gaussian_lstm_policy.py | 67 +++++ 3 files changed, 297 insertions(+) create mode 100644 src/garage/torch/policies/gaussian_lstm_policy.py create mode 100644 tests/garage/torch/policies/test_gaussian_lstm_policy.py diff --git a/src/garage/torch/policies/__init__.py b/src/garage/torch/policies/__init__.py index c50d46bc2f..194078680d 100644 --- a/src/garage/torch/policies/__init__.py +++ b/src/garage/torch/policies/__init__.py @@ -7,6 +7,7 @@ from garage.torch.policies.discrete_cnn_policy import DiscreteCNNPolicy from garage.torch.policies.discrete_qf_argmax_policy import ( DiscreteQFArgmaxPolicy) +from garage.torch.policies.gaussian_lstm_policy import GaussianLSTMPolicy from garage.torch.policies.gaussian_mlp_policy import GaussianMLPPolicy from garage.torch.policies.policy import Policy from garage.torch.policies.tanh_gaussian_mlp_policy import ( @@ -21,4 +22,5 @@ 'Policy', 'TanhGaussianMLPPolicy', 'ContextConditionedPolicy', + 'GaussianLSTMPolicy', ] diff --git a/src/garage/torch/policies/gaussian_lstm_policy.py b/src/garage/torch/policies/gaussian_lstm_policy.py new file mode 100644 index 0000000000..d028673cb1 --- /dev/null +++ b/src/garage/torch/policies/gaussian_lstm_policy.py @@ -0,0 +1,228 @@ +"""GaussianLSTMPolicy.""" +import torch +from torch import nn +from torch.distributions import Normal + +from garage.torch import (observation_batch_to_packed_sequence, + ObservationBatch, ObservationOrder, + ShuffledOptimizationNotSupported) +from garage.torch.modules import GaussianMLPModule +from garage.torch.policies.stochastic_policy import StochasticPolicy + + +class GaussianLSTMPolicy(StochasticPolicy): + """LSTM whose outputs are fed into a Normal distribution.. + + A policy that contains a LSTM to make prediction based on a gaussian + distribution. + + Args: + env_spec (EnvSpec): Environment specification. + name (str): Name of policy. + hidden_size (int): Hidden dimension for LSTM cell for mean. + num_layers (int): Number of recurrent layers. + hidden_mlp_nonlinearity (Callable): Activation function for + intermediate dense layer(s). It should return a torch.Tensor. Set + it to None to maintain a linear activation. + hidden_mlp_sizes (list[int]): Output dimension of dense layer(s) for + the MLP for mean. For example, (32, 32) means the MLP consists + of two hidden layers, each with 32 hidden units. + hidden_mlp_w_init (Callable): Initializer function for the weight + of intermediate dense layer(s). Should modify a torch.Tensor. + hidden_mlp_b_init (Callable): Initializer function for the bias + of intermediate dense layer(s). Should modify a torch.Tensor. + output_nonlinearity (callable): Activation function for output dense + layer. It should return a torch.Tensor. Set it to None to + maintain a linear activation. 
+ output_w_init (Callable): Initializer function for the weight + of output dense layer(s). Should modify a torch.Tensor. + output_b_init (Callable): Initializer function for the bias + of output dense layer(s). Should modify a torch.Tensor. + recurrent_w_init (Callable): Initializer function for the weight + of recurrent layer(s). Should modify a torch.Tensor. + hidden_state_init (Callable): Initializer function for the + initial hidden state. Should modify a torch.Tensor. + hidden_state_init_trainable (bool): Bool for whether the initial + hidden state is trainable. + cell_state_init (Callable): Initializer function for the + initial cell state. Should modify a torch.Tensor. + cell_state_init_trainable (bool): Bool for whether the initial + cell state is trainable. + learn_std (bool): Is std trainable. + init_std (float): Initial value for std. + min_std (float): Minimum value for std. + max_std (float): Maximum value for std. + std_parameterization (str): How the std should be parametrized. There + are two options: + - exp: the logarithm of the std will be stored, and applied a + exponential transformation + - softplus: the std will be computed as log(1+exp(x)) + layer_normalization (bool): Bool for using layer normalization or not. + std_parameterization (str): How the std should be parametrized. There + are two options: + - exp: the logarithm of the std will be stored, and applied a + exponential transformation. + - softplus: the std will be computed as log(1+exp(x)). + normal_distribution_cls (torch.distribution): normal distribution class + to be constructed and returned by a call to forward. By default, is + `torch.distributions.Normal`. + + """ + + def __init__(self, + env_spec, + *, + name='GaussianLSTMPolicy', + hidden_size=32, + num_layers=2, + cell_state_init_trainable=True, + cell_state_init=None, + hidden_state_init=None, + hidden_state_init_trainable=True, + recurrent_w_init=None, + hidden_mlp_sizes=(32, ), + hidden_mlp_w_init=nn.init.xavier_uniform_, + hidden_mlp_b_init=nn.init.zeros_, + hidden_mlp_nonlinearity=torch.tanh, + output_nonlinearity=None, + output_w_init=nn.init.xavier_uniform_, + output_b_init=nn.init.zeros_, + learn_std=True, + init_std=1.0, + min_std=1e-6, + max_std=None, + std_parameterization='exp', + layer_normalization=False, + normal_distribution_cls=Normal): + super().__init__(env_spec, name) + + if std_parameterization not in ('exp', 'softplus'): + raise NotImplementedError + + self._obs_dim = env_spec.observation_space.flat_dim + self._action_dim = env_spec.action_space.flat_dim + self._hidden_size = hidden_size + self._num_layers = num_layers + h0 = torch.zeros(num_layers, hidden_size) + c0 = torch.zeros(num_layers, hidden_size) + if cell_state_init is not None: + hidden_state_init(h0) + cell_state_init(c0) + if cell_state_init_trainable: + self._c0 = nn.Parameter(c0) + else: + self._c0 = c0 + self.register_buffer('_c0', self._c0) + if hidden_state_init_trainable: + self._h0 = nn.Parameter(h0) + else: + self._h0 = h0 + self.register_buffer('_h0', self._h0) + self._rnn = nn.LSTM(input_size=self._obs_dim, + hidden_size=hidden_size, + batch_first=False, + num_layers=num_layers) + if recurrent_w_init is not None: + recurrent_w_init(self._rnn) + self._mlp = GaussianMLPModule( + input_dim=hidden_size, + output_dim=self._action_dim, + hidden_sizes=hidden_mlp_sizes, + hidden_nonlinearity=hidden_mlp_nonlinearity, + hidden_w_init=hidden_mlp_w_init, + hidden_b_init=hidden_mlp_b_init, + output_nonlinearity=output_nonlinearity, + output_w_init=output_w_init, + 
output_b_init=output_b_init, + learn_std=learn_std, + init_std=init_std, + min_std=min_std, + max_std=max_std, + std_parameterization=std_parameterization, + layer_normalization=layer_normalization, + normal_distribution_cls=normal_distribution_cls) + self._state = None + + def _new_state(self, n_envs): + """Compute a new state for running n_envs in parallel. + + Args: + n_envs (int): Number of observations and actions each call to + get_actions(). + + Returns: + tuple[torch.Tensor, torch.Tensor]: Tensor containing h0 and c0 with + "batch_dim" repeated n_envs times. + + """ + h0 = self._h0.unsqueeze(1).expand( + (self._num_layers, n_envs, self._hidden_size)) + c0 = self._c0.unsqueeze(1).expand( + (self._num_layers, n_envs, self._hidden_size)) + return h0, c0 + + def reset(self, do_resets=None): + """Reset the policy. + + This is effective only to recurrent policies. + + do_resets is an array of boolean indicating + which internal states to be reset. The length of do_resets should be + equal to the length of inputs, i.e. batch size. + + Args: + do_resets (numpy.ndarray): Bool array indicating which states + to be reset. + + """ + if do_resets is None: + do_resets = [True] + h0, c0 = self._new_state(len(do_resets)) + if all(do_resets): + self._state = (h0, c0) + for i, do_reset in enumerate(do_resets): + if do_reset: + # Reset all layer's state + self._state[0][:, i] = h0[:, i] + self._state[1][:, i] = c0[:, i] + + def forward(self, observations): + """Compute the action distributions from the observations. + + Args: + observations (torch.Tensor): Batch of observations on default + torch device. + + Raises: + ValueError: If observations is not consistent with reset(). + ShuffledOptimizationNotSupported: If passed a shuffled + ObservationBatch or a tensor that is not an ObservationBatch. + + Returns: + torch.distributions.Distribution: Batch distribution of actions. + dict[str, torch.Tensor]: Additional agent_info, as torch Tensors + + """ + if not isinstance(observations, ObservationBatch): + raise ShuffledOptimizationNotSupported( + f'observations are of type {type(observations)!r}, but should ' + f'be an ObservationBatch') + if observations.order == ObservationOrder.LAST: + if self._state is None: + raise ValueError('get_action() called before reset()') + if self._state[0].shape[1] != len(observations): + raise ValueError(f'observations has length ' + f'{len(observations)} but should have length ' + f'{len(self._state[0])} to match the length ' + f'of do_resets in reset()') + # Add sequence dimension. + rnn_out, self._state = self._rnn(observations.unsqueeze(0), + self._state) + else: + sequence = observation_batch_to_packed_sequence(observations) + n_episodes = len(observations.lengths) + start = self._new_state(n_episodes) + rnn_out, _ = self._rnn(sequence, start) + # Remove sequence dimension. 
+ dist = self._mlp(rnn_out.squeeze(0)) + return (dist, dict(mean=dist.mean, log_std=(dist.variance**.5).log())) diff --git a/tests/garage/torch/policies/test_gaussian_lstm_policy.py b/tests/garage/torch/policies/test_gaussian_lstm_policy.py new file mode 100644 index 0000000000..989bdf617f --- /dev/null +++ b/tests/garage/torch/policies/test_gaussian_lstm_policy.py @@ -0,0 +1,67 @@ +import pickle + +import numpy as np +import pytest + +from garage.envs import GymEnv +from garage.torch.policies import GaussianLSTMPolicy + +# yapf: disable +from tests.fixtures.envs.dummy import DummyBoxEnv, DummyDictEnv + +# yapf: enable + + +def test_get_action_dict_space(): + env = GymEnv(DummyDictEnv(obs_space_type='box', act_space_type='box')) + policy = GaussianLSTMPolicy(env_spec=env.spec, hidden_size=4) + policy.reset(do_resets=None) + obs = env.reset()[0] + + action, _ = policy.get_action(obs) + assert env.action_space.contains(action) + + policy.reset(do_resets=[True, True]) + + actions, _ = policy.get_actions([obs, obs]) + for action in actions: + assert env.action_space.contains(action) + + +# yapf: disable +@pytest.mark.parametrize('obs_dim, action_dim, hidden_size', [ + ((1, ), (1, ), 4), + ((2, ), (2, ), 4), + ((1, 1), (1, ), 4), + ((2, 2), (2, ), 4) +]) +# yapf: enable +def test_get_action(obs_dim, action_dim, hidden_size): + env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim)) + policy = GaussianLSTMPolicy(env_spec=env.spec, hidden_size=hidden_size) + + policy.reset() + obs = env.reset()[0] + + action, _ = policy.get_action(obs.flatten()) + assert env.action_space.contains(action) + + actions, _ = policy.get_actions([obs.flatten()]) + for action in actions: + assert env.action_space.contains(action) + + +# pylint: disable=no-member +def test_is_pickleable(): + env = GymEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, ))) + policy = GaussianLSTMPolicy(env_spec=env.spec) + policy.reset() + obs = env.reset()[0] + _, info = policy.get_action(obs) + + p = pickle.dumps(policy) + policy_pickled = pickle.loads(p) + policy_pickled.reset() + _, info2 = policy_pickled.get_action(obs) + assert np.array_equal(info['mean'], info2['mean']) + assert np.array_equal(info['log_std'], info2['log_std']) From df3a137c6bcceb68621b66bc10624c928fd8ae7a Mon Sep 17 00:00:00 2001 From: "K.R. 
Zentner" Date: Wed, 16 Dec 2020 15:50:51 -0800 Subject: [PATCH 5/6] Torch VPG rework --- .../experiments/algos/ppo_garage_pytorch.py | 18 +- src/garage/torch/__init__.py | 6 +- src/garage/torch/_dtypes.py | 4 +- src/garage/torch/_functions.py | 55 ++++ src/garage/torch/algos/maml_ppo.py | 8 +- src/garage/torch/algos/maml_trpo.py | 8 +- src/garage/torch/algos/maml_vpg.py | 8 +- src/garage/torch/algos/ppo.py | 10 +- src/garage/torch/algos/td3.py | 2 +- src/garage/torch/algos/trpo.py | 34 +- src/garage/torch/algos/vpg.py | 309 +++++++++--------- src/garage/torch/optimizers/__init__.py | 9 +- .../conjugate_gradient_optimizer.py | 12 +- .../torch/optimizers/optimizer_wrapper.py | 63 ---- .../gaussian_mlp_value_function.py | 6 +- .../torch/value_functions/value_function.py | 4 +- tests/garage/torch/test_functions.py | 10 + 17 files changed, 289 insertions(+), 277 deletions(-) delete mode 100644 src/garage/torch/optimizers/optimizer_wrapper.py diff --git a/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py b/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py index dc205c3562..666afa1732 100644 --- a/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py +++ b/benchmarks/src/garage_benchmarks/experiments/algos/ppo_garage_pytorch.py @@ -6,7 +6,7 @@ from garage.experiment import deterministic from garage.sampler import RaySampler from garage.torch.algos import PPO as PyTorch_PPO -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer from garage.torch.policies import GaussianMLPPolicy as PyTorch_GMP from garage.torch.value_functions import GaussianMLPValueFunction from garage.trainer import Trainer @@ -45,15 +45,15 @@ def ppo_garage_pytorch(ctxt, env_id, seed): hidden_nonlinearity=torch.tanh, output_nonlinearity=None) - policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), - policy, - max_optimization_epochs=10, - minibatch_size=64) + policy_optimizer = MinibatchOptimizer((torch.optim.Adam, dict(lr=2.5e-4)), + policy, + max_optimization_epochs=10, + minibatch_size=64) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)), - value_function, - max_optimization_epochs=10, - minibatch_size=64) + vf_optimizer = MinibatchOptimizer((torch.optim.Adam, dict(lr=2.5e-4)), + value_function, + max_optimization_epochs=10, + minibatch_size=64) sampler = RaySampler(agents=policy, envs=env, diff --git a/src/garage/torch/__init__.py b/src/garage/torch/__init__.py index 71b119b7dd..8ac1bf783e 100644 --- a/src/garage/torch/__init__.py +++ b/src/garage/torch/__init__.py @@ -3,8 +3,9 @@ from garage.torch._dtypes import (ObservationBatch, ObservationOrder, ShuffledOptimizationNotSupported, observation_batch_to_packed_sequence) -from garage.torch._functions import (as_torch_dict, compute_advantages, - expand_var, filter_valids, flatten_batch, +from garage.torch._functions import (as_tensor, as_torch_dict, + compute_advantages, expand_var, + filter_valids, flatten_batch, flatten_to_single_vector, global_device, NonLinearity, np_to_torch, output_height_2d, output_width_2d, @@ -18,6 +19,7 @@ __all__ = [ 'NonLinearity', 'as_torch_dict', + 'as_tensor', 'compute_advantages', 'expand_var', 'filter_valids', diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py index f0be030ce4..3421bf8df3 100644 --- a/src/garage/torch/_dtypes.py +++ b/src/garage/torch/_dtypes.py @@ -79,12 +79,12 @@ def __new__(cls, observations, order, lengths=None): f'lengths has dtype 
{self.lengths.dtype}, but must have ' f'an integer dtype') total_size = sum(self.lengths) - if self.observations.shape[0] != total_size: + if self.shape[0] != total_size: raise ValueError( f'observations has batch size ' f'{self.observations.shape[0]}, but must have batch ' f'size {total_size} to match lengths') - assert self.observations.shape[0] == total_size + assert self.shape[0] == total_size elif self.lengths is not None: raise ValueError( f'lengths has value {self.lengths}, but must be None ' diff --git a/src/garage/torch/_functions.py b/src/garage/torch/_functions.py index 2c834ebfa6..1319c20da0 100644 --- a/src/garage/torch/_functions.py +++ b/src/garage/torch/_functions.py @@ -106,6 +106,48 @@ def compute_advantages(discount, gae_lambda, max_episode_length, baselines, return advantages +def discount_cumsum(x, discount): + discount_x = torch.full((len(x), ), + discount, + dtype=torch.float, + device=x.device) + discount_x[0] = 1.0 + filter = torch.cumprod(discount_x, dim=0) + pad = len(x) - 1 + # minibatch of 1, with 1 channel + filter = filter.reshape(1, 1, -1) + returns = F.conv1d(x.reshape(1, 1, -1), filter, stride=1, padding=pad) + returns = returns[0, 0, pad:] + return returns + + +def split_packed_tensor(t, lengths): + """Split a tensor using a sequence of (start, stop) tuples.""" + start = 0 + for length in lengths: + stop = start + length + yield t[start:stop] + start = stop + + +def pad_packed_tensor(t, lengths, max_length=None): + if max_length is None: + max_length = max(lengths) + if max(lengths) > max_length: + raise ValueError(f'packed tensor contains a sequence of length ' + f'{max(lengths)}, but was asked to pad to ' + f'length {max_length}') + out = torch.zeros(( + len(lengths), + max_length, + ) + t.shape[1:], + dtype=t.dtype, + device=t.device) + for i, seq in enumerate(split_packed_tensor(t, lengths)): + out[i][:len(seq)] = seq + return out + + def pad_to_last(nums, total_length, axis=-1, val=0): """Pad val to last in nums in given axis. @@ -383,6 +425,19 @@ def state_dict_to(state_dict, device): return state_dict +def as_tensor(data): + """Convert a list to a PyTorch tensor + + Args: + data (list): Data to convert to tensor + + Returns: + torch.Tensor: A float tensor + + """ + return torch.as_tensor(data, dtype=torch.float32, device=global_device()) + + # pylint: disable=W0223 class NonLinearity(nn.Module): """Wrapper class for non linear function or module. 
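To make the new helpers concrete, a small sketch of how the packed-tensor utilities and the torch discount_cumsum added above fit together (imports from garage.torch._functions mirror how vpg.py uses them; the numbers are placeholders):

import torch

from garage.torch import as_tensor
from garage.torch._functions import (discount_cumsum, pad_packed_tensor,
                                     split_packed_tensor)

lengths = [2, 3]
# Two episodes' rewards packed into one flat tensor of shape (5,).
rewards = as_tensor([1., 1., 1., 1., 1.])
returns = torch.cat([discount_cumsum(r, 0.9)
                     for r in split_packed_tensor(rewards, lengths)])
# returns is [1.9, 1.0, 2.71, 1.9, 1.0] -- per-episode discounted returns.
padded = pad_packed_tensor(rewards, lengths)  # shape (2, 3), zero-padded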
diff --git a/src/garage/torch/algos/maml_ppo.py b/src/garage/torch/algos/maml_ppo.py index 93e4e76145..b7627247fa 100644 --- a/src/garage/torch/algos/maml_ppo.py +++ b/src/garage/torch/algos/maml_ppo.py @@ -4,7 +4,7 @@ from garage import _Default from garage.torch.algos import PPO from garage.torch.algos.maml import MAML -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class MAMLPPO(MAML): @@ -70,10 +70,10 @@ def __init__(self, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = PPO(env.spec, policy, diff --git a/src/garage/torch/algos/maml_trpo.py b/src/garage/torch/algos/maml_trpo.py index b4236b4cba..f19a29a817 100644 --- a/src/garage/torch/algos/maml_trpo.py +++ b/src/garage/torch/algos/maml_trpo.py @@ -5,7 +5,7 @@ from garage.torch.algos import VPG from garage.torch.algos.maml import MAML from garage.torch.optimizers import (ConjugateGradientOptimizer, - OptimizerWrapper) + MinibatchOptimizer) class MAMLTRPO(MAML): @@ -71,10 +71,10 @@ def __init__(self, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = VPG(env.spec, policy, diff --git a/src/garage/torch/algos/maml_vpg.py b/src/garage/torch/algos/maml_vpg.py index cf32d8e6d5..ffb61a2e0f 100644 --- a/src/garage/torch/algos/maml_vpg.py +++ b/src/garage/torch/algos/maml_vpg.py @@ -4,7 +4,7 @@ from garage import _Default from garage.torch.algos import VPG from garage.torch.algos.maml import MAML -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class MAMLVPG(MAML): @@ -66,10 +66,10 @@ def __init__(self, num_grad_updates=1, meta_evaluator=None, evaluate_every_n_epochs=1): - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=inner_lr)), policy) - vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)), - value_function) + vf_optimizer = MinibatchOptimizer( + (torch.optim.Adam, dict(lr=inner_lr)), value_function) inner_algo = VPG(env.spec, policy, diff --git a/src/garage/torch/algos/ppo.py b/src/garage/torch/algos/ppo.py index 73668ac810..f3fc115daa 100644 --- a/src/garage/torch/algos/ppo.py +++ b/src/garage/torch/algos/ppo.py @@ -2,7 +2,7 @@ import torch from garage.torch.algos import VPG -from garage.torch.optimizers import OptimizerWrapper +from garage.torch.optimizers import MinibatchOptimizer class PPO(VPG): @@ -14,9 +14,9 @@ class PPO(VPG): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. 
lr_clip_range (float): The limit on the likelihood ratio between policies. @@ -63,13 +63,13 @@ def __init__(self, entropy_method='no_entropy'): if policy_optimizer is None: - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), policy, max_optimization_epochs=10, minibatch_size=64) if vf_optimizer is None: - vf_optimizer = OptimizerWrapper( + vf_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, diff --git a/src/garage/torch/algos/td3.py b/src/garage/torch/algos/td3.py index edb10083ff..ca833f41bb 100644 --- a/src/garage/torch/algos/td3.py +++ b/src/garage/torch/algos/td3.py @@ -84,9 +84,9 @@ def __init__( replay_buffer, sampler, *, # Everything after this is numbers. - max_episode_length_eval=None, grad_steps_per_env_step, exploration_policy, + max_episode_length_eval=None, uniform_random_policy=None, max_action=None, target_update_tau=0.005, diff --git a/src/garage/torch/algos/trpo.py b/src/garage/torch/algos/trpo.py index c2becfc1c9..a9fe5939aa 100644 --- a/src/garage/torch/algos/trpo.py +++ b/src/garage/torch/algos/trpo.py @@ -4,7 +4,7 @@ from garage.torch._functions import zero_optim_grads from garage.torch.algos import VPG from garage.torch.optimizers import (ConjugateGradientOptimizer, - OptimizerWrapper) + MinibatchOptimizer) class TRPO(VPG): @@ -16,9 +16,9 @@ class TRPO(VPG): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. num_train_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. @@ -62,11 +62,11 @@ def __init__(self, entropy_method='no_entropy'): if policy_optimizer is None: - policy_optimizer = OptimizerWrapper( + policy_optimizer = MinibatchOptimizer( (ConjugateGradientOptimizer, dict(max_constraint_value=0.01)), policy) if vf_optimizer is None: - vf_optimizer = OptimizerWrapper( + vf_optimizer = MinibatchOptimizer( (torch.optim.Adam, dict(lr=2.5e-4)), value_function, max_optimization_epochs=10, @@ -117,7 +117,8 @@ def _compute_objective(self, advantages, obs, actions, rewards): return surrogate - def _train_policy(self, obs, actions, rewards, advantages): + def _train_policy(self, observations, actions, rewards, advantages, + lengths): r"""Train the policy. Args: @@ -129,18 +130,19 @@ def _train_policy(self, obs, actions, rewards, advantages): with shape :math:`(N, )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of policy loss (float). 
""" - # pylint: disable=protected-access - zero_optim_grads(self._policy_optimizer._optimizer) - loss = self._compute_loss_with_adv(obs, actions, rewards, advantages) - loss.backward() - self._policy_optimizer.step( - f_loss=lambda: self._compute_loss_with_adv(obs, actions, rewards, - advantages), - f_constraint=lambda: self._compute_kl_constraint(obs)) - - return loss + data = { + 'observations': observations, + 'actions': actions, + 'rewards': rewards, + 'advantages': advantages, + 'lengths': lengths + } + f_constraint = lambda: self._compute_kl_constraint(observations) + return self._policy_optimizer.step(data, self._loss_function, + f_constraint) diff --git a/src/garage/torch/algos/vpg.py b/src/garage/torch/algos/vpg.py index 42a75444fb..3586b322f4 100644 --- a/src/garage/torch/algos/vpg.py +++ b/src/garage/torch/algos/vpg.py @@ -10,9 +10,11 @@ from garage import log_performance from garage.np import discount_cumsum from garage.np.algos import RLAlgorithm -from garage.torch import compute_advantages, filter_valids -from garage.torch._functions import np_to_torch, zero_optim_grads -from garage.torch.optimizers import OptimizerWrapper +from garage.torch import (as_tensor, compute_advantages, filter_valids, + global_device, ObservationBatch, ObservationOrder) +from garage.torch._functions import (np_to_torch, pad_packed_tensor, + split_packed_tensor) +from garage.torch.optimizers import MinibatchOptimizer class VPG(RLAlgorithm): @@ -26,11 +28,11 @@ class VPG(RLAlgorithm): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer + policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for policy. - vf_optimizer (garage.torch.optimizer.OptimizerWrapper): Optimizer for + vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for value function. - num_train_per_epoch (int): Number of train_once calls per epoch. + steps_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. gae_lambda (float): Lambda used for generalized advantage estimation. @@ -42,6 +44,8 @@ class VPG(RLAlgorithm): standardized before shifting. policy_ent_coeff (float): The coefficient of the policy entropy. Setting it to zero would mean no entropy regularization. + use_neg_logli_entropy (bool): Whether to estimate the entropy as the + negative log likelihood of the action. use_softplus_entropy (bool): Whether to estimate the softmax distribution of the entropy to prevent the entropy from being negative. 
@@ -62,15 +66,17 @@ def __init__( sampler, policy_optimizer=None, vf_optimizer=None, - num_train_per_epoch=1, + steps_per_epoch=1, discount=0.99, gae_lambda=1, center_adv=True, positive_adv=False, policy_ent_coeff=0.0, + use_neg_logli_entropy=True, use_softplus_entropy=False, - stop_entropy_gradient=False, + stop_entropy_gradient=True, entropy_method='no_entropy', + recurrent=None, ): self._discount = discount self.policy = policy @@ -83,8 +89,9 @@ def __init__( self._policy_ent_coeff = policy_ent_coeff self._use_softplus_entropy = use_softplus_entropy self._stop_entropy_gradient = stop_entropy_gradient + self._use_neg_logli_entropy = use_neg_logli_entropy self._entropy_method = entropy_method - self._n_samples = num_train_per_epoch + self._steps_per_epoch = steps_per_epoch self._env_spec = env_spec self._maximum_entropy = (entropy_method == 'max') @@ -93,19 +100,21 @@ def __init__( stop_entropy_gradient, policy_ent_coeff) self._episode_reward_mean = collections.deque(maxlen=100) - self._sampler = sampler + self.sampler = sampler if policy_optimizer: self._policy_optimizer = policy_optimizer else: - self._policy_optimizer = OptimizerWrapper(torch.optim.Adam, policy) + self._policy_optimizer = MinibatchOptimizer( + torch.optim.Adam, policy) if vf_optimizer: self._vf_optimizer = vf_optimizer else: - self._vf_optimizer = OptimizerWrapper(torch.optim.Adam, - value_function) + self._vf_optimizer = MinibatchOptimizer(torch.optim.Adam, + value_function) self._old_policy = copy.deepcopy(self.policy) + self._recurrent = recurrent @staticmethod def _check_entropy_configuration(entropy_method, center_adv, @@ -134,77 +143,71 @@ def discount(self): """ return self._discount - def _train_once(self, itr, eps): + def _train_once(self, eps): """Train the algorithm once. Args: - itr (int): Iteration number. eps (EpisodeBatch): A batch of collected paths. - Returns: - numpy.float64: Calculated mean value of undiscounted returns. - """ - obs = np_to_torch(eps.padded_observations) - rewards = np_to_torch(eps.padded_rewards) - returns = np_to_torch( - np.stack([ - discount_cumsum(reward, self.discount) - for reward in eps.padded_rewards - ])) - valids = eps.lengths - with torch.no_grad(): - baselines = self._value_function(obs) - + # Conver to torch and compute returns, etc. 
+ lengths = torch.from_numpy(eps.lengths).to(global_device()) + obs = ObservationBatch(as_tensor(eps.observations), + order=ObservationOrder.EPISODES, + lengths=lengths) + actions = torch.Tensor(eps.actions) + rewards = torch.Tensor(eps.rewards) + policy_entropies = self._compute_policy_entropy(obs, actions) if self._maximum_entropy: - policy_entropies = self._compute_policy_entropy(obs) rewards += self._policy_ent_coeff * policy_entropies + returns = torch.hstack([ + discount_cumsum(r, self._discount) + for r in split_packed_tensor(rewards, lengths) + ]) + with torch.no_grad(): + baselines = self._value_function(obs) + advantages = self._compute_advantage(rewards, lengths, baselines) - obs_flat = np_to_torch(eps.observations) - actions_flat = np_to_torch(eps.actions) - rewards_flat = np_to_torch(eps.rewards) - returns_flat = torch.cat(filter_valids(returns, valids)) - advs_flat = self._compute_advantage(rewards, valids, baselines) - + # Log before training with torch.no_grad(): - policy_loss_before = self._compute_loss_with_adv( - obs_flat, actions_flat, rewards_flat, advs_flat) - vf_loss_before = self._value_function.compute_loss( - obs_flat, returns_flat) - kl_before = self._compute_kl_constraint(obs) + policy_loss_before = self._loss_function(obs, actions, rewards, + advantages, lengths) + vf_loss_before = self._value_function.loss_function(obs, returns) - self._train(obs_flat, actions_flat, rewards_flat, returns_flat, - advs_flat) + with tabular.prefix(self.policy.name): + tabular.record('/LossBefore', policy_loss_before.item()) + tabular.record('/KLBefore', + self._compute_kl_constraint(obs).item()) + tabular.record('/EntropyBefore', + policy_entropies.mean().item()) - with torch.no_grad(): - policy_loss_after = self._compute_loss_with_adv( - obs_flat, actions_flat, rewards_flat, advs_flat) - vf_loss_after = self._value_function.compute_loss( - obs_flat, returns_flat) - kl_after = self._compute_kl_constraint(obs) - policy_entropy = self._compute_policy_entropy(obs) - - with tabular.prefix(self.policy.name): - tabular.record('/LossBefore', policy_loss_before.item()) - tabular.record('/LossAfter', policy_loss_after.item()) - tabular.record('/dLoss', - (policy_loss_before - policy_loss_after).item()) - tabular.record('/KLBefore', kl_before.item()) - tabular.record('/KL', kl_after.item()) - tabular.record('/Entropy', policy_entropy.mean().item()) - - with tabular.prefix(self._value_function.name): - tabular.record('/LossBefore', vf_loss_before.item()) - tabular.record('/LossAfter', vf_loss_after.item()) - tabular.record('/dLoss', - vf_loss_before.item() - vf_loss_after.item()) + with tabular.prefix(self._value_function.name): + tabular.record('/LossBefore', vf_loss_before.item()) + # Save the current policy state and train self._old_policy.load_state_dict(self.policy.state_dict()) + self._train_policy(obs, actions, rewards, returns, advantages, lengths) + self._train_value_function(obs, returns, lengths) + + # Log after training + with torch.no_grad(): - undiscounted_returns = log_performance(itr, - eps, - discount=self._discount) - return np.mean(undiscounted_returns) + policy_loss_after = self._loss_function(obs, actions, rewards, + advantages, lengths) + with tabular.prefix(self.policy.name): + tabular.record('/LossAfter', policy_loss_after.item()) + tabular.record('/dLoss', + (policy_loss_before - policy_loss_after).item()) + tabular.record('/KL', self._compute_kl_constraint(obs).item()) + tabular.record( + '/EntropyAfter', + self._compute_policy_entropy(obs, actions).mean().item()) 
+ + vf_loss_after = self._value_function.loss_function(obs, returns) + with tabular.prefix(self._value_function.name): + tabular.record('/vfLossAfter', vf_loss_after.item()) + tabular.record('/dLoss', + vf_loss_before.item() - vf_loss_after.item()) def train(self, trainer): """Obtain samplers and start actual training for each epoch. @@ -220,84 +223,73 @@ def train(self, trainer): """ last_return = None - for _ in trainer.step_epochs(): - for _ in range(self._n_samples): - eps = trainer.obtain_episodes(trainer.step_itr) - last_return = self._train_once(trainer.step_itr, eps) - trainer.step_itr += 1 - + for epoch in trainer.step_epochs(): + for _ in range(self._steps_per_epoch): + trainer.step_path = trainer.obtain_episodes(epoch) + self._train_once(trainer.step_path) + last_return = np.mean( + log_performance(epoch, + trainer.step_path, + discount=self._discount)) return last_return - def _train(self, obs, actions, rewards, returns, advs): - r"""Train the policy and value function with minibatch. - - Args: - obs (torch.Tensor): Observation from the environment with shape - :math:`(N, O*)`. - actions (torch.Tensor): Actions fed to the environment with shape - :math:`(N, A*)`. - rewards (torch.Tensor): Acquired rewards with shape :math:`(N, )`. - returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. - advs (torch.Tensor): Advantage value at each step with shape - :math:`(N, )`. - - """ - for dataset in self._policy_optimizer.get_minibatch( - obs, actions, rewards, advs): - self._train_policy(*dataset) - for dataset in self._vf_optimizer.get_minibatch(obs, returns): - self._train_value_function(*dataset) - - def _train_policy(self, obs, actions, rewards, advantages): + def _train_policy(self, observations, actions, rewards, returns, + advantages, lengths): r"""Train the policy. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N, A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N, )`. + returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of policy loss (float). """ - # pylint: disable=protected-access - zero_optim_grads(self._policy_optimizer._optimizer) - loss = self._compute_loss_with_adv(obs, actions, rewards, advantages) - loss.backward() - self._policy_optimizer.step() - - return loss - - def _train_value_function(self, obs, returns): + data = { + 'observations': observations, + 'actions': actions, + 'rewards': rewards, + 'advantages': advantages, + 'lengths': lengths + } + return self._policy_optimizer.step(data, self._loss_function) + + def _train_value_function(self, observations, returns, lengths): r"""Train the value function. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. + lengths (torch.Tensor): Lengths of episodes. Returns: torch.Tensor: Calculated mean scalar value of value function loss (float). 
""" - # pylint: disable=protected-access - zero_optim_grads(self._vf_optimizer._optimizer) - loss = self._value_function.compute_loss(obs, returns) - loss.backward() - self._vf_optimizer.step() - - return loss - - def _compute_loss(self, obs, actions, rewards, valids, baselines): + data = { + 'observations': observations, + 'returns': returns, + 'lengths': lengths + } + return self._vf_optimizer.step(data, + self._value_function.loss_function) + + def _compute_loss(self, obs, actions, rewards, lengths, baselines): r"""Compute mean value of loss. + Note that this function is private, but used by MAML. + Notes: P is the maximum episode length (self.max_episode_length) Args: @@ -307,7 +299,7 @@ def _compute_loss(self, obs, actions, rewards, valids, baselines): with shape :math:`(N, P, A*)`. rewards (torch.Tensor): Acquired rewards with shape :math:`(N, P)`. - valids (list[int]): Numbers of valid steps in each episode + lengths (list[int]): Numbers of valid steps in each episode baselines (torch.Tensor): Value function estimation at each step with shape :math:`(N, P)`. @@ -316,19 +308,24 @@ def _compute_loss(self, obs, actions, rewards, valids, baselines): objective (float). """ - obs_flat = torch.cat(filter_valids(obs, valids)) - actions_flat = torch.cat(filter_valids(actions, valids)) - rewards_flat = torch.cat(filter_valids(rewards, valids)) - advantages_flat = self._compute_advantage(rewards, valids, baselines) - - return self._compute_loss_with_adv(obs_flat, actions_flat, - rewards_flat, advantages_flat) - - def _compute_loss_with_adv(self, obs, actions, rewards, advantages): + obs_flat = torch.cat(filter_valids(obs, lengths)) + actions_flat = torch.cat(filter_valids(actions, lengths)) + rewards_flat = torch.cat(filter_valids(rewards, lengths)) + advantages_flat = self._compute_advantage(rewards, lengths, baselines) + + return self._loss_function(obs_flat, actions_flat, rewards_flat, + advantages_flat, lengths) + + def _loss_function(self, + observations, + actions, + rewards, + advantages, + lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. actions (torch.Tensor): Actions fed to the environment with shape :math:`(N \dot [T], A*)`. @@ -336,50 +333,56 @@ def _compute_loss_with_adv(self, obs, actions, rewards, advantages): with shape :math:`(N \dot [T], )`. advantages (torch.Tensor): Advantage value at each step with shape :math:`(N \dot [T], )`. + lengths (torch.Tensor or None): Lengths of episodes, if operating + on full episodes. Returns: torch.Tensor: Calculated negative mean scalar value of objective. """ - objectives = self._compute_objective(advantages, obs, actions, rewards) + objectives = self._compute_objective(advantages, observations, actions, + rewards) if self._entropy_regularzied: - policy_entropies = self._compute_policy_entropy(obs) + policy_entropies = self._compute_policy_entropy( + observations, actions) objectives += self._policy_ent_coeff * policy_entropies return -objectives.mean() - def _compute_advantage(self, rewards, valids, baselines): + def _compute_advantage(self, rewards, lengths, baselines): r"""Compute mean value of loss. - Notes: P is the maximum episode length (self.max_episode_length) - Args: - rewards (torch.Tensor): Acquired rewards - with shape :math:`(N, P)`. 
- valids (list[int]): Numbers of valid steps in each episode - baselines (torch.Tensor): Value function estimation at each step - with shape :math:`(N, P)`. + rewards (torch.Tensor): Packed acquired rewards + with shape :math:`(N \bullet [T])`. + lengths (list[int]): Numbers of valid steps in each episode + baselines (torch.Tensor): Packed value function estimation of + returns with shape :math:`(N \bullet [T])`. Returns: torch.Tensor: Calculated advantage values given rewards and baselines with shape :math:`(N \dot [T], )`. """ - advantages = compute_advantages(self._discount, self._gae_lambda, - self.max_episode_length, baselines, - rewards) - advantage_flat = torch.cat(filter_valids(advantages, valids)) + padded_rewards = pad_packed_tensor(rewards, lengths) + padded_baselines = pad_packed_tensor(baselines, lengths) + padded_advantages = compute_advantages(self._discount, + self._gae_lambda, + self.max_episode_length, + padded_baselines, + padded_rewards) + advantages = torch.cat(filter_valids(padded_advantages, lengths)) if self._center_adv: - means = advantage_flat.mean() - variance = advantage_flat.var() - advantage_flat = (advantage_flat - means) / (variance + 1e-8) + means = advantages.mean() + variance = advantages.var() + advantages = (advantages - means) / (variance + 1e-8) if self._positive_adv: - advantage_flat -= advantage_flat.min() + advantages -= advantages.min() - return advantage_flat + return advantages def _compute_kl_constraint(self, obs): r"""Compute KL divergence. @@ -408,25 +411,27 @@ def _compute_kl_constraint(self, obs): return kl_constraint.mean() - def _compute_policy_entropy(self, obs): + def _compute_policy_entropy(self, obs, actions): r"""Compute entropy value of probability distribution. Notes: P is the maximum episode length (self.max_episode_length) Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N, P, O*)`. + actions (torch.Tensor): Actions fed to the environment + with shape :math:`(N \dot [T], A*)`. Returns: torch.Tensor: Calculated entropy values given observation with shape :math:`(N, P)`. 
""" - if self._stop_entropy_gradient: - with torch.no_grad(): + with torch.set_grad_enabled(not self._stop_entropy_gradient): + if self._use_neg_logli_entropy: + policy_entropy = -self.policy(obs)[0].log_prob(actions) + else: policy_entropy = self.policy(obs)[0].entropy() - else: - policy_entropy = self.policy(obs)[0].entropy() # This prevents entropy from becoming negative for small policy std if self._use_softplus_entropy: diff --git a/src/garage/torch/optimizers/__init__.py b/src/garage/torch/optimizers/__init__.py index bc21022af4..bf9dec1f12 100644 --- a/src/garage/torch/optimizers/__init__.py +++ b/src/garage/torch/optimizers/__init__.py @@ -2,8 +2,9 @@ from garage.torch.optimizers.conjugate_gradient_optimizer import ( ConjugateGradientOptimizer) from garage.torch.optimizers.differentiable_sgd import DifferentiableSGD -from garage.torch.optimizers.optimizer_wrapper import OptimizerWrapper +from garage.torch.optimizers.episode_batch_optimizer import ( + EpisodeBatchOptimizer) +from garage.torch.optimizers.minibatch_optimizer import MinibatchOptimizer +from garage.torch.optimizers.optimizer import Optimizer -__all__ = [ - 'OptimizerWrapper', 'ConjugateGradientOptimizer', 'DifferentiableSGD' -] +__all__ = ['Optimizer', 'ConjugateGradientOptimizer', 'DifferentiableSGD'] diff --git a/src/garage/torch/optimizers/conjugate_gradient_optimizer.py b/src/garage/torch/optimizers/conjugate_gradient_optimizer.py index 489587e672..0157d2c908 100644 --- a/src/garage/torch/optimizers/conjugate_gradient_optimizer.py +++ b/src/garage/torch/optimizers/conjugate_gradient_optimizer.py @@ -138,12 +138,12 @@ def __init__(self, self._hvp_reg_coeff = hvp_reg_coeff self._accept_violation = accept_violation - def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ + def step(self, loss_function, constraint_function): # pylint: disable=arguments-differ """Take an optimization step. Args: - f_loss (callable): Function to compute the loss. - f_constraint (callable): Function to compute the constraint value. + loss_function (callable): Function to compute the loss. + constraint_function (callable): Function to compute the constraint value. """ # Collect trainable parameters and gradients @@ -157,7 +157,7 @@ def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ flat_loss_grads = torch.cat(grads) # Build Hessian-vector-product function - f_Ax = _build_hessian_vector_product(f_constraint, params, + f_Ax = _build_hessian_vector_product(constraint_function, params, self._hvp_reg_coeff) # Compute step direction @@ -177,8 +177,8 @@ def step(self, f_loss, f_constraint): # pylint: disable=arguments-differ descent_step = step_size * step_dir # Update parameters using backtracking line search - self._backtracking_line_search(params, descent_step, f_loss, - f_constraint) + self._backtracking_line_search(params, descent_step, loss_function, + constraint_function) @property def state(self): diff --git a/src/garage/torch/optimizers/optimizer_wrapper.py b/src/garage/torch/optimizers/optimizer_wrapper.py deleted file mode 100644 index 9f69ce565d..0000000000 --- a/src/garage/torch/optimizers/optimizer_wrapper.py +++ /dev/null @@ -1,63 +0,0 @@ -"""A PyTorch optimizer wrapper that compute loss and optimize module.""" -from garage import make_optimizer -from garage.np.optimizers import BatchDataset - - -class OptimizerWrapper: - """A wrapper class to handle torch.optim.optimizer. - - Args: - optimizer (Union[type, tuple[type, dict]]): Type of optimizer - for policy. 
This can be an optimizer type such as - `torch.optim.Adam` or a tuple of type and dictionary, where - dictionary contains arguments to initialize the optimizer. - e.g. `(torch.optim.Adam, {'lr' : 1e-3})` - Sample strategy to be used when sampling a new task. - module (torch.nn.Module): Module to be optimized. - max_optimization_epochs (int): Maximum number of epochs for update. - minibatch_size (int): Batch size for optimization. - - """ - - def __init__(self, - optimizer, - module, - max_optimization_epochs=1, - minibatch_size=None): - self._optimizer = make_optimizer(optimizer, module=module) - self._max_optimization_epochs = max_optimization_epochs - self._minibatch_size = minibatch_size - - def get_minibatch(self, *inputs): - r"""Yields a batch of inputs. - - Notes: P is the size of minibatch (self._minibatch_size) - - Args: - *inputs (list[torch.Tensor]): A list of inputs. Each input has - shape :math:`(N \dot [T], *)`. - - Yields: - list[torch.Tensor]: A list batch of inputs. Each batch has shape - :math:`(P, *)`. - - """ - batch_dataset = BatchDataset(inputs, self._minibatch_size) - - for _ in range(self._max_optimization_epochs): - for dataset in batch_dataset.iterate(): - yield dataset - - def zero_grad(self): - r"""Clears the gradients of all optimized :class:`torch.Tensor` s.""" - self._optimizer.zero_grad() - - def step(self, **closure): - """Performs a single optimization step. - - Arguments: - **closure (callable, optional): A closure that reevaluates the - model and returns the loss. - - """ - self._optimizer.step(**closure) diff --git a/src/garage/torch/value_functions/gaussian_mlp_value_function.py b/src/garage/torch/value_functions/gaussian_mlp_value_function.py index d340fef2a9..7f0670841c 100644 --- a/src/garage/torch/value_functions/gaussian_mlp_value_function.py +++ b/src/garage/torch/value_functions/gaussian_mlp_value_function.py @@ -78,11 +78,11 @@ def __init__(self, std_parameterization='exp', layer_normalization=layer_normalization) - def compute_loss(self, obs, returns): + def loss_function(self, observations, returns, lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. @@ -91,7 +91,7 @@ def compute_loss(self, obs, returns): objective (float). """ - dist = self.module(obs) + dist = self.module(observations) ll = dist.log_prob(returns.reshape(-1, 1)) loss = -ll.mean() return loss diff --git a/src/garage/torch/value_functions/value_function.py b/src/garage/torch/value_functions/value_function.py index 1cc533f33d..7aa54e7968 100644 --- a/src/garage/torch/value_functions/value_function.py +++ b/src/garage/torch/value_functions/value_function.py @@ -20,11 +20,11 @@ def __init__(self, env_spec, name): self.name = name @abc.abstractmethod - def compute_loss(self, obs, returns): + def loss_function(self, observations, returns, lengths=None): r"""Compute mean value of loss. Args: - obs (torch.Tensor): Observation from the environment + observations (torch.Tensor): Observation from the environment with shape :math:`(N \dot [T], O*)`. returns (torch.Tensor): Acquired returns with shape :math:`(N, )`. 
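To make the renamed interface concrete, a short sketch of how a caller is expected to drive loss_function through the reworked optimizer, mirroring _train_value_function above (vf_optimizer is assumed to be a MinibatchOptimizer wrapping value_function):

data = {
    'observations': observations,
    'returns': returns,
    'lengths': lengths,
}
vf_loss = vf_optimizer.step(data, value_function.loss_function)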
diff --git a/tests/garage/torch/test_functions.py b/tests/garage/torch/test_functions.py index b89b0c0042..0235a68f54 100644 --- a/tests/garage/torch/test_functions.py +++ b/tests/garage/torch/test_functions.py @@ -10,6 +10,7 @@ from garage.envs import GymEnv, normalize from garage.experiment.deterministic import set_seed +from garage.np import discount_cumsum as np_discout_cumsum from garage.torch import (as_torch_dict, compute_advantages, flatten_to_single_vector, global_device, pad_to_last, product_of_gaussians, set_gpu_mode, state_dict_to, @@ -129,6 +130,15 @@ def test_state_dict_to(): assert np.all( [moved_state_dict[key].is_cuda for key in moved_state_dict.keys()]) +def test_discount_cumsum(): + discount = 0.99 + x = tensor([9.3217, 9.3003, 9.3406, 9.2251, 9.0715, 9.0134, 8.9026, + 8.6619]) + returns = discount_cumsum(x, discount) + expected = np_discout_cumsum(torch_to_np(x), discount) + assert returns.shape == (len(x), ) + assert np.allclose(expected, torch_to_np(returns) + class TestTorchAlgoUtils(TfGraphTestCase): """Test class for torch algo utility functions.""" From bca7b682598cd598ec427bbfbe9f04b47ad630f6 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" Date: Wed, 16 Dec 2020 15:50:51 -0800 Subject: [PATCH 6/6] Torch VPG rework WIP torch optimizer refactor WIP torch optimizer refactor WIP --- src/garage/examples/torch/vpg_pendulum.py | 8 +- src/garage/torch/_dtypes.py | 42 ++++++- src/garage/torch/_functions.py | 13 ++- src/garage/torch/algos/vpg.py | 31 +++-- src/garage/torch/optimizers/__init__.py | 11 +- .../optimizers/episode_batch_optimizer.py | 110 ++++++++++++++++++ .../torch/optimizers/minibatch_optimizer.py | 87 ++++++++++++++ src/garage/torch/optimizers/optimizer.py | 43 +++++++ .../optimizers/single_batch_optimizer.py | 45 +++++++ tests/garage/torch/test_functions.py | 7 +- 10 files changed, 371 insertions(+), 26 deletions(-) create mode 100644 src/garage/torch/optimizers/episode_batch_optimizer.py create mode 100644 src/garage/torch/optimizers/minibatch_optimizer.py create mode 100644 src/garage/torch/optimizers/optimizer.py create mode 100644 src/garage/torch/optimizers/single_batch_optimizer.py diff --git a/src/garage/examples/torch/vpg_pendulum.py b/src/garage/examples/torch/vpg_pendulum.py index 59e4e008a6..5ca60a23b9 100755 --- a/src/garage/examples/torch/vpg_pendulum.py +++ b/src/garage/examples/torch/vpg_pendulum.py @@ -11,7 +11,7 @@ from garage import wrap_experiment from garage.envs import GymEnv from garage.experiment.deterministic import set_seed -from garage.sampler import RaySampler +from garage.sampler import LocalSampler, RaySampler from garage.torch.algos import VPG from garage.torch.policies import GaussianMLPPolicy from garage.torch.value_functions import GaussianMLPValueFunction @@ -44,9 +44,9 @@ def vpg_pendulum(ctxt=None, seed=1): hidden_nonlinearity=torch.tanh, output_nonlinearity=None) - sampler = RaySampler(agents=policy, - envs=env, - max_episode_length=env.spec.max_episode_length) + sampler = LocalSampler(agents=policy, + envs=env, + max_episode_length=env.spec.max_episode_length) algo = VPG(env_spec=env.spec, policy=policy, diff --git a/src/garage/torch/_dtypes.py b/src/garage/torch/_dtypes.py index 3421bf8df3..aa4254d473 100644 --- a/src/garage/torch/_dtypes.py +++ b/src/garage/torch/_dtypes.py @@ -29,7 +29,7 @@ class ObservationOrder(enum.IntEnum): EPISODES = 2 -@dataclass(init=False) +@dataclass(init=False, eq=False) class ObservationBatch(torch.Tensor): r"""The (differentiable) input to all pytorch policies. 
@@ -91,6 +91,24 @@ def __new__(cls, observations, order, lengths=None): f'when order == {self.order}') return self + def __repr__(self): + return f'{type(self).__name__}({super().__repr__()}, order={self.order!r}, lengths={self.lengths!r})' + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + # print(f"func: {func.__name__}, args: {args!r}, kwargs: {kwargs!r}") + if kwargs is None: + kwargs = {} + result = super().__torch_function__(func, types, args, kwargs) + # Fixup ObservationBatch instances returned from methods. + # In the future this might preserve order for some methods + if isinstance(result, ObservationBatch): + if not hasattr(result, 'order'): + result.order = ObservationOrder.SHUFFLED + if not hasattr(result, 'lengths'): + result.lengths = None + return result + def observation_batch_to_packed_sequence(observations): """Turn ObservationBatch into a torch.nn.utils.rnn.PackedSequence. @@ -128,3 +146,25 @@ def observation_batch_to_packed_sequence(observations): start = stop pack_sequence = nn.utils.rnn.pack_sequence return pack_sequence(sequence, enforce_sorted=False) + + +def is_policy_recurrent(policy, env_spec): + """Check if a torch policy is recurrent. + + Args: + policy (garage.torch.Policy): Policy that might be recurrent. + + Returns: + bool: If policy is recurrent. + + """ + try: + policy.forward( + as_tensor([ + env_spec.observation_space.sample(), + env_spec.observation_space.sample() + ])) + except ShuffledOptimizationNotSupported: + return True + else: + return False diff --git a/src/garage/torch/_functions.py b/src/garage/torch/_functions.py index 1319c20da0..8b864cb29b 100644 --- a/src/garage/torch/_functions.py +++ b/src/garage/torch/_functions.py @@ -111,13 +111,14 @@ def discount_cumsum(x, discount): discount, dtype=torch.float, device=x.device) - discount_x[0] = 1.0 + # discount_x[0] = 1.0 filter = torch.cumprod(discount_x, dim=0) - pad = len(x) - 1 - # minibatch of 1, with 1 channel - filter = filter.reshape(1, 1, -1) - returns = F.conv1d(x.reshape(1, 1, -1), filter, stride=1, padding=pad) - returns = returns[0, 0, pad:] + returns = F.conv1d(x, filter, stride=1) + assert returns.shape == (len(x), ) + from garage.np import discount_cumsum as np_discout_cumsum + import numpy as np + expected = np_discout_cumsum(torch_to_np(x), discount) + assert np.array_equal(expected, torch_to_np(returns)) return returns diff --git a/src/garage/torch/algos/vpg.py b/src/garage/torch/algos/vpg.py index 3586b322f4..be68d3110c 100644 --- a/src/garage/torch/algos/vpg.py +++ b/src/garage/torch/algos/vpg.py @@ -8,13 +8,12 @@ import torch.nn.functional as F from garage import log_performance -from garage.np import discount_cumsum from garage.np.algos import RLAlgorithm from garage.torch import (as_tensor, compute_advantages, filter_valids, global_device, ObservationBatch, ObservationOrder) -from garage.torch._functions import (np_to_torch, pad_packed_tensor, +from garage.torch._functions import (discount_cumsum, pad_packed_tensor, split_packed_tensor) -from garage.torch.optimizers import MinibatchOptimizer +from garage.torch.optimizers import MinibatchOptimizer, SingleBatchOptimizer class VPG(RLAlgorithm): @@ -28,9 +27,9 @@ class VPG(RLAlgorithm): value_function (garage.torch.value_functions.ValueFunction): The value function. sampler (garage.sampler.Sampler): Sampler. - policy_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer + policy_optimizer (garage.torch.optimizer.Optimizer): Optimizer for policy. 
- vf_optimizer (garage.torch.optimizer.MinibatchOptimizer): Optimizer for + vf_optimizer (garage.torch.optimizer.Optimizer): Optimizer for value function. steps_per_epoch (int): Number of train_once calls per epoch. discount (float): Discount. @@ -100,21 +99,29 @@ def __init__( stop_entropy_gradient, policy_ent_coeff) self._episode_reward_mean = collections.deque(maxlen=100) - self.sampler = sampler + self._sampler = sampler + if recurrent is None: + recurrent = is_policy_recurrent(policy) + self._recurrent = recurrent if policy_optimizer: self._policy_optimizer = policy_optimizer + elif self._recurrent: + self._policy_optimizer = EpisodeBatchOptimizer( + torch.optim.Adam, policy) else: - self._policy_optimizer = MinibatchOptimizer( + self._policy_optimizer = SingleBatchOptimizer( torch.optim.Adam, policy) if vf_optimizer: self._vf_optimizer = vf_optimizer + elif self._recurrent: + self._vf_optimizer = EpisodeBatchOptimizer(torch.optim.Adam, + value_function) else: self._vf_optimizer = MinibatchOptimizer(torch.optim.Adam, value_function) self._old_policy = copy.deepcopy(self.policy) - self._recurrent = recurrent @staticmethod def _check_entropy_configuration(entropy_method, center_adv, @@ -225,7 +232,7 @@ def train(self, trainer): for epoch in trainer.step_epochs(): for _ in range(self._steps_per_epoch): - trainer.step_path = trainer.obtain_episodes(epoch) + trainer.step_path = self._sampler.obtain_episodes(epoch) self._train_once(trainer.step_path) last_return = np.mean( log_performance(epoch, @@ -258,8 +265,9 @@ def _train_policy(self, observations, actions, rewards, returns, 'actions': actions, 'rewards': rewards, 'advantages': advantages, - 'lengths': lengths } + if not isinstance(self._policy_optimizer, MinibatchOptimizer): + data['lengths'] = lengths return self._policy_optimizer.step(data, self._loss_function) def _train_value_function(self, observations, returns, lengths): @@ -348,7 +356,8 @@ def _loss_function(self, observations, actions) objectives += self._policy_ent_coeff * policy_entropies - return -objectives.mean() + loss = -objectives.mean() + return loss def _compute_advantage(self, rewards, lengths, baselines): r"""Compute mean value of loss. 
diff --git a/src/garage/torch/optimizers/__init__.py b/src/garage/torch/optimizers/__init__.py index bf9dec1f12..5fcfe21392 100644 --- a/src/garage/torch/optimizers/__init__.py +++ b/src/garage/torch/optimizers/__init__.py @@ -1,4 +1,5 @@ """PyTorch optimizers.""" +# yapf: disable from garage.torch.optimizers.conjugate_gradient_optimizer import ( ConjugateGradientOptimizer) from garage.torch.optimizers.differentiable_sgd import DifferentiableSGD @@ -6,5 +7,13 @@ EpisodeBatchOptimizer) from garage.torch.optimizers.minibatch_optimizer import MinibatchOptimizer from garage.torch.optimizers.optimizer import Optimizer +from garage.torch.optimizers.single_batch_optimizer import SingleBatchOptimizer -__all__ = ['Optimizer', 'ConjugateGradientOptimizer', 'DifferentiableSGD'] +__all__ = [ + 'ConjugateGradientOptimizer', + 'DifferentiableSGD', + 'EpisodeBatchOptimizer', + 'MinibatchOptimizer', + 'Optimizer', + 'SingleBatchOptimizer', +] diff --git a/src/garage/torch/optimizers/episode_batch_optimizer.py b/src/garage/torch/optimizers/episode_batch_optimizer.py new file mode 100644 index 0000000000..2a3ba7c8f8 --- /dev/null +++ b/src/garage/torch/optimizers/episode_batch_optimizer.py @@ -0,0 +1,110 @@ +"""Optimizer that runs a torch optimizer on full episodes.""" +import click +import numpy as np + +from garage import make_optimizer +from garage.torch import (as_tensor, ObservationBatch, ObservationOrder, + split_packed_tensor) +from garage.torch.optimizers.optimizer import Optimizer + + +class EpisodeBatchOptimizer(Optimizer): + """Optimizer that runs a torch optimizer on full episodes. + + Args: + optimizer (Union[type, tuple[type, dict]]): Type of optimizer + for policy. This can be an optimizer type such as + `torch.optim.Adam` or a tuple of type and dictionary, where + dictionary contains arguments to initialize the optimizer. + e.g. `(torch.optim.Adam, {'lr' : 1e-3})` + Sample strategy to be used when sampling a new task. + module (torch.nn.Module): Module to be optimized. + max_optimization_epochs (int): Maximum number of epochs for update. + minibatch_size (int): Batch size for optimization. + + """ + + def __init__(self, + optimizer, + module, + max_optimization_epochs=1000, + minibatch_size=32): + super().__init__(module) + self._optimizer = make_optimizer(optimizer, module=module) + self._max_optimization_epochs = max_optimization_epochs + self._minibatch_size = minibatch_size + + def _minibatches(self, data_by_episode, lengths): + r"""Yields a batch of inputs. + + Notes: P is the size of minibatch (self._minibatch_size) + + Args: + data_by_episode (dict[str, list[torch.Tensor]]): Dictionary of + data, where each data array has been split by episode. + lengths (list[int]): Length of each episode in data. + + Yields: + dict[str, torch.Tensor]: Batch of inputs to pass to loss function. 
+
+        """
+        episode_indices = np.arange(len(lengths))
+        i = 0
+        with click.progressbar(range(self._max_optimization_epochs),
+                               label='Optimizing') as pbar:
+            for _ in pbar:
+                batch_size = 0
+                batch = {k: [] for k in data_by_episode.keys()}
+                batch_lengths = []
+                while sum(batch_lengths) < self._minibatch_size:
+                    if i == 0:
+                        np.random.shuffle(episode_indices)
+                    for k, v in data_by_episode.items():
+                        batch[k].append(v[episode_indices[i]])
+                    batch_lengths.append(lengths[episode_indices[i]])
+                    i = (i + 1) % len(lengths)
+                batch = {k: as_tensor(v) for (k, v) in batch.items()}
+                batch['observations'] = ObservationBatch(
+                    batch['observations'], ObservationOrder.EPISODES,
+                    batch_lengths)
+                batch['lengths'] = as_tensor(batch_lengths)
+                yield batch
+
+    def step(self, data, loss_function):
+        """Use `data` to minimize `loss_function`.
+
+        Note that data may be operated on in optimizer specific ways, and
+        loss_function may be called multiple times.
+
+        Args:
+            data (dict[str, torch.Tensor]): Data to feed into the loss
+                function. May be operated on before feeding. Must contain the
+                key 'lengths'.
+            loss_function (dict[str, torch.Tensor] -> torch.Tensor):
+                Differentiable loss function to optimize.
+
+        Returns:
+            float: Average value of loss_function over data.
+
+        """
+        if 'observations' not in data:
+            raise ValueError('observations must be in data for '
+                             'EpisodeBatchOptimizer')
+        try:
+            lengths = data['lengths']
+        except KeyError:
+            try:
+                lengths = data['observations'].lengths
+            except AttributeError:
+                raise ValueError('EpisodeBatchOptimizer must have lengths in '
+                                 'data or observations must be an '
+                                 'ObservationBatch')
+        data_by_episode = {
+            k: split_packed_tensor(v, lengths)
+            for (k, v) in data.items() if k != 'lengths'
+        }
+        for batch in self._minibatches(data_by_episode, lengths):
+            self._optimizer.zero_grad()
+            loss = loss_function(**batch)
+            loss.backward()
+            self._optimizer.step()
diff --git a/src/garage/torch/optimizers/minibatch_optimizer.py b/src/garage/torch/optimizers/minibatch_optimizer.py
new file mode 100644
index 0000000000..430de561fa
--- /dev/null
+++ b/src/garage/torch/optimizers/minibatch_optimizer.py
@@ -0,0 +1,87 @@
+"""A garage optimizer that optimizes using minibatches."""
+import click
+import numpy as np
+
+from garage import make_optimizer
+from garage.torch.optimizers.optimizer import Optimizer
+
+
+class MinibatchOptimizer(Optimizer):
+    """Optimizer that runs a torch.optim.Optimizer on minibatches.
+
+    Args:
+        optimizer (Union[type, tuple[type, dict]]): Type of optimizer
+            for policy. This can be an optimizer type such as
+            `torch.optim.Adam` or a tuple of type and dictionary, where
+            dictionary contains arguments to initialize the optimizer.
+            e.g. `(torch.optim.Adam, {'lr' : 1e-3})`
+            Sample strategy to be used when sampling a new task.
+        module (torch.nn.Module): Module to be optimized.
+        max_optimization_epochs (int): Maximum number of times to iterate
+            through all samples.
+        minibatch_size (int): Batch size for optimization. If a single large
+            batch is desired, consider using SingleBatchOptimizer instead.
+
+    """
+
+    def __init__(self,
+                 optimizer,
+                 module,
+                 max_optimization_epochs=1,
+                 minibatch_size=32):
+        super().__init__(module)
+        self._optimizer = make_optimizer(optimizer, module=module)
+        self._max_optimization_epochs = max_optimization_epochs
+        self._minibatch_size = minibatch_size
+
+    def _minibatches(self, n_samples, data):
+        r"""Yields a batch of inputs.
+
+        Notes: P is the size of minibatch (self._minibatch_size)
+
+        Args:
+            n_samples (int): Total number of samples in data.
+ data (dict[str, torch.Tensor]): Data to sample into batches. Each + tensor has shape :math:`(N \dot [T], *)`. + + Yields: + dict[str, torch.Tensor]: Batch of inputs to pass to loss function. + + """ + assert n_samples == len(next(iter(data.values()))) + with click.progressbar(range(self._max_optimization_epochs), + label='Optimizing') as pbar: + for _ in pbar: + all_indices = np.arange(n_samples) + np.random.shuffle(all_indices) + split = np.array_split( + all_indices, np.ceil(n_samples / self._minibatch_size)) + for minibatch_indices in split: + yield {k: v[minibatch_indices] for (k, v) in data.items()} + + def step(self, data, loss_function): + """Use `data` to minimize `loss_function`. + + Note that data may be operated on in optimizer specific ways, and + loss_function may be called multiple times. + + Args: + data (dict[str, torch.Tensor]): Data to feed into the loss + function. May be operated on before feeding. + loss_function (dict[str, torch.Tensor] -> torch.Tensor): + Differentiable loss function to optimize. + + Returns: + float: Average value of loss_function over data. + + """ + if 'lengths' in data: + del data['lengths'] + n_samples = [len(v) for v in data.values()] + assert all(n == n_samples[0] for n in n_samples) + + for i, batch in enumerate(self._minibatches(n_samples[0], data)): + self._optimizer.zero_grad() + loss = loss_function(**batch) + loss.backward() + self._optimizer.step() diff --git a/src/garage/torch/optimizers/optimizer.py b/src/garage/torch/optimizers/optimizer.py new file mode 100644 index 0000000000..6bc303e0b4 --- /dev/null +++ b/src/garage/torch/optimizers/optimizer.py @@ -0,0 +1,43 @@ +import abc + + +class Optimizer(metaclass=abc.ABCMeta): + """Base class of optimizers in garage.torch. + + This class exists and differs from torch.optim.Optimizer for a few reasons: + - Several optimizers (DifferentiableSGD, ConjugateGradientOptimizer) + need access to the module they're optimizing as a whole, not just to + their parameters. + - The torch Optimizer class was not designed to be inherited from, and + sometimes breaks base classes in difficult to detect ways. + - The torch Optimizer API is large and not very easy to implement. If + the whole API is needed, one of garage's Optimizer wrappers can be + used instead. + - We want our optimizer API to handle mini-batching, since it makes the + relationship between PPO and TRPO simpler to implement and explain. + + Args: + module (torch.nn.Module): The neural network to optimize. + + """ + + def __init__(self, module): + self._module = module + + @abc.abstractmethod + def step(self, data, loss_function): + """Use `data` to minimize `loss_function`. + + Note that data may be operated on in optimizer specific ways, and + loss_function may be called multiple times. + + Args: + data (dict[str, torch.Tensor]): Data to feed into the loss + function. May be operated on before feeding. + loss_function (dict[str, torch.Tensor] -> torch.Tensor): + Differentiable loss function to optimize. + + Returns: + float: Average value of loss_function over data. 
+ + """ diff --git a/src/garage/torch/optimizers/single_batch_optimizer.py b/src/garage/torch/optimizers/single_batch_optimizer.py new file mode 100644 index 0000000000..b88465bc0b --- /dev/null +++ b/src/garage/torch/optimizers/single_batch_optimizer.py @@ -0,0 +1,45 @@ +"""A garage optimizer that optimizes using a single large batch of SGD.""" +import numpy as np + +from garage import make_optimizer +from garage.torch.optimizers.optimizer import Optimizer + + +class SingleBatchOptimizer(Optimizer): + """Optimizer that runs a torch.optim.Optimizer a single batch. + + Args: + optimizer (Union[type, tuple[type, dict]]): Type of optimizer + for policy. This can be an optimizer type such as + `torch.optim.Adam` or a tuple of type and dictionary, where + dictionary contains arguments to initialize the optimizer. + e.g. `(torch.optim.Adam, {'lr' : 1e-3})` + Sample strategy to be used when sampling a new task. + module (torch.nn.Module): Module to be optimized. + + """ + + def __init__(self, optimizer, module): + super().__init__(module) + self._optimizer = make_optimizer(optimizer, module=module) + + def step(self, data, loss_function): + """Use `data` to minimize `loss_function`. + + Note that data may be operated on in optimizer specific ways, and + loss_function may be called multiple times. + + Args: + data (dict[str, torch.Tensor]): Data to feed into the loss + function. May be operated on before feeding. + loss_function (dict[str, torch.Tensor] -> torch.Tensor): + Differentiable loss function to optimize. + + Returns: + float: Average value of loss_function over data. + + """ + self._optimizer.zero_grad() + loss = loss_function(**data) + loss.backward() + self._optimizer.step() diff --git a/tests/garage/torch/test_functions.py b/tests/garage/torch/test_functions.py index 0235a68f54..34d1d76165 100644 --- a/tests/garage/torch/test_functions.py +++ b/tests/garage/torch/test_functions.py @@ -10,6 +10,7 @@ from garage.envs import GymEnv, normalize from garage.experiment.deterministic import set_seed + from garage.np import discount_cumsum as np_discout_cumsum from garage.torch import (as_torch_dict, compute_advantages, flatten_to_single_vector, global_device, pad_to_last, @@ -130,14 +131,14 @@ def test_state_dict_to(): assert np.all( [moved_state_dict[key].is_cuda for key in moved_state_dict.keys()]) + def test_discount_cumsum(): discount = 0.99 - x = tensor([9.3217, 9.3003, 9.3406, 9.2251, 9.0715, 9.0134, 8.9026, - 8.6619]) + x = torch.tensor([5., 10, 20, 100, 0.5, 0.5, 0.5, 0.5, 1000]) returns = discount_cumsum(x, discount) expected = np_discout_cumsum(torch_to_np(x), discount) assert returns.shape == (len(x), ) - assert np.allclose(expected, torch_to_np(returns) + assert np.allclose(expected, torch_to_np(returns)) class TestTorchAlgoUtils(TfGraphTestCase):
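To make the new optimizer API concrete, here is a minimal usage sketch, assuming
the patches above are applied. The tiny value network, the learning rate, the
batch sizes, and the synthetic data are illustrative stand-ins only; the calling
convention itself (a data dict passed to `step()`, whose keys become the keyword
arguments of the loss function) follows the VPG changes above. A recurrent
module would use `EpisodeBatchOptimizer` instead, which additionally expects a
`lengths` entry or an `ObservationBatch` with `order == ObservationOrder.EPISODES`.

    import torch

    from garage.torch.optimizers import MinibatchOptimizer

    # Illustrative stand-in for a garage value function; any torch.nn.Module
    # with matching input/output shapes works here.
    value_function = torch.nn.Sequential(torch.nn.Linear(4, 32),
                                         torch.nn.Tanh(),
                                         torch.nn.Linear(32, 1))

    # Optimizer spec uses the (type, kwargs) tuple convention accepted by
    # make_optimizer; the learning rate is an arbitrary example value.
    optimizer = MinibatchOptimizer((torch.optim.Adam, {'lr': 1e-3}),
                                   value_function,
                                   max_optimization_epochs=4,
                                   minibatch_size=64)


    def loss_function(observations, returns):
        # Keys of the data dict below arrive here as keyword arguments.
        error = value_function(observations).squeeze(-1) - returns
        return (error**2).mean()


    # Synthetic stand-in data; VPG builds the equivalent dict from sampled
    # episodes (observations, returns, advantages, ...).
    data = {
        'observations': torch.randn(256, 4),
        'returns': torch.randn(256),
    }
    optimizer.step(data, loss_function)

`SingleBatchOptimizer` accepts the same `step(data, loss_function)` call but
consumes the whole batch in one gradient step, which is how VPG optimizes a
non-recurrent policy by default in this series.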