From e6a68c18cc8bb55e036c5a284ec9c49e0f9fa23f Mon Sep 17 00:00:00 2001 From: Sam Sunde Date: Tue, 16 Apr 2024 23:06:30 -0700 Subject: [PATCH] tensorflow==2.* adamoptimizer --- docs/utils/logger.rst | 8 +- spinup/algos/tf1/ddpg/ddpg.py | 50 +++++------ spinup/algos/tf1/sac/sac.py | 58 ++++++------- spinup/algos/tf1/td3/td3.py | 64 +++++++-------- spinup/examples/tf1/pg_math/1_simple_pg.py | 4 +- spinup/examples/tf1/pg_math/2_rtg_pg.py | 4 +- spinup/examples/tf1/train_mnist.py | 8 +- .../tf1/problem_set_1/exercise1_3.py | 82 +++++++++---------- spinup/utils/mpi_tf.py | 8 +- 9 files changed, 143 insertions(+), 143 deletions(-) diff --git a/docs/utils/logger.rst b/docs/utils/logger.rst index a4f545ed9..8e50050c1 100644 --- a/docs/utils/logger.rst +++ b/docs/utils/logger.rst @@ -55,8 +55,8 @@ Next, let's look at a full training procedure with the logger embedded, to highl # Simple script for training an MLP on MNIST. - def train_mnist(steps_per_epoch=100, epochs=5, - lr=1e-3, layers=2, hidden_size=64, + def train_mnist(steps_per_epoch=100, epochs=5, + lr=1e-3, layers=2, hidden_size=64, logger_kwargs=dict(), save_freq=1): logger = EpochLogger(**logger_kwargs) @@ -76,14 +76,14 @@ Next, let's look at a full training procedure with the logger embedded, to highl y = tf.one_hot(y_ph, 10) loss = tf.losses.softmax_cross_entropy(y, logits) acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32)) - train_op = tf.train.AdamOptimizer().minimize(loss) + train_op = tf.optimizers.Adam().minimize(loss) # Prepare session sess = tf.Session() sess.run(tf.global_variables_initializer()) # Setup model saving - logger.setup_tf_saver(sess, inputs={'x': x_ph}, + logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'logits': logits, 'predict': predict}) start_time = time.time() diff --git a/spinup/algos/tf1/ddpg/ddpg.py b/spinup/algos/tf1/ddpg/ddpg.py index 90dabaa2d..86b952357 100644 --- a/spinup/algos/tf1/ddpg/ddpg.py +++ b/spinup/algos/tf1/ddpg/ddpg.py @@ -39,10 +39,10 @@ def sample_batch(self, batch_size=32): -def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, - steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, - polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, - update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, +def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, + steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, + polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, + update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Deep Deterministic Policy Gradient (DDPG) @@ -52,8 +52,8 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. 
- actor_critic: A function which takes in placeholder symbols - for state, ``x_ph``, and action, ``a_ph``, and returns the main + actor_critic: A function which takes in placeholder symbols + for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== @@ -61,20 +61,20 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. - ``q`` (batch,) | Gives the current estimate of Q* for + ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. - ``q_pi`` (batch,) | Gives the composition of ``q`` and - | ``pi`` for states in ``x_ph``: + ``q_pi`` (batch,) | Gives the composition of ``q`` and + | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== - ac_kwargs (dict): Any kwargs appropriate for the actor_critic + ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. - steps_per_epoch (int): Number of steps of interaction (state-action pairs) + steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. @@ -83,14 +83,14 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gamma (float): Discount factor. (Always between 0 and 1.) - polyak (float): Interpolation factor in polyak averaging for target - networks. Target networks are updated towards main networks + polyak (float): Interpolation factor in polyak averaging for target + networks. Target networks are updated towards main networks according to: - .. math:: \\theta_{\\text{targ}} \\leftarrow + .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually + where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. @@ -107,11 +107,11 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, is full enough for useful updates. update_every (int): Number of env interactions that should elapse - between gradient descent updates. Note: Regardless of how long - you wait between updates, the ratio of env steps to gradient steps + between gradient descent updates. Note: Regardless of how long + you wait between updates, the ratio of env steps to gradient steps is locked to 1. - act_noise (float): Stddev for Gaussian exploration noise added to + act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) num_test_episodes (int): Number of episodes to test the deterministic @@ -148,10 +148,10 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) - + # Target networks with tf.variable_scope('target'): - # Note that the action placeholder going to actor_critic here is + # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). 
pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) @@ -170,8 +170,8 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, q_loss = tf.reduce_mean((q-backup)**2) # Separate train ops for pi, q - pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) - q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) + pi_optimizer = tf.optimizers.Adam(learning_rate=pi_lr) + q_optimizer = tf.optimizers.Adam(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) @@ -214,8 +214,8 @@ def test_agent(): for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions - # from a uniform distribution for better exploration. Afterwards, - # use the learned policy (with some noise, via act_noise). + # from a uniform distribution for better exploration. Afterwards, + # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: @@ -234,7 +234,7 @@ def test_agent(): # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) - # Super critical, easy to overlook step: make sure to update + # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 diff --git a/spinup/algos/tf1/sac/sac.py b/spinup/algos/tf1/sac/sac.py index e31a09dd2..30bedcdc7 100644 --- a/spinup/algos/tf1/sac/sac.py +++ b/spinup/algos/tf1/sac/sac.py @@ -39,10 +39,10 @@ def sample_batch(self, batch_size=32): -def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, - steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, - polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, - update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, +def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, + steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, + polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, + update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Soft Actor-Critic (SAC) @@ -52,8 +52,8 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. - actor_critic: A function which takes in placeholder symbols - for state, ``x_ph``, and action, ``a_ph``, and returns the main + actor_critic: A function which takes in placeholder symbols + for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== @@ -61,27 +61,27 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. - ``pi`` (batch, act_dim) | Samples actions from policy given + ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. - ``q1`` (batch,) | Gives one estimate of Q* for + ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. 
- ``q2`` (batch,) | Gives another estimate of Q* for + ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. =========== ================ ====================================== - ac_kwargs (dict): Any kwargs appropriate for the actor_critic + ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. - steps_per_epoch (int): Number of steps of interaction (state-action pairs) + steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. @@ -90,19 +90,19 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gamma (float): Discount factor. (Always between 0 and 1.) - polyak (float): Interpolation factor in polyak averaging for target - networks. Target networks are updated towards main networks + polyak (float): Interpolation factor in polyak averaging for target + networks. Target networks are updated towards main networks according to: - .. math:: \\theta_{\\text{targ}} \\leftarrow + .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually + where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). - alpha (float): Entropy regularization coefficient. (Equivalent to + alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. @@ -115,8 +115,8 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, is full enough for useful updates. update_every (int): Number of env interactions that should elapse - between gradient descent updates. Note: Regardless of how long - you wait between updates, the ratio of env steps to gradient steps + between gradient descent updates. Note: Regardless of how long + you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic @@ -160,7 +160,7 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # get actions and log probs of actions for next states, for Q-learning _, pi_next, logp_pi_next, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) - + # Target value network with tf.variable_scope('target'): # target q values, using actions from *current* policy @@ -186,14 +186,14 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss - # Policy train op + # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) - pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) + pi_optimizer = tf.optimizers.Adam(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) - value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) + value_optimizer = tf.optimizers.Adam(learning_rate=lr) value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) @@ -205,7 +205,7 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step - step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, + step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, train_pi_op, train_value_op, target_update] # Initializing targets to match main variables @@ -217,7 +217,7 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, sess.run(target_init) # Setup model saving - logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, + logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2}) def get_action(o, deterministic=False): @@ -228,7 +228,7 @@ def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not(d or (ep_len == max_ep_len)): - # Take deterministic actions at test time + # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 @@ -242,7 +242,7 @@ def test_agent(): for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions - # from a uniform distribution for better exploration. Afterwards, + # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) @@ -262,7 +262,7 @@ def test_agent(): # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) - # Super critical, easy to overlook step: make sure to update + # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 @@ -303,8 +303,8 @@ def test_agent(): logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) - logger.log_tabular('Q1Vals', with_min_and_max=True) - logger.log_tabular('Q2Vals', with_min_and_max=True) + logger.log_tabular('Q1Vals', with_min_and_max=True) + logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) diff --git a/spinup/algos/tf1/td3/td3.py b/spinup/algos/tf1/td3/td3.py index 32257e990..8c3b46fe8 100644 --- a/spinup/algos/tf1/td3/td3.py +++ b/spinup/algos/tf1/td3/td3.py @@ -39,11 +39,11 @@ def sample_batch(self, batch_size=32): -def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, - steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, - polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, - update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, - noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, +def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, + steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, + polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, + update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, + noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) @@ -53,8 +53,8 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. - actor_critic: A function which takes in placeholder symbols - for state, ``x_ph``, and action, ``a_ph``, and returns the main + actor_critic: A function which takes in placeholder symbols + for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== @@ -62,23 +62,23 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. - ``q1`` (batch,) | Gives one estimate of Q* for + ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. - ``q2`` (batch,) | Gives another estimate of Q* for + ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. - ``q1_pi`` (batch,) | Gives the composition of ``q1`` and - | ``pi`` for states in ``x_ph``: + ``q1_pi`` (batch,) | Gives the composition of ``q1`` and + | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== - ac_kwargs (dict): Any kwargs appropriate for the actor_critic + ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. - steps_per_epoch (int): Number of steps of interaction (state-action pairs) + steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. 
@@ -87,14 +87,14 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gamma (float): Discount factor. (Always between 0 and 1.) - polyak (float): Interpolation factor in polyak averaging for target - networks. Target networks are updated towards main networks + polyak (float): Interpolation factor in polyak averaging for target + networks. Target networks are updated towards main networks according to: - .. math:: \\theta_{\\text{targ}} \\leftarrow + .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually + where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. @@ -111,20 +111,20 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, is full enough for useful updates. update_every (int): Number of env interactions that should elapse - between gradient descent updates. Note: Regardless of how long - you wait between updates, the ratio of env steps to gradient steps + between gradient descent updates. Note: Regardless of how long + you wait between updates, the ratio of env steps to gradient steps is locked to 1. - - act_noise (float): Stddev for Gaussian exploration noise added to + + act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) - target_noise (float): Stddev for smoothing noise added to target + target_noise (float): Stddev for smoothing noise added to target policy. - noise_clip (float): Limit for absolute value of target policy + noise_clip (float): Limit for absolute value of target policy smoothing noise. - policy_delay (int): Policy will only be updated once every + policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic @@ -161,11 +161,11 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs) - + # Target policy network with tf.variable_scope('target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) - + # Target Q networks with tf.variable_scope('target', reuse=True): @@ -196,8 +196,8 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, q_loss = q1_loss + q2_loss # Separate train ops for pi, q - pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) - q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) + pi_optimizer = tf.optimizers.Adam(learning_rate=pi_lr) + q_optimizer = tf.optimizers.Adam(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) @@ -239,8 +239,8 @@ def test_agent(): for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions - # from a uniform distribution for better exploration. Afterwards, - # use the learned policy (with some noise, via act_noise). + # from a uniform distribution for better exploration. Afterwards, + # use the learned policy (with some noise, via act_noise). 
if t > start_steps: a = get_action(o, act_noise) else: @@ -259,7 +259,7 @@ def test_agent(): # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) - # Super critical, easy to overlook step: make sure to update + # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 @@ -326,7 +326,7 @@ def test_agent(): from spinup.utils.run_utils import setup_logger_kwargs logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) - + td3(lambda : gym.make(args.env), actor_critic=core.mlp_actor_critic, ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma, seed=args.seed, epochs=args.epochs, diff --git a/spinup/examples/tf1/pg_math/1_simple_pg.py b/spinup/examples/tf1/pg_math/1_simple_pg.py index 0921447f7..975a2bc94 100644 --- a/spinup/examples/tf1/pg_math/1_simple_pg.py +++ b/spinup/examples/tf1/pg_math/1_simple_pg.py @@ -9,7 +9,7 @@ def mlp(x, sizes, activation=tf.tanh, output_activation=None): x = tf.layers.dense(x, units=size, activation=activation) return tf.layers.dense(x, units=sizes[-1], activation=output_activation) -def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, +def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, epochs=50, batch_size=5000, render=False): # make environment, check spaces, get obs / act dims @@ -37,7 +37,7 @@ def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, loss = -tf.reduce_mean(weights_ph * log_probs) # make train op - train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) + train_op = tf.optimizers.Adam(learning_rate=lr).minimize(loss) sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) diff --git a/spinup/examples/tf1/pg_math/2_rtg_pg.py b/spinup/examples/tf1/pg_math/2_rtg_pg.py index 933d30eb4..d6d86b6a7 100644 --- a/spinup/examples/tf1/pg_math/2_rtg_pg.py +++ b/spinup/examples/tf1/pg_math/2_rtg_pg.py @@ -16,7 +16,7 @@ def reward_to_go(rews): rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) return rtgs -def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, +def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, epochs=50, batch_size=5000, render=False): # make environment, check spaces, get obs / act dims @@ -44,7 +44,7 @@ def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, loss = -tf.reduce_mean(weights_ph * log_probs) # make train op - train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) + train_op = tf.optimizers.Adam(learning_rate=lr).minimize(loss) sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) diff --git a/spinup/examples/tf1/train_mnist.py b/spinup/examples/tf1/train_mnist.py index dc8f5077d..64c0a34f2 100644 --- a/spinup/examples/tf1/train_mnist.py +++ b/spinup/examples/tf1/train_mnist.py @@ -11,8 +11,8 @@ def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): # Simple script for training an MLP on MNIST. 
-def train_mnist(steps_per_epoch=100, epochs=5, - lr=1e-3, layers=2, hidden_size=64, +def train_mnist(steps_per_epoch=100, epochs=5, + lr=1e-3, layers=2, hidden_size=64, logger_kwargs=dict(), save_freq=1): logger = EpochLogger(**logger_kwargs) @@ -32,14 +32,14 @@ def train_mnist(steps_per_epoch=100, epochs=5, y = tf.one_hot(y_ph, 10) loss = tf.losses.softmax_cross_entropy(y, logits) acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32)) - train_op = tf.train.AdamOptimizer().minimize(loss) + train_op = tf.optimizers.Adam().minimize(loss) # Prepare session sess = tf.Session() sess.run(tf.global_variables_initializer()) # Setup model saving - logger.setup_tf_saver(sess, inputs={'x': x_ph}, + logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'logits': logits, 'predict': predict}) start_time = time.time() diff --git a/spinup/exercises/tf1/problem_set_1/exercise1_3.py b/spinup/exercises/tf1/problem_set_1/exercise1_3.py index 2d073ef03..8913d951e 100644 --- a/spinup/exercises/tf1/problem_set_1/exercise1_3.py +++ b/spinup/exercises/tf1/problem_set_1/exercise1_3.py @@ -16,7 +16,7 @@ As starter code, you are given the entirety of the TD3 algorithm except for the computation graph. Find "YOUR CODE HERE" to begin. -To clarify: you will not write an "actor_critic" function for this +To clarify: you will not write an "actor_critic" function for this exercise. But you will use one to build the graph for computing the TD3 updates. @@ -55,11 +55,11 @@ def sample_batch(self, batch_size=32): -def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, - steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, - polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, - update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, - noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, +def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, + steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, + polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, + update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, + noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) @@ -69,8 +69,8 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. - actor_critic: A function which takes in placeholder symbols - for state, ``x_ph``, and action, ``a_ph``, and returns the main + actor_critic: A function which takes in placeholder symbols + for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== @@ -78,23 +78,23 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. - ``q1`` (batch,) | Gives one estimate of Q* for + ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. - ``q2`` (batch,) | Gives another estimate of Q* for + ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. 
- ``q1_pi`` (batch,) | Gives the composition of ``q1`` and - | ``pi`` for states in ``x_ph``: + ``q1_pi`` (batch,) | Gives the composition of ``q1`` and + | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== - ac_kwargs (dict): Any kwargs appropriate for the actor_critic + ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. - steps_per_epoch (int): Number of steps of interaction (state-action pairs) + steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. @@ -103,14 +103,14 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gamma (float): Discount factor. (Always between 0 and 1.) - polyak (float): Interpolation factor in polyak averaging for target - networks. Target networks are updated towards main networks + polyak (float): Interpolation factor in polyak averaging for target + networks. Target networks are updated towards main networks according to: - .. math:: \\theta_{\\text{targ}} \\leftarrow + .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually + where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. @@ -127,20 +127,20 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, is full enough for useful updates. update_every (int): Number of env interactions that should elapse - between gradient descent updates. Note: Regardless of how long - you wait between updates, the ratio of env steps to gradient steps + between gradient descent updates. Note: Regardless of how long + you wait between updates, the ratio of env steps to gradient steps is locked to 1. - - act_noise (float): Stddev for Gaussian exploration noise added to + + act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) - target_noise (float): Stddev for smoothing noise added to target + target_noise (float): Stddev for smoothing noise added to target policy. - noise_clip (float): Limit for absolute value of target policy + noise_clip (float): Limit for absolute value of target policy smoothing noise. - policy_delay (int): Policy will only be updated once every + policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. 
num_test_episodes (int): Number of episodes to test the deterministic @@ -187,9 +187,9 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # YOUR CODE HERE # # # ####################### - # pi, q1, q2, q1_pi = + # pi, q1, q2, q1_pi = pass - + # Target policy network with tf.variable_scope('target'): ####################### @@ -199,7 +199,7 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, ####################### # pi_targ = pass - + # Target Q networks with tf.variable_scope('target', reuse=True): @@ -238,10 +238,10 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # YOUR CODE HERE # # # ####################### - # pi_loss = - # q1_loss = - # q2_loss = - # q_loss = + # pi_loss = + # q1_loss = + # q2_loss = + # q_loss = #=========================================================================# # # @@ -250,8 +250,8 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, #=========================================================================# # Separate train ops for pi, q - pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) - q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) + pi_optimizer = tf.optimizers.Adam(learning_rate=pi_lr) + q_optimizer = tf.optimizers.Adam(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) @@ -293,8 +293,8 @@ def test_agent(): for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions - # from a uniform distribution for better exploration. Afterwards, - # use the learned policy (with some noise, via act_noise). + # from a uniform distribution for better exploration. Afterwards, + # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: @@ -313,7 +313,7 @@ def test_agent(): # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) - # Super critical, easy to overlook step: make sure to update + # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 @@ -379,15 +379,15 @@ def test_agent(): logger_kwargs = setup_logger_kwargs(args.exp_name + '-' + args.env.lower(), args.seed) all_kwargs = dict( - env_fn=lambda : gym.make(args.env), + env_fn=lambda : gym.make(args.env), actor_critic=core.mlp_actor_critic, - ac_kwargs=dict(hidden_sizes=[128,128]), + ac_kwargs=dict(hidden_sizes=[128,128]), max_ep_len=150, - seed=args.seed, + seed=args.seed, logger_kwargs=logger_kwargs, epochs=10 ) - + if args.use_soln: true_td3(**all_kwargs) else: diff --git a/spinup/utils/mpi_tf.py b/spinup/utils/mpi_tf.py index 96cbcf5e0..a2320adce 100644 --- a/spinup/utils/mpi_tf.py +++ b/spinup/utils/mpi_tf.py @@ -26,12 +26,12 @@ def sync_all_params(): return sync_params(tf.global_variables()) -class MpiAdamOptimizer(tf.train.AdamOptimizer): +class MpiAdamOptimizer(tf.optimizers.Adam): """ Adam optimizer that averages gradients across MPI processes. - The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. - For documentation on method arguments, see the Tensorflow docs page for + The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. + For documentation on method arguments, see the Tensorflow docs page for the base `AdamOptimizer`_. .. 
_`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py @@ -40,7 +40,7 @@ class MpiAdamOptimizer(tf.train.AdamOptimizer): def __init__(self, **kwargs): self.comm = MPI.COMM_WORLD - tf.train.AdamOptimizer.__init__(self, **kwargs) + tf.optimizers.Adam.__init__(self, **kwargs) def compute_gradients(self, loss, var_list, **kwargs): """
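
---

Notes on the tf.train.AdamOptimizer -> tf.optimizers.Adam change (reference sketches, not part of the diff):

The scripts touched by this patch build static graphs with tf.placeholder / tf.Session and call
minimize(loss, var_list=...) on the optimizer object. tf.train.AdamOptimizer (still reachable in
TF 2.x as tf.compat.v1.train.AdamOptimizer) accepts a loss tensor and returns a training op;
tf.optimizers.Adam resolves to the Keras optimizer, which expects an explicit var_list for
minimize() and is normally driven eagerly with a GradientTape. A minimal sketch of the two
calling styles on a toy objective; the variable w and the quadratic loss are illustrative
stand-ins, not code from this repo:

    import tensorflow as tf

    w = tf.Variable([1.0, 2.0])                     # stand-in for a network's trainable weights

    # Keras-style optimizer (what tf.optimizers.Adam names in TF 2.x), driven with a GradientTape:
    opt = tf.optimizers.Adam(learning_rate=1e-3)
    for _ in range(3):
        with tf.GradientTape() as tape:
            loss = tf.reduce_sum(w ** 2)            # toy quadratic objective
        grads = tape.gradient(loss, [w])            # gradients w.r.t. the listed variables
        opt.apply_gradients(zip(grads, [w]))        # one Adam step

    # The graph-mode pattern used throughout these scripts, available under TF 2.x as:
    #   train_op = tf.compat.v1.train.AdamOptimizer(1e-3).minimize(loss, var_list=[w])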
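
The target-network update described in the ddpg/sac/td3 docstrings,
theta_targ <- rho * theta_targ + (1 - rho) * theta, translates directly into variable
assignments. A minimal eager-mode sketch with a single stand-in parameter; v_main and v_targ
are illustrative, not the get_vars('main') / get_vars('target') collections used in the code:

    import tensorflow as tf

    polyak = 0.995                       # default interpolation factor used by these algorithms
    v_main = tf.Variable([0.5, -0.2])    # stand-in for a main-network parameter
    v_targ = tf.Variable([0.0, 0.0])     # stand-in for the matching target-network parameter

    # theta_targ <- rho * theta_targ + (1 - rho) * theta
    v_targ.assign(polyak * v_targ + (1 - polyak) * v_main)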
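
The td3 docstring's target_noise and noise_clip arguments describe clipped Gaussian smoothing
noise added to the target policy's action when forming the Q backup. A small NumPy sketch of
that smoothing step; smoothed_target_action and act_limit are illustrative names, not functions
or values taken from spinup:

    import numpy as np

    def smoothed_target_action(pi_targ, target_noise=0.2, noise_clip=0.5, act_limit=1.0):
        # clip the Gaussian noise, then clip the perturbed action to the valid range
        eps = np.clip(np.random.normal(0.0, target_noise, np.shape(pi_targ)),
                      -noise_clip, noise_clip)
        return np.clip(np.asarray(pi_targ) + eps, -act_limit, act_limit)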
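
MpiAdamOptimizer's docstring says it averages gradients across MPI processes via a
compute_gradients override borrowed from Baselines. The core of that idea, sketched with
mpi4py on a flat NumPy gradient vector; mpi_avg_grad is an illustrative helper, not the
spinup or Baselines implementation:

    import numpy as np
    from mpi4py import MPI

    def mpi_avg_grad(flat_grad):
        # sum the local gradient vectors across all ranks, then divide by the number of ranks
        comm = MPI.COMM_WORLD
        buf = np.zeros_like(flat_grad)
        comm.Allreduce(flat_grad, buf, op=MPI.SUM)
        return buf / comm.Get_size()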