openai · samsunde · Apr 17, 2024
diff --git a/docs/utils/logger.rst b/docs/utils/logger.rst
@@ -55,8 +55,8 @@ Next, let's look at a full training procedure with the logger embedded, to highl
 
 
  # Simple script for training an MLP on MNIST.
- def train_mnist(steps_per_epoch=100, epochs=5, 
- lr=1e-3, layers=2, hidden_size=64, 
+ def train_mnist(steps_per_epoch=100, epochs=5,
+ lr=1e-3, layers=2, hidden_size=64,
  logger_kwargs=dict(), save_freq=1):
 
  logger = EpochLogger(**logger_kwargs)
@@ -76,14 +76,14 @@ Next, let's look at a full training procedure with the logger embedded, to highl
  y = tf.one_hot(y_ph, 10)
  loss = tf.losses.softmax_cross_entropy(y, logits)
  acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32))
- train_op = tf.train.AdamOptimizer().minimize(loss)
+ train_op = tf.optimizers.Adam().minimize(loss)
 
  # Prepare session
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
 
  # Setup model saving
- logger.setup_tf_saver(sess, inputs={'x': x_ph}, 
+ logger.setup_tf_saver(sess, inputs={'x': x_ph},
  outputs={'logits': logits, 'predict': predict})
 
  start_time = time.time()

diff --git a/spinup/algos/tf1/ddpg/ddpg.py b/spinup/algos/tf1/ddpg/ddpg.py
@@ -39,10 +39,10 @@ def sample_batch(self, batch_size=32):
 
 
 
-def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 
- steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 
- polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 
- update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, 
+def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
+ steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99,
+ polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
+ update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10,
  max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
  """
  Deep Deterministic Policy Gradient (DDPG)
@@ -52,29 +52,29 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
  env_fn : A function which creates a copy of the environment.
  The environment must satisfy the OpenAI Gym API.
 
- actor_critic: A function which takes in placeholder symbols 
- for state, ``x_ph``, and action, ``a_ph``, and returns the main 
+ actor_critic: A function which takes in placeholder symbols
+ for state, ``x_ph``, and action, ``a_ph``, and returns the main
  outputs from the agent's Tensorflow computation graph:
 
  =========== ================ ======================================
  Symbol Shape Description
  =========== ================ ======================================
  ``pi`` (batch, act_dim) | Deterministically computes actions
  | from policy given states.
- ``q`` (batch,) | Gives the current estimate of Q* for 
+ ``q`` (batch,) | Gives the current estimate of Q* for
  | states in ``x_ph`` and actions in
  | ``a_ph``.
- ``q_pi`` (batch,) | Gives the composition of ``q`` and 
- | ``pi`` for states in ``x_ph``: 
+ ``q_pi`` (batch,) | Gives the composition of ``q`` and
+ | ``pi`` for states in ``x_ph``:
  | q(x, pi(x)).
  =========== ================ ======================================
 
- ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
+ ac_kwargs (dict): Any kwargs appropriate for the actor_critic
  function you provided to DDPG.
 
  seed (int): Seed for random number generators.
 
- steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
+ steps_per_epoch (int): Number of steps of interaction (state-action pairs)
  for the agent and the environment in each epoch.
 
  epochs (int): Number of epochs to run and train agent.
@@ -83,14 +83,14 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
 
  gamma (float): Discount factor. (Always between 0 and 1.)
 
- polyak (float): Interpolation factor in polyak averaging for target 
- networks. Target networks are updated towards main networks 
+ polyak (float): Interpolation factor in polyak averaging for target
+ networks. Target networks are updated towards main networks
  according to:
 
- .. math:: \\theta_{\\text{targ}} \\leftarrow 
+ .. math:: \\theta_{\\text{targ}} \\leftarrow
  \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
 
- where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
+ where :math:`\\rho` is polyak. (Always between 0 and 1, usually
  close to 1.)
 
  pi_lr (float): Learning rate for policy.
@@ -107,11 +107,11 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
  is full enough for useful updates.
 
  update_every (int): Number of env interactions that should elapse
- between gradient descent updates. Note: Regardless of how long 
- you wait between updates, the ratio of env steps to gradient steps 
+ between gradient descent updates. Note: Regardless of how long
+ you wait between updates, the ratio of env steps to gradient steps
  is locked to 1.
 
- act_noise (float): Stddev for Gaussian exploration noise added to 
+ act_noise (float): Stddev for Gaussian exploration noise added to
  policy at training time. (At test time, no noise is added.)
 
  num_test_episodes (int): Number of episodes to test the deterministic
@@ -148,10 +148,10 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
  # Main outputs from computation graph
  with tf.variable_scope('main'):
  pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)
- 
+
  # Target networks
  with tf.variable_scope('target'):
- # Note that the action placeholder going to actor_critic here is 
+ # Note that the action placeholder going to actor_critic here is
  # irrelevant, because we only need q_targ(s, pi_targ(s)).
  pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)
 
@@ -170,8 +170,8 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
  q_loss = tf.reduce_mean((q-backup)**2)
 
  # Separate train ops for pi, q
- pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
- q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
+ pi_optimizer = tf.optimizers.Adam(learning_rate=pi_lr)
+ q_optimizer = tf.optimizers.Adam(learning_rate=q_lr)
  train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
  train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))
 
@@ -214,8 +214,8 @@ def test_agent():
  for t in range(total_steps):
 
  # Until start_steps have elapsed, randomly sample actions
- # from a uniform distribution for better exploration. Afterwards, 
- # use the learned policy (with some noise, via act_noise). 
+ # from a uniform distribution for better exploration. Afterwards,
+ # use the learned policy (with some noise, via act_noise).
  if t > start_steps:
  a = get_action(o, act_noise)
  else:
@@ -234,7 +234,7 @@ def test_agent():
  # Store experience to replay buffer
  replay_buffer.store(o, a, r, o2, d)
 
- # Super critical, easy to overlook step: make sure to update 
+ # Super critical, easy to overlook step: make sure to update
  # most recent observation!
  o = o2
 

diff --git a/spinup/algos/tf1/sac/sac.py b/spinup/algos/tf1/sac/sac.py
@@ -39,10 +39,10 @@ def sample_batch(self, batch_size=32):
 
 
 
-def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 
- steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 
- polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, 
- update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, 
+def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
+ steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99,
+ polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000,
+ update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000,
  logger_kwargs=dict(), save_freq=1):
  """
  Soft Actor-Critic (SAC)
@@ -52,36 +52,36 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
  env_fn : A function which creates a copy of the environment.
  The environment must satisfy the OpenAI Gym API.
 
- actor_critic: A function which takes in placeholder symbols 
- for state, ``x_ph``, and action, ``a_ph``, and returns the main 
+ actor_critic: A function which takes in placeholder symbols
+ for state, ``x_ph``, and action, ``a_ph``, and returns the main
  outputs from the agent's Tensorflow computation graph:
 
  =========== ================ ======================================
  Symbol Shape Description
  =========== ================ ======================================
  ``mu`` (batch, act_dim) | Computes mean actions from policy
  | given states.
- ``pi`` (batch, act_dim) | Samples actions from policy given 
+ ``pi`` (batch, act_dim) | Samples actions from policy given
  | states.
  ``logp_pi`` (batch,) | Gives log probability, according to
  | the policy, of the action sampled by
  | ``pi``. Critical: must be differentiable
  | with respect to policy parameters all
  | the way through action sampling.
- ``q1`` (batch,) | Gives one estimate of Q* for 
+ ``q1`` (batch,) | Gives one estimate of Q* for
  | states in ``x_ph`` and actions in
  | ``a_ph``.
- ``q2`` (batch,) | Gives another estimate of Q* for 
+ ``q2`` (batch,) | Gives another estimate of Q* for
  | states in ``x_ph`` and actions in
  | ``a_ph``.
  =========== ================ ======================================
 
- ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
+ ac_kwargs (dict): Any kwargs appropriate for the actor_critic
  function you provided to SAC.
 
  seed (int): Seed for random number generators.
 
- steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
+ steps_per_epoch (int): Number of steps of interaction (state-action pairs)
  for the agent and the environment in each epoch.
 
  epochs (int): Number of epochs to run and train agent.
@@ -90,19 +90,19 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
 
  gamma (float): Discount factor. (Always between 0 and 1.)
 
- polyak (float): Interpolation factor in polyak averaging for target 
- networks. Target networks are updated towards main networks 
+ polyak (float): Interpolation factor in polyak averaging for target
+ networks. Target networks are updated towards main networks
  according to:
 
- .. math:: \\theta_{\\text{targ}} \\leftarrow 
+ .. math:: \\theta_{\\text{targ}} \\leftarrow
  \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
 
- where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
+ where :math:`\\rho` is polyak. (Always between 0 and 1, usually
  close to 1.)
 
  lr (float): Learning rate (used for both policy and value learning).
 
- alpha (float): Entropy regularization coefficient. (Equivalent to 
+ alpha (float): Entropy regularization coefficient. (Equivalent to
  inverse of reward scale in the original SAC paper.)
 
  batch_size (int): Minibatch size for SGD.
@@ -115,8 +115,8 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
  is full enough for useful updates.
 
  update_every (int): Number of env interactions that should elapse
- between gradient descent updates. Note: Regardless of how long 
- you wait between updates, the ratio of env steps to gradient steps 
+ between gradient descent updates. Note: Regardless of how long
+ you wait between updates, the ratio of env steps to gradient steps
  is locked to 1.
 
  num_test_episodes (int): Number of episodes to test the deterministic
@@ -160,7 +160,7 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
 
  # get actions and log probs of actions for next states, for Q-learning
  _, pi_next, logp_pi_next, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)
- 
+
  # Target value network
  with tf.variable_scope('target'):
  # target q values, using actions from *current* policy
@@ -186,14 +186,14 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
  q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
  value_loss = q1_loss + q2_loss
 
- # Policy train op 
+ # Policy train op
  # (has to be separate from value train op, because q1_pi appears in pi_loss)
- pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
+ pi_optimizer = tf.optimizers.Adam(learning_rate=lr)
  train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
 
  # Value train op
  # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
- value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
+ value_optimizer = tf.optimizers.Adam(learning_rate=lr)
  value_params = get_vars('main/q')
  with tf.control_dependencies([train_pi_op]):
  train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)
@@ -205,7 +205,7 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])
 
  # All ops to call during one training step
- step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, 
+ step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi,
  train_pi_op, train_value_op, target_update]
 
  # Initializing targets to match main variables
@@ -217,7 +217,7 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
  sess.run(target_init)
 
  # Setup model saving
- logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, 
+ logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
  outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2})
 
  def get_action(o, deterministic=False):
@@ -228,7 +228,7 @@ def test_agent():
  for j in range(num_test_episodes):
  o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
  while not(d or (ep_len == max_ep_len)):
- # Take deterministic actions at test time 
+ # Take deterministic actions at test time
  o, r, d, _ = test_env.step(get_action(o, True))
  ep_ret += r
  ep_len += 1
@@ -242,7 +242,7 @@ def test_agent():
  for t in range(total_steps):
 
  # Until start_steps have elapsed, randomly sample actions
- # from a uniform distribution for better exploration. Afterwards, 
+ # from a uniform distribution for better exploration. Afterwards,
  # use the learned policy.
  if t > start_steps:
  a = get_action(o)
@@ -262,7 +262,7 @@ def test_agent():
  # Store experience to replay buffer
  replay_buffer.store(o, a, r, o2, d)
 
- # Super critical, easy to overlook step: make sure to update 
+ # Super critical, easy to overlook step: make sure to update
  # most recent observation!
  o = o2
 
@@ -303,8 +303,8 @@ def test_agent():
  logger.log_tabular('EpLen', average_only=True)
  logger.log_tabular('TestEpLen', average_only=True)
  logger.log_tabular('TotalEnvInteracts', t)
- logger.log_tabular('Q1Vals', with_min_and_max=True) 
- logger.log_tabular('Q2Vals', with_min_and_max=True) 
+ logger.log_tabular('Q1Vals', with_min_and_max=True)
+ logger.log_tabular('Q2Vals', with_min_and_max=True)
  logger.log_tabular('LogPi', with_min_and_max=True)
  logger.log_tabular('LossPi', average_only=True)
  logger.log_tabular('LossQ1', average_only=True)