From e6a68c18cc8bb55e036c5a284ec9c49e0f9fa23f Mon Sep 17 00:00:00 2001 From: Sam Sunde Date: Tue, 16 Apr 2024 23:06:30 -0700 Subject: [PATCH] tensorflow==2.* adamoptimizer --- docs/utils/logger.rst | 8 +- spinup/algos/tf1/ddpg/ddpg.py | 50 +++++------ spinup/algos/tf1/sac/sac.py | 58 ++++++------- spinup/algos/tf1/td3/td3.py | 64 +++++++-------- spinup/examples/tf1/pg_math/1_simple_pg.py | 4 +- spinup/examples/tf1/pg_math/2_rtg_pg.py | 4 +- spinup/examples/tf1/train_mnist.py | 8 +- .../tf1/problem_set_1/exercise1_3.py | 82 +++++++++---------- spinup/utils/mpi_tf.py | 8 +- 9 files changed, 143 insertions(+), 143 deletions(-) diff --git a/docs/utils/logger.rst b/docs/utils/logger.rst index a4f545ed9..8e50050c1 100644 --- a/docs/utils/logger.rst +++ b/docs/utils/logger.rst @@ -55,8 +55,8 @@ Next, let's look at a full training procedure with the logger embedded, to highl # Simple script for training an MLP on MNIST. - def train_mnist(steps_per_epoch=100, epochs=5, - lr=1e-3, layers=2, hidden_size=64, + def train_mnist(steps_per_epoch=100, epochs=5, + lr=1e-3, layers=2, hidden_size=64, logger_kwargs=dict(), save_freq=1): logger = EpochLogger(**logger_kwargs) @@ -76,14 +76,14 @@ Next, let's look at a full training procedure with the logger embedded, to highl y = tf.one_hot(y_ph, 10) loss = tf.losses.softmax_cross_entropy(y, logits) acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32)) - train_op = tf.train.AdamOptimizer().minimize(loss) + train_op = tf.optimizers.Adam().minimize(loss) # Prepare session sess = tf.Session() sess.run(tf.global_variables_initializer()) # Setup model saving - logger.setup_tf_saver(sess, inputs={'x': x_ph}, + logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'logits': logits, 'predict': predict}) start_time = time.time() diff --git a/spinup/algos/tf1/ddpg/ddpg.py b/spinup/algos/tf1/ddpg/ddpg.py index 90dabaa2d..86b952357 100644 --- a/spinup/algos/tf1/ddpg/ddpg.py +++ b/spinup/algos/tf1/ddpg/ddpg.py @@ -39,10 +39,10 @@ def sample_batch(self, batch_size=32): -def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, - steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, - polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, - update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, +def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, + steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, + polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, + update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Deep Deterministic Policy Gradient (DDPG) @@ -52,8 +52,8 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. 
- actor_critic: A function which takes in placeholder symbols - for state, ``x_ph``, and action, ``a_ph``, and returns the main + actor_critic: A function which takes in placeholder symbols + for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== @@ -61,20 +61,20 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. - ``q`` (batch,) | Gives the current estimate of Q* for + ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. - ``q_pi`` (batch,) | Gives the composition of ``q`` and - | ``pi`` for states in ``x_ph``: + ``q_pi`` (batch,) | Gives the composition of ``q`` and + | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== - ac_kwargs (dict): Any kwargs appropriate for the actor_critic + ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. - steps_per_epoch (int): Number of steps of interaction (state-action pairs) + steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. @@ -83,14 +83,14 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gamma (float): Discount factor. (Always between 0 and 1.) - polyak (float): Interpolation factor in polyak averaging for target - networks. Target networks are updated towards main networks + polyak (float): Interpolation factor in polyak averaging for target + networks. Target networks are updated towards main networks according to: - .. math:: \\theta_{\\text{targ}} \\leftarrow + .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually + where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. @@ -107,11 +107,11 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, is full enough for useful updates. update_every (int): Number of env interactions that should elapse - between gradient descent updates. Note: Regardless of how long - you wait between updates, the ratio of env steps to gradient steps + between gradient descent updates. Note: Regardless of how long + you wait between updates, the ratio of env steps to gradient steps is locked to 1. - act_noise (float): Stddev for Gaussian exploration noise added to + act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) num_test_episodes (int): Number of episodes to test the deterministic @@ -148,10 +148,10 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) - + # Target networks with tf.variable_scope('target'): - # Note that the action placeholder going to actor_critic here is + # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). 
pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) @@ -170,8 +170,8 @@ def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, q_loss = tf.reduce_mean((q-backup)**2) # Separate train ops for pi, q - pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) - q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) + pi_optimizer = tf.optimizers.Adam(learning_rate=pi_lr) + q_optimizer = tf.optimizers.Adam(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) @@ -214,8 +214,8 @@ def test_agent(): for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions - # from a uniform distribution for better exploration. Afterwards, - # use the learned policy (with some noise, via act_noise). + # from a uniform distribution for better exploration. Afterwards, + # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: @@ -234,7 +234,7 @@ def test_agent(): # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) - # Super critical, easy to overlook step: make sure to update + # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 diff --git a/spinup/algos/tf1/sac/sac.py b/spinup/algos/tf1/sac/sac.py index e31a09dd2..30bedcdc7 100644 --- a/spinup/algos/tf1/sac/sac.py +++ b/spinup/algos/tf1/sac/sac.py @@ -39,10 +39,10 @@ def sample_batch(self, batch_size=32): -def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, - steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, - polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, - update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, +def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, + steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, + polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, + update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Soft Actor-Critic (SAC) @@ -52,8 +52,8 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. - actor_critic: A function which takes in placeholder symbols - for state, ``x_ph``, and action, ``a_ph``, and returns the main + actor_critic: A function which takes in placeholder symbols + for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== @@ -61,27 +61,27 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. - ``pi`` (batch, act_dim) | Samples actions from policy given + ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. - ``q1`` (batch,) | Gives one estimate of Q* for + ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. 
- ``q2`` (batch,) | Gives another estimate of Q* for + ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. =========== ================ ====================================== - ac_kwargs (dict): Any kwargs appropriate for the actor_critic + ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. - steps_per_epoch (int): Number of steps of interaction (state-action pairs) + steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. @@ -90,19 +90,19 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gamma (float): Discount factor. (Always between 0 and 1.) - polyak (float): Interpolation factor in polyak averaging for target - networks. Target networks are updated towards main networks + polyak (float): Interpolation factor in polyak averaging for target + networks. Target networks are updated towards main networks according to: - .. math:: \\theta_{\\text{targ}} \\leftarrow + .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually + where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). - alpha (float): Entropy regularization coefficient. (Equivalent to + alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. @@ -115,8 +115,8 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, is full enough for useful updates. update_every (int): Number of env interactions that should elapse - between gradient descent updates. Note: Regardless of how long - you wait between updates, the ratio of env steps to gradient steps + between gradient descent updates. Note: Regardless of how long + you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic @@ -160,7 +160,7 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # get actions and log probs of actions for next states, for Q-learning _, pi_next, logp_pi_next, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) - + # Target value network with tf.variable_scope('target'): # target q values, using actions from *current* policy @@ -186,14 +186,14 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss - # Policy train op + # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) - pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) + pi_optimizer = tf.optimizers.Adam(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) - value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) + value_optimizer = tf.optimizers.Adam(learning_rate=lr) value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) @@ -205,7 +205,7 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step - step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, + step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, train_pi_op, train_value_op, target_update] # Initializing targets to match main variables @@ -217,7 +217,7 @@ def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, sess.run(target_init) # Setup model saving - logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, + logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2}) def get_action(o, deterministic=False): @@ -228,7 +228,7 @@ def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not(d or (ep_len == max_ep_len)): - # Take deterministic actions at test time + # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 @@ -242,7 +242,7 @@ def test_agent(): for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions - # from a uniform distribution for better exploration. Afterwards, + # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) @@ -262,7 +262,7 @@ def test_agent(): # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) - # Super critical, easy to overlook step: make sure to update + # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 @@ -303,8 +303,8 @@ def test_agent(): logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) - logger.log_tabular('Q1Vals', with_min_and_max=True) - logger.log_tabular('Q2Vals', with_min_and_max=True) + logger.log_tabular('Q1Vals', with_min_and_max=True) + logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) diff --git a/spinup/algos/tf1/td3/td3.py b/spinup/algos/tf1/td3/td3.py index 32257e990..8c3b46fe8 100644 --- a/spinup/algos/tf1/td3/td3.py +++ b/spinup/algos/tf1/td3/td3.py @@ -39,11 +39,11 @@ def sample_batch(self, batch_size=32): -def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, - steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, - polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, - update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, - noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, +def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, + steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, + polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, + update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, + noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) @@ -53,8 +53,8 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. - actor_critic: A function which takes in placeholder symbols - for state, ``x_ph``, and action, ``a_ph``, and returns the main + actor_critic: A function which takes in placeholder symbols + for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== @@ -62,23 +62,23 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. - ``q1`` (batch,) | Gives one estimate of Q* for + ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. - ``q2`` (batch,) | Gives another estimate of Q* for + ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. - ``q1_pi`` (batch,) | Gives the composition of ``q1`` and - | ``pi`` for states in ``x_ph``: + ``q1_pi`` (batch,) | Gives the composition of ``q1`` and + | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== - ac_kwargs (dict): Any kwargs appropriate for the actor_critic + ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. - steps_per_epoch (int): Number of steps of interaction (state-action pairs) + steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. 
@@ -87,14 +87,14 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gamma (float): Discount factor. (Always between 0 and 1.) - polyak (float): Interpolation factor in polyak averaging for target - networks. Target networks are updated towards main networks + polyak (float): Interpolation factor in polyak averaging for target + networks. Target networks are updated towards main networks according to: - .. math:: \\theta_{\\text{targ}} \\leftarrow + .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually + where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. @@ -111,20 +111,20 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, is full enough for useful updates. update_every (int): Number of env interactions that should elapse - between gradient descent updates. Note: Regardless of how long - you wait between updates, the ratio of env steps to gradient steps + between gradient descent updates. Note: Regardless of how long + you wait between updates, the ratio of env steps to gradient steps is locked to 1. - - act_noise (float): Stddev for Gaussian exploration noise added to + + act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) - target_noise (float): Stddev for smoothing noise added to target + target_noise (float): Stddev for smoothing noise added to target policy. - noise_clip (float): Limit for absolute value of target policy + noise_clip (float): Limit for absolute value of target policy smoothing noise. - policy_delay (int): Policy will only be updated once every + policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic @@ -161,11 +161,11 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs) - + # Target policy network with tf.variable_scope('target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) - + # Target Q networks with tf.variable_scope('target', reuse=True): @@ -196,8 +196,8 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, q_loss = q1_loss + q2_loss # Separate train ops for pi, q - pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) - q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) + pi_optimizer = tf.optimizers.Adam(learning_rate=pi_lr) + q_optimizer = tf.optimizers.Adam(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) @@ -239,8 +239,8 @@ def test_agent(): for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions - # from a uniform distribution for better exploration. Afterwards, - # use the learned policy (with some noise, via act_noise). + # from a uniform distribution for better exploration. Afterwards, + # use the learned policy (with some noise, via act_noise). 
if t > start_steps: a = get_action(o, act_noise) else: @@ -259,7 +259,7 @@ def test_agent(): # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) - # Super critical, easy to overlook step: make sure to update + # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 @@ -326,7 +326,7 @@ def test_agent(): from spinup.utils.run_utils import setup_logger_kwargs logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) - + td3(lambda : gym.make(args.env), actor_critic=core.mlp_actor_critic, ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma, seed=args.seed, epochs=args.epochs, diff --git a/spinup/examples/tf1/pg_math/1_simple_pg.py b/spinup/examples/tf1/pg_math/1_simple_pg.py index 0921447f7..975a2bc94 100644 --- a/spinup/examples/tf1/pg_math/1_simple_pg.py +++ b/spinup/examples/tf1/pg_math/1_simple_pg.py @@ -9,7 +9,7 @@ def mlp(x, sizes, activation=tf.tanh, output_activation=None): x = tf.layers.dense(x, units=size, activation=activation) return tf.layers.dense(x, units=sizes[-1], activation=output_activation) -def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, +def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, epochs=50, batch_size=5000, render=False): # make environment, check spaces, get obs / act dims @@ -37,7 +37,7 @@ def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, loss = -tf.reduce_mean(weights_ph * log_probs) # make train op - train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) + train_op = tf.optimizers.Adam(learning_rate=lr).minimize(loss) sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) diff --git a/spinup/examples/tf1/pg_math/2_rtg_pg.py b/spinup/examples/tf1/pg_math/2_rtg_pg.py index 933d30eb4..d6d86b6a7 100644 --- a/spinup/examples/tf1/pg_math/2_rtg_pg.py +++ b/spinup/examples/tf1/pg_math/2_rtg_pg.py @@ -16,7 +16,7 @@ def reward_to_go(rews): rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) return rtgs -def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, +def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, epochs=50, batch_size=5000, render=False): # make environment, check spaces, get obs / act dims @@ -44,7 +44,7 @@ def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, loss = -tf.reduce_mean(weights_ph * log_probs) # make train op - train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) + train_op = tf.optimizers.Adam(learning_rate=lr).minimize(loss) sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) diff --git a/spinup/examples/tf1/train_mnist.py b/spinup/examples/tf1/train_mnist.py index dc8f5077d..64c0a34f2 100644 --- a/spinup/examples/tf1/train_mnist.py +++ b/spinup/examples/tf1/train_mnist.py @@ -11,8 +11,8 @@ def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): # Simple script for training an MLP on MNIST. 
-def train_mnist(steps_per_epoch=100, epochs=5, - lr=1e-3, layers=2, hidden_size=64, +def train_mnist(steps_per_epoch=100, epochs=5, + lr=1e-3, layers=2, hidden_size=64, logger_kwargs=dict(), save_freq=1): logger = EpochLogger(**logger_kwargs) @@ -32,14 +32,14 @@ def train_mnist(steps_per_epoch=100, epochs=5, y = tf.one_hot(y_ph, 10) loss = tf.losses.softmax_cross_entropy(y, logits) acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32)) - train_op = tf.train.AdamOptimizer().minimize(loss) + train_op = tf.optimizers.Adam().minimize(loss) # Prepare session sess = tf.Session() sess.run(tf.global_variables_initializer()) # Setup model saving - logger.setup_tf_saver(sess, inputs={'x': x_ph}, + logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'logits': logits, 'predict': predict}) start_time = time.time() diff --git a/spinup/exercises/tf1/problem_set_1/exercise1_3.py b/spinup/exercises/tf1/problem_set_1/exercise1_3.py index 2d073ef03..8913d951e 100644 --- a/spinup/exercises/tf1/problem_set_1/exercise1_3.py +++ b/spinup/exercises/tf1/problem_set_1/exercise1_3.py @@ -16,7 +16,7 @@ As starter code, you are given the entirety of the TD3 algorithm except for the computation graph. Find "YOUR CODE HERE" to begin. -To clarify: you will not write an "actor_critic" function for this +To clarify: you will not write an "actor_critic" function for this exercise. But you will use one to build the graph for computing the TD3 updates. @@ -55,11 +55,11 @@ def sample_batch(self, batch_size=32): -def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, - steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, - polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, - update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, - noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, +def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, + steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, + polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, + update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, + noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) @@ -69,8 +69,8 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. - actor_critic: A function which takes in placeholder symbols - for state, ``x_ph``, and action, ``a_ph``, and returns the main + actor_critic: A function which takes in placeholder symbols + for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== @@ -78,23 +78,23 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. - ``q1`` (batch,) | Gives one estimate of Q* for + ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. - ``q2`` (batch,) | Gives another estimate of Q* for + ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. 
- ``q1_pi`` (batch,) | Gives the composition of ``q1`` and - | ``pi`` for states in ``x_ph``: + ``q1_pi`` (batch,) | Gives the composition of ``q1`` and + | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== - ac_kwargs (dict): Any kwargs appropriate for the actor_critic + ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. - steps_per_epoch (int): Number of steps of interaction (state-action pairs) + steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. @@ -103,14 +103,14 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gamma (float): Discount factor. (Always between 0 and 1.) - polyak (float): Interpolation factor in polyak averaging for target - networks. Target networks are updated towards main networks + polyak (float): Interpolation factor in polyak averaging for target + networks. Target networks are updated towards main networks according to: - .. math:: \\theta_{\\text{targ}} \\leftarrow + .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually + where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. @@ -127,20 +127,20 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, is full enough for useful updates. update_every (int): Number of env interactions that should elapse - between gradient descent updates. Note: Regardless of how long - you wait between updates, the ratio of env steps to gradient steps + between gradient descent updates. Note: Regardless of how long + you wait between updates, the ratio of env steps to gradient steps is locked to 1. - - act_noise (float): Stddev for Gaussian exploration noise added to + + act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) - target_noise (float): Stddev for smoothing noise added to target + target_noise (float): Stddev for smoothing noise added to target policy. - noise_clip (float): Limit for absolute value of target policy + noise_clip (float): Limit for absolute value of target policy smoothing noise. - policy_delay (int): Policy will only be updated once every + policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. 
num_test_episodes (int): Number of episodes to test the deterministic @@ -187,9 +187,9 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # YOUR CODE HERE # # # ####################### - # pi, q1, q2, q1_pi = + # pi, q1, q2, q1_pi = pass - + # Target policy network with tf.variable_scope('target'): ####################### @@ -199,7 +199,7 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, ####################### # pi_targ = pass - + # Target Q networks with tf.variable_scope('target', reuse=True): @@ -238,10 +238,10 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # YOUR CODE HERE # # # ####################### - # pi_loss = - # q1_loss = - # q2_loss = - # q_loss = + # pi_loss = + # q1_loss = + # q2_loss = + # q_loss = #=========================================================================# # # @@ -250,8 +250,8 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, #=========================================================================# # Separate train ops for pi, q - pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) - q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) + pi_optimizer = tf.optimizers.Adam(learning_rate=pi_lr) + q_optimizer = tf.optimizers.Adam(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) @@ -293,8 +293,8 @@ def test_agent(): for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions - # from a uniform distribution for better exploration. Afterwards, - # use the learned policy (with some noise, via act_noise). + # from a uniform distribution for better exploration. Afterwards, + # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: @@ -313,7 +313,7 @@ def test_agent(): # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) - # Super critical, easy to overlook step: make sure to update + # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 @@ -379,15 +379,15 @@ def test_agent(): logger_kwargs = setup_logger_kwargs(args.exp_name + '-' + args.env.lower(), args.seed) all_kwargs = dict( - env_fn=lambda : gym.make(args.env), + env_fn=lambda : gym.make(args.env), actor_critic=core.mlp_actor_critic, - ac_kwargs=dict(hidden_sizes=[128,128]), + ac_kwargs=dict(hidden_sizes=[128,128]), max_ep_len=150, - seed=args.seed, + seed=args.seed, logger_kwargs=logger_kwargs, epochs=10 ) - + if args.use_soln: true_td3(**all_kwargs) else: diff --git a/spinup/utils/mpi_tf.py b/spinup/utils/mpi_tf.py index 96cbcf5e0..a2320adce 100644 --- a/spinup/utils/mpi_tf.py +++ b/spinup/utils/mpi_tf.py @@ -26,12 +26,12 @@ def sync_all_params(): return sync_params(tf.global_variables()) -class MpiAdamOptimizer(tf.train.AdamOptimizer): +class MpiAdamOptimizer(tf.optimizers.Adam): """ Adam optimizer that averages gradients across MPI processes. - The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. - For documentation on method arguments, see the Tensorflow docs page for + The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. + For documentation on method arguments, see the Tensorflow docs page for the base `AdamOptimizer`_. .. 
_`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py @@ -40,7 +40,7 @@ class MpiAdamOptimizer(tf.train.AdamOptimizer): def __init__(self, **kwargs): self.comm = MPI.COMM_WORLD - tf.train.AdamOptimizer.__init__(self, **kwargs) + tf.optimizers.Adam.__init__(self, **kwargs) def compute_gradients(self, loss, var_list, **kwargs): """
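
---

Notes on the tf.train.AdamOptimizer -> tf.optimizers.Adam change (reference sketches, not part of the diff):

The scripts touched by this patch build static graphs with tf.placeholder / tf.Session and call
minimize(loss, var_list=...) on the optimizer object. tf.train.AdamOptimizer (still reachable in
TF 2.x as tf.compat.v1.train.AdamOptimizer) accepts a loss tensor and returns a training op;
tf.optimizers.Adam resolves to the Keras optimizer, which expects an explicit var_list for
minimize() and is normally driven eagerly with a GradientTape. A minimal sketch of the two
calling styles on a toy objective; the variable w and the quadratic loss are illustrative
stand-ins, not code from this repo:

    import tensorflow as tf

    w = tf.Variable([1.0, 2.0])                     # stand-in for a network's trainable weights

    # Keras-style optimizer (what tf.optimizers.Adam names in TF 2.x), driven with a GradientTape:
    opt = tf.optimizers.Adam(learning_rate=1e-3)
    for _ in range(3):
        with tf.GradientTape() as tape:
            loss = tf.reduce_sum(w ** 2)            # toy quadratic objective
        grads = tape.gradient(loss, [w])            # gradients w.r.t. the listed variables
        opt.apply_gradients(zip(grads, [w]))        # one Adam step

    # The graph-mode pattern used throughout these scripts, available under TF 2.x as:
    #   train_op = tf.compat.v1.train.AdamOptimizer(1e-3).minimize(loss, var_list=[w])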
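
The target-network update described in the ddpg/sac/td3 docstrings,
theta_targ <- rho * theta_targ + (1 - rho) * theta, translates directly into variable
assignments. A minimal eager-mode sketch with a single stand-in parameter; v_main and v_targ
are illustrative, not the get_vars('main') / get_vars('target') collections used in the code:

    import tensorflow as tf

    polyak = 0.995                       # default interpolation factor used by these algorithms
    v_main = tf.Variable([0.5, -0.2])    # stand-in for a main-network parameter
    v_targ = tf.Variable([0.0, 0.0])     # stand-in for the matching target-network parameter

    # theta_targ <- rho * theta_targ + (1 - rho) * theta
    v_targ.assign(polyak * v_targ + (1 - polyak) * v_main)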
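
The td3 docstring's target_noise and noise_clip arguments describe clipped Gaussian smoothing
noise added to the target policy's action when forming the Q backup. A small NumPy sketch of
that smoothing step; smoothed_target_action and act_limit are illustrative names, not functions
or values taken from spinup:

    import numpy as np

    def smoothed_target_action(pi_targ, target_noise=0.2, noise_clip=0.5, act_limit=1.0):
        # clip the Gaussian noise, then clip the perturbed action to the valid range
        eps = np.clip(np.random.normal(0.0, target_noise, np.shape(pi_targ)),
                      -noise_clip, noise_clip)
        return np.clip(np.asarray(pi_targ) + eps, -act_limit, act_limit)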
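
MpiAdamOptimizer's docstring says it averages gradients across MPI processes via a
compute_gradients override borrowed from Baselines. The core of that idea, sketched with
mpi4py on a flat NumPy gradient vector; mpi_avg_grad is an illustrative helper, not the
spinup or Baselines implementation:

    import numpy as np
    from mpi4py import MPI

    def mpi_avg_grad(flat_grad):
        # sum the local gradient vectors across all ranks, then divide by the number of ranks
        comm = MPI.COMM_WORLD
        buf = np.zeros_like(flat_grad)
        comm.Allreduce(flat_grad, buf, op=MPI.SUM)
        return buf / comm.Get_size()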