From 209f31b452cb3c50b4bf6c6caf94649d1514d46c Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Sun, 10 Oct 2021 20:10:32 -0400 Subject: [PATCH 1/7] not supposed to work: just a base for linking issue --- src/train.py | 264 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 src/train.py diff --git a/src/train.py b/src/train.py new file mode 100644 index 0000000..4b8d76b --- /dev/null +++ b/src/train.py @@ -0,0 +1,264 @@ +''' +## Train ## +# Code to train Deep Q Network on OpenAI Gym environments +@author: Mark Sinton (msinto93@gmail.com) +''' +'^^^should i get rid of this since Ive made a lot of changes or no?' +import os +import sys +import argparse +import gym_super_mario_bros +import tensorflow as tf +import numpy as np +import time +import random + +from utils.utils import preprocess_image, reset_env_and_state_buffer +from utils.experience_replay import ReplayMemory +from utils.state_buffer import StateBuffer +from utils.network import DeepQNetwork + +def get_train_args(): + train_params = argparse.ArgumentParser() + + # Environment parameters + train_params.add_argument("--env", type=str, default='BreakoutDeterministic-v4', help="Environment to use (must have RGB image state space and discrete action space)") + train_params.add_argument("--render", type=bool, default=False, help="Whether or not to display the environment on the screen during training") + train_params.add_argument("--random_seed", type=int, default=1234, help="Random seed for reproducability") + train_params.add_argument("--frame_width", type=int, default=105, help="Frame width after resize.") + train_params.add_argument("--frame_height", type=int, default=80, help="Frame height after resize.") + train_params.add_argument("--frames_per_state", type=int, default=4, help="Sequence of frames which constitutes a single state.") + + # Training parameters + train_params.add_argument("--num_steps_train", type=int, default=50000000, help="Number of steps to train for") + train_params.add_argument("--train_frequency", type=int, default=4, help="Perform training step every N game steps.") + train_params.add_argument("--max_ep_steps", type=int, default=2000, help="Maximum number of steps per episode") + train_params.add_argument("--batch_size", type=int, default=32) + train_params.add_argument("--learning_rate", type=float, default=0.00025) + train_params.add_argument("--replay_mem_size", type=int, default=1000000, help="Maximum size of replay memory buffer") + train_params.add_argument("--initial_replay_mem_size", type=int, default=50000, help="Initial size of replay memory (populated by random actions) before learning can start") + train_params.add_argument("--epsilon_start", type=float, default=1.0, help="Exploration rate at the beginning of training.") + train_params.add_argument("--epsilon_end", type=float, default=0.1, help="Exploration rate at the end of decay.") + train_params.add_argument("--epsilon_step_end", type=int, default=1000000, help="After how many steps to stop decaying the exploration rate.") + train_params.add_argument("--discount_rate", type=float, default=0.99, help="Discount rate (gamma) for future rewards.") + train_params.add_argument("--update_target_step", type=float, default=10000, help="Copy current network parameters to target network every N steps.") + train_params.add_argument("--save_ckpt_step", type=float, default=250000, help="Save checkpoint every N steps") + train_params.add_argument("--save_log_step", type=int, default=1000, 
help="Save logs every N steps") + + # Files/directories + train_params.add_argument("--ckpt_dir", type=str, default='./ckpts', help="Directory for saving/loading checkpoints") + train_params.add_argument("--ckpt_file", type=str, default=None, help="Checkpoint file to load and resume training from (if None, train from scratch)") + train_params.add_argument("--log_dir", type=str, default='./logs/train', help="Directory for saving logs") + + return train_params.parse_args() + + +def train(args): + + # Function to return exploration rate based on current step + def exploration_rate(current_step, exp_rate_start, exp_rate_end, exp_step_end): + if current_step < exp_step_end: + exploration_rate = current_step * ((exp_rate_end-exp_rate_start)/(float(exp_step_end))) + 1 + else: + exploration_rate = exp_rate_end + + return exploration_rate + + # Function to update target network parameters with main network parameters + def update_target_network(from_scope, to_scope): + from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) + to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) + + op_holder = [] + + # Update old network parameters with new network parameters + for from_var,to_var in zip(from_vars,to_vars): + op_holder.append(to_var.assign(from_var)) + + return op_holder + + + # Create environment + env = gym_super_mario_bros.make(args.env) + num_actions = env.action_space.n + + # Initialise replay memory and state buffer + replay_mem = ReplayMemory(args) + state_buf = StateBuffer(args) + + # Define input placeholders + state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) + action_ph = tf.placeholder(tf.int32, (None)) + target_ph = tf.placeholder(tf.float32, (None)) + + # Instantiate DQN network + DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. 
cannot name this scope 'DQN' and + # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) + DQN_predict_op = DQN.predict() + DQN_train_step_op = DQN.train_step() + + # Instantiate DQN target network + DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') + + update_target_op = update_target_network('DQN_main', 'DQN_target') + + # Create session + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + sess = tf.Session(config=config) + + # Add summaries for Tensorboard visualisation + tf.summary.scalar('Loss', DQN.loss) + reward_var = tf.Variable(0.0, trainable=False) + tf.summary.scalar("Episode Reward", reward_var) + epsilon_var = tf.Variable(args.epsilon_start, trainable=False) + tf.summary.scalar("Exploration Rate", epsilon_var) + summary_op = tf.summary.merge_all() + + # Define saver for saving model ckpts + model_name = 'model.ckpt' + checkpoint_path = os.path.join(args.ckpt_dir, model_name) + if not os.path.exists(args.ckpt_dir): + os.makedirs(args.ckpt_dir) + saver = tf.train.Saver(max_to_keep=201) + + # Create summary writer to write summaries to disk + if not os.path.exists(args.log_dir): + os.makedirs(args.log_dir) + summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph) + + # Load ckpt file if given + if args.ckpt_file is not None: + loader = tf.train.Saver() #Restore all variables from ckpt + ckpt = args.ckpt_dir + '/' + args.ckpt_file + ckpt_split = ckpt.split('-') + step_str = ckpt_split[-1] + start_step = int(step_str) + loader.restore(sess, ckpt) + else: + start_step = 0 + sess.run(tf.global_variables_initializer()) + sess.run(update_target_op) + + + ## Begin training + + env.reset() + + ep_steps = 0 + episode_reward = 0 + episode_rewards = [] + duration_values = [] + + # Initially populate replay memory by taking random actions + sys.stdout.write('\nPopulating replay memory with random actions...\n') + sys.stdout.flush() + + for random_step in range(1, args.initial_replay_mem_size+1): + + if args.render: + env.render() + else: + env.render(mode='rgb_array') + + action = env.action_space.sample() + frame, reward, terminal, _ = env.step(action) + frame = preprocess_image(frame, args.frame_width, args.frame_height) + replay_mem.add(action, reward, frame, terminal) + + if terminal: + env.reset() + + sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size)) + sys.stdout.flush() + + # Begin training process + reset_env_and_state_buffer(env, state_buf, args) + sys.stdout.write('\n\nTraining...\n\n') + sys.stdout.flush() + + for train_step in range(start_step+1, args.num_steps_train+1): + start_time = time.time() + # Run 'train_frequency' iterations in the game for every training step + for _ in range(0, args.train_frequency): + ep_steps += 1 + + if args.render: + env.render() + else: + env.render(mode='rgb_array') + + # Use an epsilon-greedy policy to select action + epsilon = exploration_rate(train_step, args.epsilon_start, args.epsilon_end, args.epsilon_step_end) + if random.random() < epsilon: + #Choose random action + action = env.action_space.sample() + else: + #Choose action with highest Q-value according to network's current policy + current_state = np.expand_dims(state_buf.get_state(), 0) + action = sess.run(DQN_predict_op, {state_ph:current_state}) + + # Take action and store experience + frame, reward, terminal, _ = env.step(action) + frame = preprocess_image(frame, args.frame_width, args.frame_height) + 
state_buf.add(frame) + replay_mem.add(action, reward, frame, terminal) + episode_reward += reward + + if terminal or ep_steps == args.max_ep_steps: + # Collect total reward of episode + episode_rewards.append(episode_reward) + # Reset episode reward and episode steps counters + episode_reward = 0 + ep_steps = 0 + # Reset environment and state buffer for next episode + reset_env_and_state_buffer(env, state_buf, args) + + ## Training step + # Get minibatch from replay mem + states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() + # Calculate target by passing next states through the target network and finding max future Q + future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) + max_future_Q = np.max(future_Q, axis=1) + # Q values of the terminal states is 0 by definition + max_future_Q[terminals_batch] = 0 + targets = rewards_batch + (max_future_Q*args.discount_rate) + + # Execute training step + if train_step % args.save_log_step == 0: + # Train and save logs + average_reward = sum(episode_rewards)/len(episode_rewards) + summary_str, _ = sess.run([summary_op, DQN_train_step_op], {state_ph:states_batch, action_ph:actions_batch, target_ph:targets, reward_var: average_reward, epsilon_var: epsilon}) + summary_writer.add_summary(summary_str, train_step) + # Reset rewards buffer + episode_rewards = [] + else: + # Just train + _ = sess.run(DQN_train_step_op, {state_ph:states_batch, action_ph:actions_batch, target_ph:targets}) + + # Update target networks + if train_step % args.update_target_step == 0: + sess.run(update_target_op) + + # Calculate time per step and display progress to console + duration = time.time() - start_time + duration_values.append(duration) + ave_duration = sum(duration_values)/float(len(duration_values)) + + sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, args.num_steps_train, ave_duration)) + sys.stdout.flush() + + # Save checkpoint + if train_step % args.save_ckpt_step == 0: + saver.save(sess, checkpoint_path, global_step=train_step) + sys.stdout.write('\n Checkpoint saved\n') + sys.stdout.flush() + + # Reset time calculation + duration_values = [] + + + +if __name__ == '__main__': + train_args = get_train_args() + train(train_args) \ No newline at end of file From ee35f3b4b815d47a89508ab43c6c3ed419677710 Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Tue, 19 Oct 2021 10:23:22 -0400 Subject: [PATCH 2/7] a --- src/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/train.py b/src/train.py index 4b8d76b..6f70052 100644 --- a/src/train.py +++ b/src/train.py @@ -17,6 +17,7 @@ from utils.experience_replay import ReplayMemory from utils.state_buffer import StateBuffer from utils.network import DeepQNetwork + def get_train_args(): train_params = argparse.ArgumentParser() From 7b763ca1a771d216ac1888d63024e70463b0ab05 Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Tue, 19 Oct 2021 10:52:39 -0400 Subject: [PATCH 3/7] checkpoint line 140ish --- src/train.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/train.py b/src/train.py index 6f70052..f1962e1 100644 --- a/src/train.py +++ b/src/train.py @@ -1,9 +1,9 @@ ''' ## Train ## -# Code to train Deep Q Network on OpenAI Gym environments +# Adapted from code to train Deep Q Network on OpenAI Gym environments @author: Mark Sinton (msinto93@gmail.com) ''' -'^^^should i get rid of this since Ive made a lot of changes or no?' 
+ import os import sys import argparse @@ -12,12 +12,12 @@ import numpy as np import time import random - -from utils.utils import preprocess_image, reset_env_and_state_buffer -from utils.experience_replay import ReplayMemory -from utils.state_buffer import StateBuffer -from utils.network import DeepQNetwork - +import utils +''' dont exist yet in git +import ReplayMemory +import StateBuffer +''' +import Model def get_train_args(): train_params = argparse.ArgumentParser() @@ -81,27 +81,28 @@ def update_target_network(from_scope, to_scope): # Create environment env = gym_super_mario_bros.make(args.env) - num_actions = env.action_space.n + num_actions = 7 # Initialise replay memory and state buffer replay_mem = ReplayMemory(args) state_buf = StateBuffer(args) # Define input placeholders - state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) - action_ph = tf.placeholder(tf.int32, (None)) - target_ph = tf.placeholder(tf.float32, (None)) + #state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) + #action_ph = tf.placeholder(tf.int32, (None)) + #target_ph = tf.placeholder(tf.float32, (None)) # Instantiate DQN network - DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and + #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) + DQN = Model(240, 256) DQN_predict_op = DQN.predict() DQN_train_step_op = DQN.train_step() # Instantiate DQN target network - DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') + #DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') - update_target_op = update_target_network('DQN_main', 'DQN_target') + #update_target_op = update_target_network('DQN_main', 'DQN_target') # Create session config = tf.ConfigProto(allow_soft_placement=True) @@ -142,7 +143,7 @@ def update_target_network(from_scope, to_scope): sess.run(update_target_op) - ## Begin training + ## Begin training # THIS IS WHERE I AM AT LOOKY HERE TIM WHEN YOU GO env.reset() @@ -219,7 +220,7 @@ def update_target_network(from_scope, to_scope): # Get minibatch from replay mem states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() # Calculate target by passing next states through the target network and finding max future Q - future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) + #future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) max_future_Q = np.max(future_Q, axis=1) # Q values of the terminal states is 0 by definition max_future_Q[terminals_batch] = 0 From 690cfdddbb5a8c7c654e53e2df67dfc001d1cede Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Tue, 19 Oct 2021 13:17:55 -0400 Subject: [PATCH 4/7] comments and fixes --- src/train.py | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/src/train.py b/src/train.py index f1962e1..a661bcb 100644 --- a/src/train.py +++ b/src/train.py @@ -13,10 +13,8 @@ import time import random import utils -''' dont exist yet in git -import ReplayMemory +import ReplayMemory # doesn't exist 
--------------------------------------------------------- import StateBuffer -''' import Model def get_train_args(): @@ -84,19 +82,19 @@ def update_target_network(from_scope, to_scope): num_actions = 7 # Initialise replay memory and state buffer - replay_mem = ReplayMemory(args) + replay_mem = ReplayMemory(args) # don't exist yet ---------------------------------- state_buf = StateBuffer(args) # Define input placeholders - #state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) - #action_ph = tf.placeholder(tf.int32, (None)) - #target_ph = tf.placeholder(tf.float32, (None)) + state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) + action_ph = tf.placeholder(tf.int32, (None)) + target_ph = tf.placeholder(tf.float32, (None)) # Instantiate DQN network #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) DQN = Model(240, 256) - DQN_predict_op = DQN.predict() + DQN_predict_op = DQN.predict() #talk to rithvik -------------------------------------------------- DQN_train_step_op = DQN.train_step() # Instantiate DQN target network @@ -143,7 +141,7 @@ def update_target_network(from_scope, to_scope): sess.run(update_target_op) - ## Begin training # THIS IS WHERE I AM AT LOOKY HERE TIM WHEN YOU GO + ## Begin training env.reset() @@ -152,9 +150,7 @@ def update_target_network(from_scope, to_scope): episode_rewards = [] duration_values = [] - # Initially populate replay memory by taking random actions - sys.stdout.write('\nPopulating replay memory with random actions...\n') - sys.stdout.flush() + # Initially populate replay memory by taking random actions for random_step in range(1, args.initial_replay_mem_size+1): @@ -163,21 +159,21 @@ def update_target_network(from_scope, to_scope): else: env.render(mode='rgb_array') - action = env.action_space.sample() + action = env.action_space.sample() #get an action ------------------------------------------------------ frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, args.frame_width, args.frame_height) + frame = preprocess_image(frame, args.frame_width, args.frame_height) #should be function from utils --------------- replay_mem.add(action, reward, frame, terminal) if terminal: env.reset() - sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size)) - sys.stdout.flush() + #sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size)) + #sys.stdout.flush() # Begin training process - reset_env_and_state_buffer(env, state_buf, args) - sys.stdout.write('\n\nTraining...\n\n') - sys.stdout.flush() + reset_env_and_state_buffer(env, state_buf, args) #should be function from utils ----------------------------- + #sys.stdout.write('\n\nTraining...\n\n') + #sys.stdout.flush() for train_step in range(start_step+1, args.num_steps_train+1): start_time = time.time() @@ -202,7 +198,7 @@ def update_target_network(from_scope, to_scope): # Take action and store experience frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, args.frame_width, args.frame_height) + frame = preprocess_image(frame, args.frame_width, args.frame_height) # again utils ------------------- state_buf.add(frame) replay_mem.add(action, reward, 
frame, terminal) episode_reward += reward @@ -214,14 +210,14 @@ def update_target_network(from_scope, to_scope): episode_reward = 0 ep_steps = 0 # Reset environment and state buffer for next episode - reset_env_and_state_buffer(env, state_buf, args) + reset_env_and_state_buffer(env, state_buf, args) #utilsssss -------------------- ## Training step # Get minibatch from replay mem states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() # Calculate target by passing next states through the target network and finding max future Q #future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) - max_future_Q = np.max(future_Q, axis=1) + max_future_Q = np.max(future_Q, axis=1) #actually this one i don't know if should be utils or not ------------------ # Q values of the terminal states is 0 by definition max_future_Q[terminals_batch] = 0 targets = rewards_batch + (max_future_Q*args.discount_rate) @@ -240,15 +236,15 @@ def update_target_network(from_scope, to_scope): # Update target networks if train_step % args.update_target_step == 0: - sess.run(update_target_op) + sess.run(update_target_op) # i'm not sure where this comes from-------------------------- # Calculate time per step and display progress to console duration = time.time() - start_time duration_values.append(duration) ave_duration = sum(duration_values)/float(len(duration_values)) - sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, args.num_steps_train, ave_duration)) - sys.stdout.flush() + #sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, args.num_steps_train, ave_duration)) + #sys.stdout.flush() # Save checkpoint if train_step % args.save_ckpt_step == 0: From 2848ef9b80c43158196338110b7c0f70ced4ee76 Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Fri, 22 Oct 2021 12:55:55 -0400 Subject: [PATCH 5/7] friday grind --- src/train.py | 433 ++++++++++++++++++++++----------------------- test/test_train.py | 19 ++ 2 files changed, 231 insertions(+), 221 deletions(-) create mode 100644 test/test_train.py diff --git a/src/train.py b/src/train.py index a661bcb..ffe9751 100644 --- a/src/train.py +++ b/src/train.py @@ -17,246 +17,237 @@ import StateBuffer import Model -def get_train_args(): - train_params = argparse.ArgumentParser() - - # Environment parameters - train_params.add_argument("--env", type=str, default='BreakoutDeterministic-v4', help="Environment to use (must have RGB image state space and discrete action space)") - train_params.add_argument("--render", type=bool, default=False, help="Whether or not to display the environment on the screen during training") - train_params.add_argument("--random_seed", type=int, default=1234, help="Random seed for reproducability") - train_params.add_argument("--frame_width", type=int, default=105, help="Frame width after resize.") - train_params.add_argument("--frame_height", type=int, default=80, help="Frame height after resize.") - train_params.add_argument("--frames_per_state", type=int, default=4, help="Sequence of frames which constitutes a single state.") - - # Training parameters - train_params.add_argument("--num_steps_train", type=int, default=50000000, help="Number of steps to train for") - train_params.add_argument("--train_frequency", type=int, default=4, help="Perform training step every N game steps.") - train_params.add_argument("--max_ep_steps", type=int, default=2000, help="Maximum number of steps per episode") - train_params.add_argument("--batch_size", 
type=int, default=32) - train_params.add_argument("--learning_rate", type=float, default=0.00025) - train_params.add_argument("--replay_mem_size", type=int, default=1000000, help="Maximum size of replay memory buffer") - train_params.add_argument("--initial_replay_mem_size", type=int, default=50000, help="Initial size of replay memory (populated by random actions) before learning can start") - train_params.add_argument("--epsilon_start", type=float, default=1.0, help="Exploration rate at the beginning of training.") - train_params.add_argument("--epsilon_end", type=float, default=0.1, help="Exploration rate at the end of decay.") - train_params.add_argument("--epsilon_step_end", type=int, default=1000000, help="After how many steps to stop decaying the exploration rate.") - train_params.add_argument("--discount_rate", type=float, default=0.99, help="Discount rate (gamma) for future rewards.") - train_params.add_argument("--update_target_step", type=float, default=10000, help="Copy current network parameters to target network every N steps.") - train_params.add_argument("--save_ckpt_step", type=float, default=250000, help="Save checkpoint every N steps") - train_params.add_argument("--save_log_step", type=int, default=1000, help="Save logs every N steps") - - # Files/directories - train_params.add_argument("--ckpt_dir", type=str, default='./ckpts', help="Directory for saving/loading checkpoints") - train_params.add_argument("--ckpt_file", type=str, default=None, help="Checkpoint file to load and resume training from (if None, train from scratch)") - train_params.add_argument("--log_dir", type=str, default='./logs/train', help="Directory for saving logs") - - return train_params.parse_args() - - -def train(args): - - # Function to return exploration rate based on current step - def exploration_rate(current_step, exp_rate_start, exp_rate_end, exp_step_end): - if current_step < exp_step_end: - exploration_rate = current_step * ((exp_rate_end-exp_rate_start)/(float(exp_step_end))) + 1 - else: - exploration_rate = exp_rate_end +class Train(): + + def __init__(env='SuperMarioBros-1-1-v0', render=False, random_seed=1234, frame_width=240, frame_height=256, + frames_per_state=4, num_steps_train = 50000000, train_frequency=4, max_ep_steps=2000, batch_size=32, + learning_rate=.00025, replay_mem_size=1000000, intitial_replay_mem_size=50000, epsilon_start=1.0, + epsilon_end=0.1, epsilon_step_end=1000000, discount_rate=0.99, update_target_step=10000, + save_ckpt_step=250000, save_log_step=1000, ckpt_dir='./ckpts', ckpt_file=None, log_dir='./logs/train'): + self.env = env + self.render = render + self.random_seed = random_seed + self.frame_width = frame_width + self.frame_height = frame_height + self.frames_per_state = frames_per_state + self.num_steps_train = num_steps_train + self.train_frequency = train_frequency + self.max_ep_steps = max_ep_steps + self.batch_size = batch_size + self.learning_rate = learning_rate + self.replay_mem_size = replay_mem_size + self.initial_replay_mem_size = intitial_replay_mem_size + self.epsilon_start = epsilon_start + self.epsilon_end = epsilon_end + self.epsilon_step_end = epsilon_step_end + self.discount_rate = discount_rate + self.update_target_step = update_target_step + self.save_ckpt_step = save_ckpt_step + self.save_log_step = save_log_step + self.ckpt_dir = ckpt_dir + self.ckpt_file = ckpt_file + self.log_dir = log_dir + + def train(self): + + # Function to return exploration rate based on current step + def exploration_rate(current_step, exp_rate_start, 
exp_rate_end, exp_step_end): + if current_step < exp_step_end: + exploration_rate = current_step * ((exp_rate_end-exp_rate_start)/(float(exp_step_end))) + 1 + else: + exploration_rate = exp_rate_end + + return exploration_rate + + # Function to update target network parameters with main network parameters + def update_target_network(from_scope, to_scope): + from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) + to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) + + op_holder = [] - return exploration_rate - - # Function to update target network parameters with main network parameters - def update_target_network(from_scope, to_scope): - from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) - to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) - - op_holder = [] + # Update old network parameters with new network parameters + for from_var,to_var in zip(from_vars,to_vars): + op_holder.append(to_var.assign(from_var)) + + return op_holder - # Update old network parameters with new network parameters - for from_var,to_var in zip(from_vars,to_vars): - op_holder.append(to_var.assign(from_var)) - return op_holder - - - # Create environment - env = gym_super_mario_bros.make(args.env) - num_actions = 7 - - # Initialise replay memory and state buffer - replay_mem = ReplayMemory(args) # don't exist yet ---------------------------------- - state_buf = StateBuffer(args) - - # Define input placeholders - state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) - action_ph = tf.placeholder(tf.int32, (None)) - target_ph = tf.placeholder(tf.float32, (None)) - - # Instantiate DQN network - #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. 
cannot name this scope 'DQN' and - # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) - DQN = Model(240, 256) - DQN_predict_op = DQN.predict() #talk to rithvik -------------------------------------------------- - DQN_train_step_op = DQN.train_step() - - # Instantiate DQN target network - #DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') - - #update_target_op = update_target_network('DQN_main', 'DQN_target') + # Create environment + env = gym_super_mario_bros.make(self.env) + num_actions = 7 - # Create session - config = tf.ConfigProto(allow_soft_placement=True) - config.gpu_options.allow_growth = True - sess = tf.Session(config=config) + # Initialise replay memory and state buffer + replay_mem = ReplayMemory(self) # don't exist yet ---------------------------------- + state_buf = StateBuffer(self) - # Add summaries for Tensorboard visualisation - tf.summary.scalar('Loss', DQN.loss) - reward_var = tf.Variable(0.0, trainable=False) - tf.summary.scalar("Episode Reward", reward_var) - epsilon_var = tf.Variable(args.epsilon_start, trainable=False) - tf.summary.scalar("Exploration Rate", epsilon_var) - summary_op = tf.summary.merge_all() + # Define input placeholders + state_ph = tf.placeholder(tf.uint8, (None, self.frame_height, self.frame_width, self.frames_per_state)) + action_ph = tf.placeholder(tf.int32, (None)) + target_ph = tf.placeholder(tf.float32, (None)) - # Define saver for saving model ckpts - model_name = 'model.ckpt' - checkpoint_path = os.path.join(args.ckpt_dir, model_name) - if not os.path.exists(args.ckpt_dir): - os.makedirs(args.ckpt_dir) - saver = tf.train.Saver(max_to_keep=201) - - # Create summary writer to write summaries to disk - if not os.path.exists(args.log_dir): - os.makedirs(args.log_dir) - summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph) - - # Load ckpt file if given - if args.ckpt_file is not None: - loader = tf.train.Saver() #Restore all variables from ckpt - ckpt = args.ckpt_dir + '/' + args.ckpt_file - ckpt_split = ckpt.split('-') - step_str = ckpt_split[-1] - start_step = int(step_str) - loader.restore(sess, ckpt) - else: - start_step = 0 - sess.run(tf.global_variables_initializer()) - sess.run(update_target_op) - + # Instantiate DQN network + #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, self.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. 
cannot name this scope 'DQN' and + # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) + DQN = Model(240, 256) + DQN_predict_op = DQN.predict() #talk to rithvik -------------------------------------------------- + DQN_train_step_op = DQN.train_step() - ## Begin training - - env.reset() - - ep_steps = 0 - episode_reward = 0 - episode_rewards = [] - duration_values = [] - - # Initially populate replay memory by taking random actions - - for random_step in range(1, args.initial_replay_mem_size+1): + # Instantiate DQN target network + #DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') - if args.render: - env.render() - else: - env.render(mode='rgb_array') + #update_target_op = update_target_network('DQN_main', 'DQN_target') + + # Create session + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + sess = tf.Session(config=config) + + # Add summaries for Tensorboard visualisation + tf.summary.scalar('Loss', DQN.loss) + reward_var = tf.Variable(0.0, trainable=False) + tf.summary.scalar("Episode Reward", reward_var) + epsilon_var = tf.Variable(self.epsilon_start, trainable=False) + tf.summary.scalar("Exploration Rate", epsilon_var) + summary_op = tf.summary.merge_all() + + # Define saver for saving model ckpts + model_name = 'model.ckpt' + checkpoint_path = os.path.join(self.ckpt_dir, model_name) + if not os.path.exists(self.ckpt_dir): + os.makedirs(self.ckpt_dir) + saver = tf.train.Saver(max_to_keep=201) - action = env.action_space.sample() #get an action ------------------------------------------------------ - frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, args.frame_width, args.frame_height) #should be function from utils --------------- - replay_mem.add(action, reward, frame, terminal) + # Create summary writer to write summaries to disk + if not os.path.exists(self.log_dir): + os.makedirs(self.log_dir) + summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph) - if terminal: - env.reset() + # Load ckpt file if given + if self.ckpt_file is not None: + loader = tf.train.Saver() #Restore all variables from ckpt + ckpt = self.ckpt_dir + '/' + self.ckpt_file + ckpt_split = ckpt.split('-') + step_str = ckpt_split[-1] + start_step = int(step_str) + loader.restore(sess, ckpt) + else: + start_step = 0 + sess.run(tf.global_variables_initializer()) + sess.run(update_target_op) + + + ## Begin training - #sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size)) - #sys.stdout.flush() - - # Begin training process - reset_env_and_state_buffer(env, state_buf, args) #should be function from utils ----------------------------- - #sys.stdout.write('\n\nTraining...\n\n') - #sys.stdout.flush() - - for train_step in range(start_step+1, args.num_steps_train+1): - start_time = time.time() - # Run 'train_frequency' iterations in the game for every training step - for _ in range(0, args.train_frequency): - ep_steps += 1 + env.reset() + + ep_steps = 0 + episode_reward = 0 + episode_rewards = [] + duration_values = [] + + # Initially populate replay memory by taking random actions + + for random_step in range(1, self.initial_replay_mem_size+1): - if args.render: + if self.render: env.render() else: env.render(mode='rgb_array') - # Use an epsilon-greedy policy to select action - epsilon = exploration_rate(train_step, args.epsilon_start, args.epsilon_end, args.epsilon_step_end) - if random.random() < epsilon: - #Choose random action 
- action = env.action_space.sample() - else: - #Choose action with highest Q-value according to network's current policy - current_state = np.expand_dims(state_buf.get_state(), 0) - action = sess.run(DQN_predict_op, {state_ph:current_state}) - - # Take action and store experience + action = env.action_space.sample() #get an action ------------------------------------------------------ frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, args.frame_width, args.frame_height) # again utils ------------------- - state_buf.add(frame) - replay_mem.add(action, reward, frame, terminal) - episode_reward += reward + frame = preprocess_image(frame, self.frame_width, self.frame_height) #should be function from utils --------------- + replay_mem.add(action, reward, frame, terminal) - if terminal or ep_steps == args.max_ep_steps: - # Collect total reward of episode - episode_rewards.append(episode_reward) - # Reset episode reward and episode steps counters - episode_reward = 0 - ep_steps = 0 - # Reset environment and state buffer for next episode - reset_env_and_state_buffer(env, state_buf, args) #utilsssss -------------------- - - ## Training step - # Get minibatch from replay mem - states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() - # Calculate target by passing next states through the target network and finding max future Q - #future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) - max_future_Q = np.max(future_Q, axis=1) #actually this one i don't know if should be utils or not ------------------ - # Q values of the terminal states is 0 by definition - max_future_Q[terminals_batch] = 0 - targets = rewards_batch + (max_future_Q*args.discount_rate) - - # Execute training step - if train_step % args.save_log_step == 0: - # Train and save logs - average_reward = sum(episode_rewards)/len(episode_rewards) - summary_str, _ = sess.run([summary_op, DQN_train_step_op], {state_ph:states_batch, action_ph:actions_batch, target_ph:targets, reward_var: average_reward, epsilon_var: epsilon}) - summary_writer.add_summary(summary_str, train_step) - # Reset rewards buffer - episode_rewards = [] - else: - # Just train - _ = sess.run(DQN_train_step_op, {state_ph:states_batch, action_ph:actions_batch, target_ph:targets}) - - # Update target networks - if train_step % args.update_target_step == 0: - sess.run(update_target_op) # i'm not sure where this comes from-------------------------- - - # Calculate time per step and display progress to console - duration = time.time() - start_time - duration_values.append(duration) - ave_duration = sum(duration_values)/float(len(duration_values)) + if terminal: + env.reset() + + #sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, self.initial_replay_mem_size)) + #sys.stdout.flush() - #sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, args.num_steps_train, ave_duration)) - #sys.stdout.flush() + # Begin training process + reset_env_and_state_buffer(env, state_buf, None) #should be function from utils ----------------------------- REPLACE NONE LATER + #sys.stdout.write('\n\nTraining...\n\n') + #sys.stdout.flush() - # Save checkpoint - if train_step % args.save_ckpt_step == 0: - saver.save(sess, checkpoint_path, global_step=train_step) - sys.stdout.write('\n Checkpoint saved\n') - sys.stdout.flush() + for train_step in range(start_step+1, self.num_steps_train+1): + start_time = time.time() + # Run 'train_frequency' iterations in the game for every 
training step + for _ in range(0, self.train_frequency): + ep_steps += 1 + + if self.render: + env.render() + else: + env.render(mode='rgb_array') + + # Use an epsilon-greedy policy to select action + epsilon = exploration_rate(train_step, self.epsilon_start, self.epsilon_end, self.epsilon_step_end) + if random.random() < epsilon: + #Choose random action + action = env.action_space.sample() + else: + #Choose action with highest Q-value according to network's current policy + current_state = np.expand_dims(state_buf.get_state(), 0) + action = sess.run(DQN_predict_op, {state_ph:current_state}) + + # Take action and store experience + frame, reward, terminal, _ = env.step(action) + frame = preprocess_image(frame, self.frame_width, self.frame_height) # again utils ------------------- + state_buf.add(frame) + replay_mem.add(action, reward, frame, terminal) + episode_reward += reward + + if terminal or ep_steps == self.max_ep_steps: + # Collect total reward of episode + episode_rewards.append(episode_reward) + # Reset episode reward and episode steps counters + episode_reward = 0 + ep_steps = 0 + # Reset environment and state buffer for next episode + reset_env_and_state_buffer(env, state_buf, self) #utilsssss -------------------- - # Reset time calculation - duration_values = [] + ## Training step + # Get minibatch from replay mem + states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() + # Calculate target by passing next states through the target network and finding max future Q + #future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) + max_future_Q = np.max(future_Q, axis=1) #actually this one i don't know if should be utils or not ------------------ + # Q values of the terminal states is 0 by definition + max_future_Q[terminals_batch] = 0 + targets = rewards_batch + (max_future_Q*self.discount_rate) - - -if __name__ == '__main__': - train_args = get_train_args() - train(train_args) \ No newline at end of file + # Execute training step + if train_step % self.save_log_step == 0: + # Train and save logs + average_reward = sum(episode_rewards)/len(episode_rewards) + summary_str, _ = sess.run([summary_op, DQN_train_step_op], {state_ph:states_batch, action_ph:actions_batch, target_ph:targets, reward_var: average_reward, epsilon_var: epsilon}) + summary_writer.add_summary(summary_str, train_step) + # Reset rewards buffer + episode_rewards = [] + else: + # Just train + _ = sess.run(DQN_train_step_op, {state_ph:states_batch, action_ph:actions_batch, target_ph:targets}) + + # Update target networks + if train_step % self.update_target_step == 0: + sess.run(update_target_op) # i'm not sure where this comes from-------------------------- + + # Calculate time per step and display progress to console + duration = time.time() - start_time + duration_values.append(duration) + ave_duration = sum(duration_values)/float(len(duration_values)) + + #sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, self.num_steps_train, ave_duration)) + #sys.stdout.flush() + + # Save checkpoint + if train_step % self.save_ckpt_step == 0: + saver.save(sess, checkpoint_path, global_step=train_step) + sys.stdout.write('\n Checkpoint saved\n') + sys.stdout.flush() + + # Reset time calculation + duration_values = [] + \ No newline at end of file diff --git a/test/test_train.py b/test/test_train.py new file mode 100644 index 0000000..02d4c0d --- /dev/null +++ b/test/test_train.py @@ -0,0 +1,19 @@ +import os + +from src.Model import Model 
+from src.train import Train + +def test_train_init(): + model = Model(100,50) + trainer = Train(env='SuperMarioBros-1-1-v0') + assert(trainer.env == 'SuperMarioBros-1-1-v0') + assert(trainer.frame_width == 240) + assert(trainer.frame_height == 256) + +def test_train_train(): + model = Model(100,50) + trainer = Train(env='SuperMarioBros-1-1-v0') + trainer.train() + assert(os.path.exists('./ckpts')) + + \ No newline at end of file From d9428916bd55ef15b278ba63c3d46a313685f5b4 Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Mon, 25 Oct 2021 10:39:03 -0400 Subject: [PATCH 6/7] why no worky --- src/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/train.py b/src/train.py index ffe9751..4fb05b2 100644 --- a/src/train.py +++ b/src/train.py @@ -12,10 +12,10 @@ import numpy as np import time import random -import utils -import ReplayMemory # doesn't exist --------------------------------------------------------- +import ReplayMemory import StateBuffer import Model +import utils class Train(): @@ -90,7 +90,7 @@ def update_target_network(from_scope, to_scope): #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, self.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) DQN = Model(240, 256) - DQN_predict_op = DQN.predict() #talk to rithvik -------------------------------------------------- + DQN_predict_op = DQN.predict() DQN_train_step_op = DQN.train_step() # Instantiate DQN target network From 6d7897ddbd95dabd3c02db223c769276b9ea0ba5 Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Fri, 5 Nov 2021 12:15:57 -0400 Subject: [PATCH 7/7] why not --- src/train.py | 456 +++++++++++++++++++++++++++------------------------ 1 file changed, 238 insertions(+), 218 deletions(-) diff --git a/src/train.py b/src/train.py index 4fb05b2..41ef59b 100644 --- a/src/train.py +++ b/src/train.py @@ -1,253 +1,273 @@ ''' ## Train ## -# Adapted from code to train Deep Q Network on OpenAI Gym environments +# Code to train Deep Q Network on OpenAI Gym environments @author: Mark Sinton (msinto93@gmail.com) ''' import os import sys import argparse -import gym_super_mario_bros +import gym import tensorflow as tf import numpy as np import time import random -import ReplayMemory -import StateBuffer -import Model -import utils - -class Train(): - def __init__(env='SuperMarioBros-1-1-v0', render=False, random_seed=1234, frame_width=240, frame_height=256, - frames_per_state=4, num_steps_train = 50000000, train_frequency=4, max_ep_steps=2000, batch_size=32, - learning_rate=.00025, replay_mem_size=1000000, intitial_replay_mem_size=50000, epsilon_start=1.0, - epsilon_end=0.1, epsilon_step_end=1000000, discount_rate=0.99, update_target_step=10000, - save_ckpt_step=250000, save_log_step=1000, ckpt_dir='./ckpts', ckpt_file=None, log_dir='./logs/train'): - self.env = env - self.render = render - self.random_seed = random_seed - self.frame_width = frame_width - self.frame_height = frame_height - self.frames_per_state = frames_per_state - self.num_steps_train = num_steps_train - self.train_frequency = train_frequency - self.max_ep_steps = max_ep_steps - self.batch_size = batch_size - self.learning_rate = learning_rate - self.replay_mem_size = replay_mem_size - self.initial_replay_mem_size = intitial_replay_mem_size - self.epsilon_start = epsilon_start - self.epsilon_end = epsilon_end - self.epsilon_step_end 
= epsilon_step_end - self.discount_rate = discount_rate - self.update_target_step = update_target_step - self.save_ckpt_step = save_ckpt_step - self.save_log_step = save_log_step - self.ckpt_dir = ckpt_dir - self.ckpt_file = ckpt_file - self.log_dir = log_dir - - def train(self): - - # Function to return exploration rate based on current step - def exploration_rate(current_step, exp_rate_start, exp_rate_end, exp_step_end): - if current_step < exp_step_end: - exploration_rate = current_step * ((exp_rate_end-exp_rate_start)/(float(exp_step_end))) + 1 - else: - exploration_rate = exp_rate_end - - return exploration_rate - - # Function to update target network parameters with main network parameters - def update_target_network(from_scope, to_scope): - from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) - to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) - - op_holder = [] - - # Update old network parameters with new network parameters - for from_var,to_var in zip(from_vars,to_vars): - op_holder.append(to_var.assign(from_var)) +from nes_py.wrappers import JoypadSpace +import gym_super_mario_bros +from gym_super_mario_bros.actions import SIMPLE_MOVEMENT + +from utils.utils import preprocess_image, reset_env_and_state_buffer +from utils.experience_replay import ReplayMemory +from utils.state_buffer import StateBuffer +from utils.network import DeepQNetwork + +def get_train_args(args=None): + train_params = argparse.ArgumentParser() + + # Environment parameters + train_params.add_argument("--env", type=str, default='SuperMarioBros-1-1-v0', help="Environment to use (must have RGB image state space and discrete action space)") + train_params.add_argument("--render", type=bool, default=False, help="Whether or not to display the environment on the screen during training") + train_params.add_argument("--random_seed", type=int, default=1234, help="Random seed for reproducability") + train_params.add_argument("--frame_width", type=int, default=105, help="Frame width after resize.") + train_params.add_argument("--frame_height", type=int, default=80, help="Frame height after resize.") + train_params.add_argument("--frames_per_state", type=int, default=4, help="Sequence of frames which constitutes a single state.") + + # Training parameters + train_params.add_argument("--num_steps_train", type=int, default=50000000, help="Number of steps to train for") + train_params.add_argument("--train_frequency", type=int, default=4, help="Perform training step every N game steps.") + train_params.add_argument("--max_ep_steps", type=int, default=2000, help="Maximum number of steps per episode") + train_params.add_argument("--batch_size", type=int, default=32) + train_params.add_argument("--learning_rate", type=float, default=0.00025) + train_params.add_argument("--replay_mem_size", type=int, default=1000000, help="Maximum size of replay memory buffer") + train_params.add_argument("--initial_replay_mem_size", type=int, default=50000, help="Initial size of replay memory (populated by random actions) before learning can start") + train_params.add_argument("--epsilon_start", type=float, default=1.0, help="Exploration rate at the beginning of training.") + train_params.add_argument("--epsilon_end", type=float, default=0.1, help="Exploration rate at the end of decay.") + train_params.add_argument("--epsilon_step_end", type=int, default=1000000, help="After how many steps to stop decaying the exploration rate.") + train_params.add_argument("--discount_rate", type=float, 
default=0.99, help="Discount rate (gamma) for future rewards.") + train_params.add_argument("--update_target_step", type=float, default=10000, help="Copy current network parameters to target network every N steps.") + train_params.add_argument("--save_ckpt_step", type=float, default=250000, help="Save checkpoint every N steps") + train_params.add_argument("--save_log_step", type=int, default=1000, help="Save logs every N steps") + + # Files/directories + train_params.add_argument("--ckpt_dir", type=str, default='./ckpts', help="Directory for saving/loading checkpoints") + train_params.add_argument("--ckpt_file", type=str, default=None, help="Checkpoint file to load and resume training from (if None, train from scratch)") + train_params.add_argument("--log_dir", type=str, default='./logs/train', help="Directory for saving logs") + + return train_params.parse_args(args) + + +def train(args): + + # Function to return exploration rate based on current step + def exploration_rate(current_step, exp_rate_start, exp_rate_end, exp_step_end): + if current_step < exp_step_end: + exploration_rate = current_step * ((exp_rate_end-exp_rate_start)/(float(exp_step_end))) + 1 + else: + exploration_rate = exp_rate_end - return op_holder - - - # Create environment - env = gym_super_mario_bros.make(self.env) - num_actions = 7 + return exploration_rate + + # Function to update target network parameters with main network parameters + def update_target_network(from_scope, to_scope): + from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) + to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) + + op_holder = [] - # Initialise replay memory and state buffer - replay_mem = ReplayMemory(self) # don't exist yet ---------------------------------- - state_buf = StateBuffer(self) + # Update old network parameters with new network parameters + for from_var,to_var in zip(from_vars,to_vars): + op_holder.append(to_var.assign(from_var)) - # Define input placeholders - state_ph = tf.placeholder(tf.uint8, (None, self.frame_height, self.frame_width, self.frames_per_state)) - action_ph = tf.placeholder(tf.int32, (None)) - target_ph = tf.placeholder(tf.float32, (None)) + return op_holder + + + # Create environment + env = gym_super_mario_bros.make(args.env) + env = JoypadSpace(env, SIMPLE_MOVEMENT) + num_actions = env.action_space.n + + # Initialise replay memory and state buffer + replay_mem = ReplayMemory(args) + state_buf = StateBuffer(args) + + # Define input placeholders + state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) + action_ph = tf.placeholder(tf.int32, (None)) + target_ph = tf.placeholder(tf.float32, (None)) + + # Instantiate DQN network + DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and + # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) + DQN_predict_op = DQN.predict() + DQN_train_step_op = DQN.train_step() + + # Instantiate DQN target network + DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') + + update_target_op = update_target_network('DQN_main', 'DQN_target') - # Instantiate DQN network - #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, self.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. 
cannot name this scope 'DQN' and - # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) - DQN = Model(240, 256) - DQN_predict_op = DQN.predict() - DQN_train_step_op = DQN.train_step() + # Create session + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + sess = tf.Session(config=config) - # Instantiate DQN target network - #DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') + # Add summaries for Tensorboard visualisation + tf.summary.scalar('Loss', DQN.loss) + reward_var = tf.Variable(0.0, trainable=False) + tf.summary.scalar("Episode_Reward", reward_var) + epsilon_var = tf.Variable(args.epsilon_start, trainable=False) + tf.summary.scalar("Exploration_Rate", epsilon_var) + summary_op = tf.summary.merge_all() - #update_target_op = update_target_network('DQN_main', 'DQN_target') - - # Create session - config = tf.ConfigProto(allow_soft_placement=True) - config.gpu_options.allow_growth = True - sess = tf.Session(config=config) - - # Add summaries for Tensorboard visualisation - tf.summary.scalar('Loss', DQN.loss) - reward_var = tf.Variable(0.0, trainable=False) - tf.summary.scalar("Episode Reward", reward_var) - epsilon_var = tf.Variable(self.epsilon_start, trainable=False) - tf.summary.scalar("Exploration Rate", epsilon_var) - summary_op = tf.summary.merge_all() - - # Define saver for saving model ckpts - model_name = 'model.ckpt' - checkpoint_path = os.path.join(self.ckpt_dir, model_name) - if not os.path.exists(self.ckpt_dir): - os.makedirs(self.ckpt_dir) - saver = tf.train.Saver(max_to_keep=201) + # Define saver for saving model ckpts + model_name = 'model.ckpt' + checkpoint_path = os.path.join(args.ckpt_dir, model_name) + if not os.path.exists(args.ckpt_dir): + os.makedirs(args.ckpt_dir) + saver = tf.train.Saver(max_to_keep=201) + + # Create summary writer to write summaries to disk + if not os.path.exists(args.log_dir): + os.makedirs(args.log_dir) + summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph) + + # Load ckpt file if given + if args.ckpt_file is not None: + loader = tf.train.Saver() #Restore all variables from ckpt + ckpt = args.ckpt_dir + '/' + args.ckpt_file + ckpt_split = ckpt.split('-') + step_str = ckpt_split[-1] + start_step = int(step_str) + loader.restore(sess, ckpt) + else: + start_step = 0 + sess.run(tf.global_variables_initializer()) + sess.run(update_target_op) + - # Create summary writer to write summaries to disk - if not os.path.exists(self.log_dir): - os.makedirs(self.log_dir) - summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph) + ## Begin training + + env.reset() + + ep_steps = 0 + episode_reward = 0 + episode_rewards = [] + duration_values = [] + + # Initially populate replay memory by taking random actions + sys.stdout.write('\nPopulating replay memory with random actions...\n') + sys.stdout.flush() + + for random_step in range(1, args.initial_replay_mem_size+1): - # Load ckpt file if given - if self.ckpt_file is not None: - loader = tf.train.Saver() #Restore all variables from ckpt - ckpt = self.ckpt_dir + '/' + self.ckpt_file - ckpt_split = ckpt.split('-') - step_str = ckpt_split[-1] - start_step = int(step_str) - loader.restore(sess, ckpt) + if args.render: + env.render() else: - start_step = 0 - sess.run(tf.global_variables_initializer()) - sess.run(update_target_op) - - - ## Begin training - - env.reset() + env.render(mode='rgb_array') - ep_steps = 0 - episode_reward = 0 - episode_rewards = [] - duration_values = [] 
- - # Initially populate replay memory by taking random actions + action = env.action_space.sample() + frame, reward, terminal, _ = env.step(action) + frame = preprocess_image(frame, args.frame_width, args.frame_height) + replay_mem.add(action, reward, frame, terminal) - for random_step in range(1, self.initial_replay_mem_size+1): + if terminal: + env.reset() + + sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size)) + sys.stdout.flush() + + # Begin training process + reset_env_and_state_buffer(env, state_buf, args) + sys.stdout.write('\n\nTraining...\n\n') + sys.stdout.flush() + + for train_step in range(start_step+1, args.num_steps_train+1): + start_time = time.time() + # Run 'train_frequency' iterations in the game for every training step + for _ in range(0, args.train_frequency): + ep_steps += 1 - if self.render: + if args.render: env.render() else: env.render(mode='rgb_array') - action = env.action_space.sample() #get an action ------------------------------------------------------ + # Use an epsilon-greedy policy to select action + epsilon = exploration_rate(train_step, args.epsilon_start, args.epsilon_end, args.epsilon_step_end) + if random.random() < epsilon: + #print("random :(") + #Choose random action + action = env.action_space.sample() + else: + #print("greedy :)") + #Choose action with highest Q-value according to network's current policy + current_state = np.expand_dims(state_buf.get_state(), 0) + action = sess.run(DQN_predict_op, {state_ph:current_state})[0] + + # Take action and store experience + #print(action) frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, self.frame_width, self.frame_height) #should be function from utils --------------- - replay_mem.add(action, reward, frame, terminal) + + frame = preprocess_image(frame, args.frame_width, args.frame_height) + state_buf.add(frame) + replay_mem.add(action, reward, frame, terminal) + episode_reward += reward - if terminal: - env.reset() - - #sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, self.initial_replay_mem_size)) - #sys.stdout.flush() + if terminal or ep_steps == args.max_ep_steps: + # Collect total reward of episode + episode_rewards.append(episode_reward) + # Reset episode reward and episode steps counters + episode_reward = 0 + ep_steps = 0 + # Reset environment and state buffer for next episode + reset_env_and_state_buffer(env, state_buf, args) - # Begin training process - reset_env_and_state_buffer(env, state_buf, None) #should be function from utils ----------------------------- REPLACE NONE LATER - #sys.stdout.write('\n\nTraining...\n\n') - #sys.stdout.flush() + ## Training step + # Get minibatch from replay mem + states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() + # Calculate target by passing next states through the target network and finding max future Q + future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) + max_future_Q = np.max(future_Q, axis=1) + # Q values of the terminal states is 0 by definition + max_future_Q[terminals_batch] = 0 + targets = rewards_batch + (max_future_Q*args.discount_rate) - for train_step in range(start_step+1, self.num_steps_train+1): - start_time = time.time() - # Run 'train_frequency' iterations in the game for every training step - for _ in range(0, self.train_frequency): - ep_steps += 1 - - if self.render: - env.render() - else: - env.render(mode='rgb_array') - - # Use an epsilon-greedy policy to select action - 
epsilon = exploration_rate(train_step, self.epsilon_start, self.epsilon_end, self.epsilon_step_end) - if random.random() < epsilon: - #Choose random action - action = env.action_space.sample() - else: - #Choose action with highest Q-value according to network's current policy - current_state = np.expand_dims(state_buf.get_state(), 0) - action = sess.run(DQN_predict_op, {state_ph:current_state}) - - # Take action and store experience - frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, self.frame_width, self.frame_height) # again utils ------------------- - state_buf.add(frame) - replay_mem.add(action, reward, frame, terminal) - episode_reward += reward - - if terminal or ep_steps == self.max_ep_steps: - # Collect total reward of episode - episode_rewards.append(episode_reward) - # Reset episode reward and episode steps counters - episode_reward = 0 - ep_steps = 0 - # Reset environment and state buffer for next episode - reset_env_and_state_buffer(env, state_buf, self) #utilsssss -------------------- - - ## Training step - # Get minibatch from replay mem - states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() - # Calculate target by passing next states through the target network and finding max future Q - #future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) - max_future_Q = np.max(future_Q, axis=1) #actually this one i don't know if should be utils or not ------------------ - # Q values of the terminal states is 0 by definition - max_future_Q[terminals_batch] = 0 - targets = rewards_batch + (max_future_Q*self.discount_rate) - - # Execute training step - if train_step % self.save_log_step == 0: - # Train and save logs - average_reward = sum(episode_rewards)/len(episode_rewards) - summary_str, _ = sess.run([summary_op, DQN_train_step_op], {state_ph:states_batch, action_ph:actions_batch, target_ph:targets, reward_var: average_reward, epsilon_var: epsilon}) - summary_writer.add_summary(summary_str, train_step) - # Reset rewards buffer - episode_rewards = [] - else: - # Just train - _ = sess.run(DQN_train_step_op, {state_ph:states_batch, action_ph:actions_batch, target_ph:targets}) - - # Update target networks - if train_step % self.update_target_step == 0: - sess.run(update_target_op) # i'm not sure where this comes from-------------------------- - - # Calculate time per step and display progress to console - duration = time.time() - start_time - duration_values.append(duration) - ave_duration = sum(duration_values)/float(len(duration_values)) + # Execute training step + if train_step % args.save_log_step == 0: + # Train and save logs + average_reward = sum(episode_rewards)/len(episode_rewards) + summary_str, _ = sess.run([summary_op, DQN_train_step_op], {state_ph:states_batch, action_ph:actions_batch, target_ph:targets, reward_var: average_reward, epsilon_var: epsilon}) + summary_writer.add_summary(summary_str, train_step) + # Reset rewards buffer + episode_rewards = [] + else: + # Just train + _ = sess.run(DQN_train_step_op, {state_ph:states_batch, action_ph:actions_batch, target_ph:targets}) + + # Update target networks + if train_step % args.update_target_step == 0: + sess.run(update_target_op) + + # Calculate time per step and display progress to console + duration = time.time() - start_time + duration_values.append(duration) + ave_duration = sum(duration_values)/float(len(duration_values)) + + sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, 
args.num_steps_train, ave_duration)) + sys.stdout.flush() + + # Save checkpoint + if train_step % args.save_ckpt_step == 0: + saver.save(sess, checkpoint_path, global_step=train_step) + sys.stdout.write('\n Checkpoint saved\n') + sys.stdout.flush() - #sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, self.num_steps_train, ave_duration)) - #sys.stdout.flush() + # Reset time calculation + duration_values = [] - # Save checkpoint - if train_step % self.save_ckpt_step == 0: - saver.save(sess, checkpoint_path, global_step=train_step) - sys.stdout.write('\n Checkpoint saved\n') - sys.stdout.flush() - - # Reset time calculation - duration_values = [] - \ No newline at end of file + + +if __name__ == '__main__': + train_args = get_train_args() + train(train_args) \ No newline at end of file
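
For quick reference, this is the environment setup the final patch lands on, pulled out as a standalone sketch: gym_super_mario_bros wrapped in JoypadSpace with SIMPLE_MOVEMENT, which is why env.action_space.n now yields the 7 actions that the intermediate patches hardcoded. Only the random-step loop around it is illustrative.

import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

# Same construction as in [PATCH 7/7]: the raw env wrapped so the action
# space is the 7 SIMPLE_MOVEMENT button combinations.
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
print(env.action_space.n)  # 7

# Illustrative smoke test: a few random steps, resetting on terminal.
env.reset()
for _ in range(10):
    frame, reward, terminal, info = env.step(env.action_space.sample())
    if terminal:
        env.reset()
env.close()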
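
The two pieces of arithmetic the training loop relies on — the linear epsilon decay and the Bellman targets built from the target network's Q-values — also work as a plain NumPy sketch, independent of the TensorFlow graph. The function names here are illustrative rather than taken from the patches, and the decay intercept is written as eps_start instead of the hardcoded 1, which is equivalent only while epsilon_start is 1.0.

import numpy as np

def exploration_rate(step, eps_start, eps_end, eps_step_end):
    # Linear decay from eps_start to eps_end over eps_step_end steps,
    # then held constant at eps_end.
    if step < eps_step_end:
        return eps_start + step * (eps_end - eps_start) / float(eps_step_end)
    return eps_end

def q_targets(rewards, future_q, terminals, discount_rate=0.99):
    # Bellman targets r + gamma * max_a' Q_target(s', a'); terminal
    # transitions contribute no future value, as in the training step.
    max_future_q = np.max(future_q, axis=1)
    max_future_q[terminals] = 0.0
    return rewards + discount_rate * max_future_q

# Example with a minibatch of 2 transitions and 7 actions.
print(exploration_rate(500000, 1.0, 0.1, 1000000))          # 0.55
print(q_targets(np.array([1.0, 0.0]),
                np.arange(14, dtype=float).reshape(2, 7),
                np.array([False, True])))                   # [6.94 0.  ]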