From 209f31b452cb3c50b4bf6c6caf94649d1514d46c Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Sun, 10 Oct 2021 20:10:32 -0400 Subject: [PATCH 1/7] not supposed to work: just a base for linking issue --- src/train.py | 264 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 src/train.py diff --git a/src/train.py b/src/train.py new file mode 100644 index 0000000..4b8d76b --- /dev/null +++ b/src/train.py @@ -0,0 +1,264 @@ +''' +## Train ## +# Code to train Deep Q Network on OpenAI Gym environments +@author: Mark Sinton (msinto93@gmail.com) +''' +'^^^should i get rid of this since Ive made a lot of changes or no?' +import os +import sys +import argparse +import gym_super_mario_bros +import tensorflow as tf +import numpy as np +import time +import random + +from utils.utils import preprocess_image, reset_env_and_state_buffer +from utils.experience_replay import ReplayMemory +from utils.state_buffer import StateBuffer +from utils.network import DeepQNetwork + +def get_train_args(): + train_params = argparse.ArgumentParser() + + # Environment parameters + train_params.add_argument("--env", type=str, default='BreakoutDeterministic-v4', help="Environment to use (must have RGB image state space and discrete action space)") + train_params.add_argument("--render", type=bool, default=False, help="Whether or not to display the environment on the screen during training") + train_params.add_argument("--random_seed", type=int, default=1234, help="Random seed for reproducability") + train_params.add_argument("--frame_width", type=int, default=105, help="Frame width after resize.") + train_params.add_argument("--frame_height", type=int, default=80, help="Frame height after resize.") + train_params.add_argument("--frames_per_state", type=int, default=4, help="Sequence of frames which constitutes a single state.") + + # Training parameters + train_params.add_argument("--num_steps_train", type=int, default=50000000, help="Number of steps to train for") + train_params.add_argument("--train_frequency", type=int, default=4, help="Perform training step every N game steps.") + train_params.add_argument("--max_ep_steps", type=int, default=2000, help="Maximum number of steps per episode") + train_params.add_argument("--batch_size", type=int, default=32) + train_params.add_argument("--learning_rate", type=float, default=0.00025) + train_params.add_argument("--replay_mem_size", type=int, default=1000000, help="Maximum size of replay memory buffer") + train_params.add_argument("--initial_replay_mem_size", type=int, default=50000, help="Initial size of replay memory (populated by random actions) before learning can start") + train_params.add_argument("--epsilon_start", type=float, default=1.0, help="Exploration rate at the beginning of training.") + train_params.add_argument("--epsilon_end", type=float, default=0.1, help="Exploration rate at the end of decay.") + train_params.add_argument("--epsilon_step_end", type=int, default=1000000, help="After how many steps to stop decaying the exploration rate.") + train_params.add_argument("--discount_rate", type=float, default=0.99, help="Discount rate (gamma) for future rewards.") + train_params.add_argument("--update_target_step", type=float, default=10000, help="Copy current network parameters to target network every N steps.") + train_params.add_argument("--save_ckpt_step", type=float, default=250000, help="Save checkpoint every N steps") + train_params.add_argument("--save_log_step", type=int, default=1000, 
help="Save logs every N steps") + + # Files/directories + train_params.add_argument("--ckpt_dir", type=str, default='./ckpts', help="Directory for saving/loading checkpoints") + train_params.add_argument("--ckpt_file", type=str, default=None, help="Checkpoint file to load and resume training from (if None, train from scratch)") + train_params.add_argument("--log_dir", type=str, default='./logs/train', help="Directory for saving logs") + + return train_params.parse_args() + + +def train(args): + + # Function to return exploration rate based on current step + def exploration_rate(current_step, exp_rate_start, exp_rate_end, exp_step_end): + if current_step < exp_step_end: + exploration_rate = current_step * ((exp_rate_end-exp_rate_start)/(float(exp_step_end))) + 1 + else: + exploration_rate = exp_rate_end + + return exploration_rate + + # Function to update target network parameters with main network parameters + def update_target_network(from_scope, to_scope): + from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) + to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) + + op_holder = [] + + # Update old network parameters with new network parameters + for from_var,to_var in zip(from_vars,to_vars): + op_holder.append(to_var.assign(from_var)) + + return op_holder + + + # Create environment + env = gym_super_mario_bros.make(args.env) + num_actions = env.action_space.n + + # Initialise replay memory and state buffer + replay_mem = ReplayMemory(args) + state_buf = StateBuffer(args) + + # Define input placeholders + state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) + action_ph = tf.placeholder(tf.int32, (None)) + target_ph = tf.placeholder(tf.float32, (None)) + + # Instantiate DQN network + DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. 
cannot name this scope 'DQN' and + # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) + DQN_predict_op = DQN.predict() + DQN_train_step_op = DQN.train_step() + + # Instantiate DQN target network + DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') + + update_target_op = update_target_network('DQN_main', 'DQN_target') + + # Create session + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + sess = tf.Session(config=config) + + # Add summaries for Tensorboard visualisation + tf.summary.scalar('Loss', DQN.loss) + reward_var = tf.Variable(0.0, trainable=False) + tf.summary.scalar("Episode Reward", reward_var) + epsilon_var = tf.Variable(args.epsilon_start, trainable=False) + tf.summary.scalar("Exploration Rate", epsilon_var) + summary_op = tf.summary.merge_all() + + # Define saver for saving model ckpts + model_name = 'model.ckpt' + checkpoint_path = os.path.join(args.ckpt_dir, model_name) + if not os.path.exists(args.ckpt_dir): + os.makedirs(args.ckpt_dir) + saver = tf.train.Saver(max_to_keep=201) + + # Create summary writer to write summaries to disk + if not os.path.exists(args.log_dir): + os.makedirs(args.log_dir) + summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph) + + # Load ckpt file if given + if args.ckpt_file is not None: + loader = tf.train.Saver() #Restore all variables from ckpt + ckpt = args.ckpt_dir + '/' + args.ckpt_file + ckpt_split = ckpt.split('-') + step_str = ckpt_split[-1] + start_step = int(step_str) + loader.restore(sess, ckpt) + else: + start_step = 0 + sess.run(tf.global_variables_initializer()) + sess.run(update_target_op) + + + ## Begin training + + env.reset() + + ep_steps = 0 + episode_reward = 0 + episode_rewards = [] + duration_values = [] + + # Initially populate replay memory by taking random actions + sys.stdout.write('\nPopulating replay memory with random actions...\n') + sys.stdout.flush() + + for random_step in range(1, args.initial_replay_mem_size+1): + + if args.render: + env.render() + else: + env.render(mode='rgb_array') + + action = env.action_space.sample() + frame, reward, terminal, _ = env.step(action) + frame = preprocess_image(frame, args.frame_width, args.frame_height) + replay_mem.add(action, reward, frame, terminal) + + if terminal: + env.reset() + + sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size)) + sys.stdout.flush() + + # Begin training process + reset_env_and_state_buffer(env, state_buf, args) + sys.stdout.write('\n\nTraining...\n\n') + sys.stdout.flush() + + for train_step in range(start_step+1, args.num_steps_train+1): + start_time = time.time() + # Run 'train_frequency' iterations in the game for every training step + for _ in range(0, args.train_frequency): + ep_steps += 1 + + if args.render: + env.render() + else: + env.render(mode='rgb_array') + + # Use an epsilon-greedy policy to select action + epsilon = exploration_rate(train_step, args.epsilon_start, args.epsilon_end, args.epsilon_step_end) + if random.random() < epsilon: + #Choose random action + action = env.action_space.sample() + else: + #Choose action with highest Q-value according to network's current policy + current_state = np.expand_dims(state_buf.get_state(), 0) + action = sess.run(DQN_predict_op, {state_ph:current_state}) + + # Take action and store experience + frame, reward, terminal, _ = env.step(action) + frame = preprocess_image(frame, args.frame_width, args.frame_height) + 
state_buf.add(frame) + replay_mem.add(action, reward, frame, terminal) + episode_reward += reward + + if terminal or ep_steps == args.max_ep_steps: + # Collect total reward of episode + episode_rewards.append(episode_reward) + # Reset episode reward and episode steps counters + episode_reward = 0 + ep_steps = 0 + # Reset environment and state buffer for next episode + reset_env_and_state_buffer(env, state_buf, args) + + ## Training step + # Get minibatch from replay mem + states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() + # Calculate target by passing next states through the target network and finding max future Q + future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) + max_future_Q = np.max(future_Q, axis=1) + # Q values of the terminal states is 0 by definition + max_future_Q[terminals_batch] = 0 + targets = rewards_batch + (max_future_Q*args.discount_rate) + + # Execute training step + if train_step % args.save_log_step == 0: + # Train and save logs + average_reward = sum(episode_rewards)/len(episode_rewards) + summary_str, _ = sess.run([summary_op, DQN_train_step_op], {state_ph:states_batch, action_ph:actions_batch, target_ph:targets, reward_var: average_reward, epsilon_var: epsilon}) + summary_writer.add_summary(summary_str, train_step) + # Reset rewards buffer + episode_rewards = [] + else: + # Just train + _ = sess.run(DQN_train_step_op, {state_ph:states_batch, action_ph:actions_batch, target_ph:targets}) + + # Update target networks + if train_step % args.update_target_step == 0: + sess.run(update_target_op) + + # Calculate time per step and display progress to console + duration = time.time() - start_time + duration_values.append(duration) + ave_duration = sum(duration_values)/float(len(duration_values)) + + sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, args.num_steps_train, ave_duration)) + sys.stdout.flush() + + # Save checkpoint + if train_step % args.save_ckpt_step == 0: + saver.save(sess, checkpoint_path, global_step=train_step) + sys.stdout.write('\n Checkpoint saved\n') + sys.stdout.flush() + + # Reset time calculation + duration_values = [] + + + +if __name__ == '__main__': + train_args = get_train_args() + train(train_args) \ No newline at end of file From ee35f3b4b815d47a89508ab43c6c3ed419677710 Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Tue, 19 Oct 2021 10:23:22 -0400 Subject: [PATCH 2/7] a --- src/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/train.py b/src/train.py index 4b8d76b..6f70052 100644 --- a/src/train.py +++ b/src/train.py @@ -17,6 +17,7 @@ from utils.experience_replay import ReplayMemory from utils.state_buffer import StateBuffer from utils.network import DeepQNetwork + def get_train_args(): train_params = argparse.ArgumentParser() From 7b763ca1a771d216ac1888d63024e70463b0ab05 Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Tue, 19 Oct 2021 10:52:39 -0400 Subject: [PATCH 3/7] checkpoint line 140ish --- src/train.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/train.py b/src/train.py index 6f70052..f1962e1 100644 --- a/src/train.py +++ b/src/train.py @@ -1,9 +1,9 @@ ''' ## Train ## -# Code to train Deep Q Network on OpenAI Gym environments +# Adapted from code to train Deep Q Network on OpenAI Gym environments @author: Mark Sinton (msinto93@gmail.com) ''' -'^^^should i get rid of this since Ive made a lot of changes or no?' 
+ import os import sys import argparse @@ -12,12 +12,12 @@ import numpy as np import time import random - -from utils.utils import preprocess_image, reset_env_and_state_buffer -from utils.experience_replay import ReplayMemory -from utils.state_buffer import StateBuffer -from utils.network import DeepQNetwork - +import utils +''' dont exist yet in git +import ReplayMemory +import StateBuffer +''' +import Model def get_train_args(): train_params = argparse.ArgumentParser() @@ -81,27 +81,28 @@ def update_target_network(from_scope, to_scope): # Create environment env = gym_super_mario_bros.make(args.env) - num_actions = env.action_space.n + num_actions = 7 # Initialise replay memory and state buffer replay_mem = ReplayMemory(args) state_buf = StateBuffer(args) # Define input placeholders - state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) - action_ph = tf.placeholder(tf.int32, (None)) - target_ph = tf.placeholder(tf.float32, (None)) + #state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) + #action_ph = tf.placeholder(tf.int32, (None)) + #target_ph = tf.placeholder(tf.float32, (None)) # Instantiate DQN network - DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and + #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) + DQN = Model(240, 256) DQN_predict_op = DQN.predict() DQN_train_step_op = DQN.train_step() # Instantiate DQN target network - DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') + #DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') - update_target_op = update_target_network('DQN_main', 'DQN_target') + #update_target_op = update_target_network('DQN_main', 'DQN_target') # Create session config = tf.ConfigProto(allow_soft_placement=True) @@ -142,7 +143,7 @@ def update_target_network(from_scope, to_scope): sess.run(update_target_op) - ## Begin training + ## Begin training # THIS IS WHERE I AM AT LOOKY HERE TIM WHEN YOU GO env.reset() @@ -219,7 +220,7 @@ def update_target_network(from_scope, to_scope): # Get minibatch from replay mem states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() # Calculate target by passing next states through the target network and finding max future Q - future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) + #future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) max_future_Q = np.max(future_Q, axis=1) # Q values of the terminal states is 0 by definition max_future_Q[terminals_batch] = 0 From 690cfdddbb5a8c7c654e53e2df67dfc001d1cede Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Tue, 19 Oct 2021 13:17:55 -0400 Subject: [PATCH 4/7] comments and fixes --- src/train.py | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/src/train.py b/src/train.py index f1962e1..a661bcb 100644 --- a/src/train.py +++ b/src/train.py @@ -13,10 +13,8 @@ import time import random import utils -''' dont exist yet in git -import ReplayMemory +import ReplayMemory # doesn't exist 
--------------------------------------------------------- import StateBuffer -''' import Model def get_train_args(): @@ -84,19 +82,19 @@ def update_target_network(from_scope, to_scope): num_actions = 7 # Initialise replay memory and state buffer - replay_mem = ReplayMemory(args) + replay_mem = ReplayMemory(args) # don't exist yet ---------------------------------- state_buf = StateBuffer(args) # Define input placeholders - #state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) - #action_ph = tf.placeholder(tf.int32, (None)) - #target_ph = tf.placeholder(tf.float32, (None)) + state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) + action_ph = tf.placeholder(tf.int32, (None)) + target_ph = tf.placeholder(tf.float32, (None)) # Instantiate DQN network #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) DQN = Model(240, 256) - DQN_predict_op = DQN.predict() + DQN_predict_op = DQN.predict() #talk to rithvik -------------------------------------------------- DQN_train_step_op = DQN.train_step() # Instantiate DQN target network @@ -143,7 +141,7 @@ def update_target_network(from_scope, to_scope): sess.run(update_target_op) - ## Begin training # THIS IS WHERE I AM AT LOOKY HERE TIM WHEN YOU GO + ## Begin training env.reset() @@ -152,9 +150,7 @@ def update_target_network(from_scope, to_scope): episode_rewards = [] duration_values = [] - # Initially populate replay memory by taking random actions - sys.stdout.write('\nPopulating replay memory with random actions...\n') - sys.stdout.flush() + # Initially populate replay memory by taking random actions for random_step in range(1, args.initial_replay_mem_size+1): @@ -163,21 +159,21 @@ def update_target_network(from_scope, to_scope): else: env.render(mode='rgb_array') - action = env.action_space.sample() + action = env.action_space.sample() #get an action ------------------------------------------------------ frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, args.frame_width, args.frame_height) + frame = preprocess_image(frame, args.frame_width, args.frame_height) #should be function from utils --------------- replay_mem.add(action, reward, frame, terminal) if terminal: env.reset() - sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size)) - sys.stdout.flush() + #sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size)) + #sys.stdout.flush() # Begin training process - reset_env_and_state_buffer(env, state_buf, args) - sys.stdout.write('\n\nTraining...\n\n') - sys.stdout.flush() + reset_env_and_state_buffer(env, state_buf, args) #should be function from utils ----------------------------- + #sys.stdout.write('\n\nTraining...\n\n') + #sys.stdout.flush() for train_step in range(start_step+1, args.num_steps_train+1): start_time = time.time() @@ -202,7 +198,7 @@ def update_target_network(from_scope, to_scope): # Take action and store experience frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, args.frame_width, args.frame_height) + frame = preprocess_image(frame, args.frame_width, args.frame_height) # again utils ------------------- state_buf.add(frame) replay_mem.add(action, reward, 
frame, terminal) episode_reward += reward @@ -214,14 +210,14 @@ def update_target_network(from_scope, to_scope): episode_reward = 0 ep_steps = 0 # Reset environment and state buffer for next episode - reset_env_and_state_buffer(env, state_buf, args) + reset_env_and_state_buffer(env, state_buf, args) #utilsssss -------------------- ## Training step # Get minibatch from replay mem states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() # Calculate target by passing next states through the target network and finding max future Q #future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) - max_future_Q = np.max(future_Q, axis=1) + max_future_Q = np.max(future_Q, axis=1) #actually this one i don't know if should be utils or not ------------------ # Q values of the terminal states is 0 by definition max_future_Q[terminals_batch] = 0 targets = rewards_batch + (max_future_Q*args.discount_rate) @@ -240,15 +236,15 @@ def update_target_network(from_scope, to_scope): # Update target networks if train_step % args.update_target_step == 0: - sess.run(update_target_op) + sess.run(update_target_op) # i'm not sure where this comes from-------------------------- # Calculate time per step and display progress to console duration = time.time() - start_time duration_values.append(duration) ave_duration = sum(duration_values)/float(len(duration_values)) - sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, args.num_steps_train, ave_duration)) - sys.stdout.flush() + #sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, args.num_steps_train, ave_duration)) + #sys.stdout.flush() # Save checkpoint if train_step % args.save_ckpt_step == 0: From 2848ef9b80c43158196338110b7c0f70ced4ee76 Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Fri, 22 Oct 2021 12:55:55 -0400 Subject: [PATCH 5/7] friday grind --- src/train.py | 433 ++++++++++++++++++++++----------------------- test/test_train.py | 19 ++ 2 files changed, 231 insertions(+), 221 deletions(-) create mode 100644 test/test_train.py diff --git a/src/train.py b/src/train.py index a661bcb..ffe9751 100644 --- a/src/train.py +++ b/src/train.py @@ -17,246 +17,237 @@ import StateBuffer import Model -def get_train_args(): - train_params = argparse.ArgumentParser() - - # Environment parameters - train_params.add_argument("--env", type=str, default='BreakoutDeterministic-v4', help="Environment to use (must have RGB image state space and discrete action space)") - train_params.add_argument("--render", type=bool, default=False, help="Whether or not to display the environment on the screen during training") - train_params.add_argument("--random_seed", type=int, default=1234, help="Random seed for reproducability") - train_params.add_argument("--frame_width", type=int, default=105, help="Frame width after resize.") - train_params.add_argument("--frame_height", type=int, default=80, help="Frame height after resize.") - train_params.add_argument("--frames_per_state", type=int, default=4, help="Sequence of frames which constitutes a single state.") - - # Training parameters - train_params.add_argument("--num_steps_train", type=int, default=50000000, help="Number of steps to train for") - train_params.add_argument("--train_frequency", type=int, default=4, help="Perform training step every N game steps.") - train_params.add_argument("--max_ep_steps", type=int, default=2000, help="Maximum number of steps per episode") - train_params.add_argument("--batch_size", 
type=int, default=32) - train_params.add_argument("--learning_rate", type=float, default=0.00025) - train_params.add_argument("--replay_mem_size", type=int, default=1000000, help="Maximum size of replay memory buffer") - train_params.add_argument("--initial_replay_mem_size", type=int, default=50000, help="Initial size of replay memory (populated by random actions) before learning can start") - train_params.add_argument("--epsilon_start", type=float, default=1.0, help="Exploration rate at the beginning of training.") - train_params.add_argument("--epsilon_end", type=float, default=0.1, help="Exploration rate at the end of decay.") - train_params.add_argument("--epsilon_step_end", type=int, default=1000000, help="After how many steps to stop decaying the exploration rate.") - train_params.add_argument("--discount_rate", type=float, default=0.99, help="Discount rate (gamma) for future rewards.") - train_params.add_argument("--update_target_step", type=float, default=10000, help="Copy current network parameters to target network every N steps.") - train_params.add_argument("--save_ckpt_step", type=float, default=250000, help="Save checkpoint every N steps") - train_params.add_argument("--save_log_step", type=int, default=1000, help="Save logs every N steps") - - # Files/directories - train_params.add_argument("--ckpt_dir", type=str, default='./ckpts', help="Directory for saving/loading checkpoints") - train_params.add_argument("--ckpt_file", type=str, default=None, help="Checkpoint file to load and resume training from (if None, train from scratch)") - train_params.add_argument("--log_dir", type=str, default='./logs/train', help="Directory for saving logs") - - return train_params.parse_args() - - -def train(args): - - # Function to return exploration rate based on current step - def exploration_rate(current_step, exp_rate_start, exp_rate_end, exp_step_end): - if current_step < exp_step_end: - exploration_rate = current_step * ((exp_rate_end-exp_rate_start)/(float(exp_step_end))) + 1 - else: - exploration_rate = exp_rate_end +class Train(): + + def __init__(env='SuperMarioBros-1-1-v0', render=False, random_seed=1234, frame_width=240, frame_height=256, + frames_per_state=4, num_steps_train = 50000000, train_frequency=4, max_ep_steps=2000, batch_size=32, + learning_rate=.00025, replay_mem_size=1000000, intitial_replay_mem_size=50000, epsilon_start=1.0, + epsilon_end=0.1, epsilon_step_end=1000000, discount_rate=0.99, update_target_step=10000, + save_ckpt_step=250000, save_log_step=1000, ckpt_dir='./ckpts', ckpt_file=None, log_dir='./logs/train'): + self.env = env + self.render = render + self.random_seed = random_seed + self.frame_width = frame_width + self.frame_height = frame_height + self.frames_per_state = frames_per_state + self.num_steps_train = num_steps_train + self.train_frequency = train_frequency + self.max_ep_steps = max_ep_steps + self.batch_size = batch_size + self.learning_rate = learning_rate + self.replay_mem_size = replay_mem_size + self.initial_replay_mem_size = intitial_replay_mem_size + self.epsilon_start = epsilon_start + self.epsilon_end = epsilon_end + self.epsilon_step_end = epsilon_step_end + self.discount_rate = discount_rate + self.update_target_step = update_target_step + self.save_ckpt_step = save_ckpt_step + self.save_log_step = save_log_step + self.ckpt_dir = ckpt_dir + self.ckpt_file = ckpt_file + self.log_dir = log_dir + + def train(self): + + # Function to return exploration rate based on current step + def exploration_rate(current_step, exp_rate_start, 
exp_rate_end, exp_step_end): + if current_step < exp_step_end: + exploration_rate = current_step * ((exp_rate_end-exp_rate_start)/(float(exp_step_end))) + 1 + else: + exploration_rate = exp_rate_end + + return exploration_rate + + # Function to update target network parameters with main network parameters + def update_target_network(from_scope, to_scope): + from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) + to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) + + op_holder = [] - return exploration_rate - - # Function to update target network parameters with main network parameters - def update_target_network(from_scope, to_scope): - from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) - to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) - - op_holder = [] + # Update old network parameters with new network parameters + for from_var,to_var in zip(from_vars,to_vars): + op_holder.append(to_var.assign(from_var)) + + return op_holder - # Update old network parameters with new network parameters - for from_var,to_var in zip(from_vars,to_vars): - op_holder.append(to_var.assign(from_var)) - return op_holder - - - # Create environment - env = gym_super_mario_bros.make(args.env) - num_actions = 7 - - # Initialise replay memory and state buffer - replay_mem = ReplayMemory(args) # don't exist yet ---------------------------------- - state_buf = StateBuffer(args) - - # Define input placeholders - state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) - action_ph = tf.placeholder(tf.int32, (None)) - target_ph = tf.placeholder(tf.float32, (None)) - - # Instantiate DQN network - #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. 
cannot name this scope 'DQN' and - # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) - DQN = Model(240, 256) - DQN_predict_op = DQN.predict() #talk to rithvik -------------------------------------------------- - DQN_train_step_op = DQN.train_step() - - # Instantiate DQN target network - #DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') - - #update_target_op = update_target_network('DQN_main', 'DQN_target') + # Create environment + env = gym_super_mario_bros.make(self.env) + num_actions = 7 - # Create session - config = tf.ConfigProto(allow_soft_placement=True) - config.gpu_options.allow_growth = True - sess = tf.Session(config=config) + # Initialise replay memory and state buffer + replay_mem = ReplayMemory(self) # don't exist yet ---------------------------------- + state_buf = StateBuffer(self) - # Add summaries for Tensorboard visualisation - tf.summary.scalar('Loss', DQN.loss) - reward_var = tf.Variable(0.0, trainable=False) - tf.summary.scalar("Episode Reward", reward_var) - epsilon_var = tf.Variable(args.epsilon_start, trainable=False) - tf.summary.scalar("Exploration Rate", epsilon_var) - summary_op = tf.summary.merge_all() + # Define input placeholders + state_ph = tf.placeholder(tf.uint8, (None, self.frame_height, self.frame_width, self.frames_per_state)) + action_ph = tf.placeholder(tf.int32, (None)) + target_ph = tf.placeholder(tf.float32, (None)) - # Define saver for saving model ckpts - model_name = 'model.ckpt' - checkpoint_path = os.path.join(args.ckpt_dir, model_name) - if not os.path.exists(args.ckpt_dir): - os.makedirs(args.ckpt_dir) - saver = tf.train.Saver(max_to_keep=201) - - # Create summary writer to write summaries to disk - if not os.path.exists(args.log_dir): - os.makedirs(args.log_dir) - summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph) - - # Load ckpt file if given - if args.ckpt_file is not None: - loader = tf.train.Saver() #Restore all variables from ckpt - ckpt = args.ckpt_dir + '/' + args.ckpt_file - ckpt_split = ckpt.split('-') - step_str = ckpt_split[-1] - start_step = int(step_str) - loader.restore(sess, ckpt) - else: - start_step = 0 - sess.run(tf.global_variables_initializer()) - sess.run(update_target_op) - + # Instantiate DQN network + #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, self.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. 
cannot name this scope 'DQN' and + # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) + DQN = Model(240, 256) + DQN_predict_op = DQN.predict() #talk to rithvik -------------------------------------------------- + DQN_train_step_op = DQN.train_step() - ## Begin training - - env.reset() - - ep_steps = 0 - episode_reward = 0 - episode_rewards = [] - duration_values = [] - - # Initially populate replay memory by taking random actions - - for random_step in range(1, args.initial_replay_mem_size+1): + # Instantiate DQN target network + #DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') - if args.render: - env.render() - else: - env.render(mode='rgb_array') + #update_target_op = update_target_network('DQN_main', 'DQN_target') + + # Create session + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + sess = tf.Session(config=config) + + # Add summaries for Tensorboard visualisation + tf.summary.scalar('Loss', DQN.loss) + reward_var = tf.Variable(0.0, trainable=False) + tf.summary.scalar("Episode Reward", reward_var) + epsilon_var = tf.Variable(self.epsilon_start, trainable=False) + tf.summary.scalar("Exploration Rate", epsilon_var) + summary_op = tf.summary.merge_all() + + # Define saver for saving model ckpts + model_name = 'model.ckpt' + checkpoint_path = os.path.join(self.ckpt_dir, model_name) + if not os.path.exists(self.ckpt_dir): + os.makedirs(self.ckpt_dir) + saver = tf.train.Saver(max_to_keep=201) - action = env.action_space.sample() #get an action ------------------------------------------------------ - frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, args.frame_width, args.frame_height) #should be function from utils --------------- - replay_mem.add(action, reward, frame, terminal) + # Create summary writer to write summaries to disk + if not os.path.exists(self.log_dir): + os.makedirs(self.log_dir) + summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph) - if terminal: - env.reset() + # Load ckpt file if given + if self.ckpt_file is not None: + loader = tf.train.Saver() #Restore all variables from ckpt + ckpt = self.ckpt_dir + '/' + self.ckpt_file + ckpt_split = ckpt.split('-') + step_str = ckpt_split[-1] + start_step = int(step_str) + loader.restore(sess, ckpt) + else: + start_step = 0 + sess.run(tf.global_variables_initializer()) + sess.run(update_target_op) + + + ## Begin training - #sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size)) - #sys.stdout.flush() - - # Begin training process - reset_env_and_state_buffer(env, state_buf, args) #should be function from utils ----------------------------- - #sys.stdout.write('\n\nTraining...\n\n') - #sys.stdout.flush() - - for train_step in range(start_step+1, args.num_steps_train+1): - start_time = time.time() - # Run 'train_frequency' iterations in the game for every training step - for _ in range(0, args.train_frequency): - ep_steps += 1 + env.reset() + + ep_steps = 0 + episode_reward = 0 + episode_rewards = [] + duration_values = [] + + # Initially populate replay memory by taking random actions + + for random_step in range(1, self.initial_replay_mem_size+1): - if args.render: + if self.render: env.render() else: env.render(mode='rgb_array') - # Use an epsilon-greedy policy to select action - epsilon = exploration_rate(train_step, args.epsilon_start, args.epsilon_end, args.epsilon_step_end) - if random.random() < epsilon: - #Choose random action 
- action = env.action_space.sample() - else: - #Choose action with highest Q-value according to network's current policy - current_state = np.expand_dims(state_buf.get_state(), 0) - action = sess.run(DQN_predict_op, {state_ph:current_state}) - - # Take action and store experience + action = env.action_space.sample() #get an action ------------------------------------------------------ frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, args.frame_width, args.frame_height) # again utils ------------------- - state_buf.add(frame) - replay_mem.add(action, reward, frame, terminal) - episode_reward += reward + frame = preprocess_image(frame, self.frame_width, self.frame_height) #should be function from utils --------------- + replay_mem.add(action, reward, frame, terminal) - if terminal or ep_steps == args.max_ep_steps: - # Collect total reward of episode - episode_rewards.append(episode_reward) - # Reset episode reward and episode steps counters - episode_reward = 0 - ep_steps = 0 - # Reset environment and state buffer for next episode - reset_env_and_state_buffer(env, state_buf, args) #utilsssss -------------------- - - ## Training step - # Get minibatch from replay mem - states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() - # Calculate target by passing next states through the target network and finding max future Q - #future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) - max_future_Q = np.max(future_Q, axis=1) #actually this one i don't know if should be utils or not ------------------ - # Q values of the terminal states is 0 by definition - max_future_Q[terminals_batch] = 0 - targets = rewards_batch + (max_future_Q*args.discount_rate) - - # Execute training step - if train_step % args.save_log_step == 0: - # Train and save logs - average_reward = sum(episode_rewards)/len(episode_rewards) - summary_str, _ = sess.run([summary_op, DQN_train_step_op], {state_ph:states_batch, action_ph:actions_batch, target_ph:targets, reward_var: average_reward, epsilon_var: epsilon}) - summary_writer.add_summary(summary_str, train_step) - # Reset rewards buffer - episode_rewards = [] - else: - # Just train - _ = sess.run(DQN_train_step_op, {state_ph:states_batch, action_ph:actions_batch, target_ph:targets}) - - # Update target networks - if train_step % args.update_target_step == 0: - sess.run(update_target_op) # i'm not sure where this comes from-------------------------- - - # Calculate time per step and display progress to console - duration = time.time() - start_time - duration_values.append(duration) - ave_duration = sum(duration_values)/float(len(duration_values)) + if terminal: + env.reset() + + #sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, self.initial_replay_mem_size)) + #sys.stdout.flush() - #sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, args.num_steps_train, ave_duration)) - #sys.stdout.flush() + # Begin training process + reset_env_and_state_buffer(env, state_buf, None) #should be function from utils ----------------------------- REPLACE NONE LATER + #sys.stdout.write('\n\nTraining...\n\n') + #sys.stdout.flush() - # Save checkpoint - if train_step % args.save_ckpt_step == 0: - saver.save(sess, checkpoint_path, global_step=train_step) - sys.stdout.write('\n Checkpoint saved\n') - sys.stdout.flush() + for train_step in range(start_step+1, self.num_steps_train+1): + start_time = time.time() + # Run 'train_frequency' iterations in the game for every 
training step + for _ in range(0, self.train_frequency): + ep_steps += 1 + + if self.render: + env.render() + else: + env.render(mode='rgb_array') + + # Use an epsilon-greedy policy to select action + epsilon = exploration_rate(train_step, self.epsilon_start, self.epsilon_end, self.epsilon_step_end) + if random.random() < epsilon: + #Choose random action + action = env.action_space.sample() + else: + #Choose action with highest Q-value according to network's current policy + current_state = np.expand_dims(state_buf.get_state(), 0) + action = sess.run(DQN_predict_op, {state_ph:current_state}) + + # Take action and store experience + frame, reward, terminal, _ = env.step(action) + frame = preprocess_image(frame, self.frame_width, self.frame_height) # again utils ------------------- + state_buf.add(frame) + replay_mem.add(action, reward, frame, terminal) + episode_reward += reward + + if terminal or ep_steps == self.max_ep_steps: + # Collect total reward of episode + episode_rewards.append(episode_reward) + # Reset episode reward and episode steps counters + episode_reward = 0 + ep_steps = 0 + # Reset environment and state buffer for next episode + reset_env_and_state_buffer(env, state_buf, self) #utilsssss -------------------- - # Reset time calculation - duration_values = [] + ## Training step + # Get minibatch from replay mem + states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() + # Calculate target by passing next states through the target network and finding max future Q + #future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) + max_future_Q = np.max(future_Q, axis=1) #actually this one i don't know if should be utils or not ------------------ + # Q values of the terminal states is 0 by definition + max_future_Q[terminals_batch] = 0 + targets = rewards_batch + (max_future_Q*self.discount_rate) - - -if __name__ == '__main__': - train_args = get_train_args() - train(train_args) \ No newline at end of file + # Execute training step + if train_step % self.save_log_step == 0: + # Train and save logs + average_reward = sum(episode_rewards)/len(episode_rewards) + summary_str, _ = sess.run([summary_op, DQN_train_step_op], {state_ph:states_batch, action_ph:actions_batch, target_ph:targets, reward_var: average_reward, epsilon_var: epsilon}) + summary_writer.add_summary(summary_str, train_step) + # Reset rewards buffer + episode_rewards = [] + else: + # Just train + _ = sess.run(DQN_train_step_op, {state_ph:states_batch, action_ph:actions_batch, target_ph:targets}) + + # Update target networks + if train_step % self.update_target_step == 0: + sess.run(update_target_op) # i'm not sure where this comes from-------------------------- + + # Calculate time per step and display progress to console + duration = time.time() - start_time + duration_values.append(duration) + ave_duration = sum(duration_values)/float(len(duration_values)) + + #sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, self.num_steps_train, ave_duration)) + #sys.stdout.flush() + + # Save checkpoint + if train_step % self.save_ckpt_step == 0: + saver.save(sess, checkpoint_path, global_step=train_step) + sys.stdout.write('\n Checkpoint saved\n') + sys.stdout.flush() + + # Reset time calculation + duration_values = [] + \ No newline at end of file diff --git a/test/test_train.py b/test/test_train.py new file mode 100644 index 0000000..02d4c0d --- /dev/null +++ b/test/test_train.py @@ -0,0 +1,19 @@ +import os + +from src.Model import Model 
+from src.train import Train + +def test_train_init(): + model = Model(100,50) + trainer = Train(env='SuperMarioBros-1-1-v0') + assert(trainer.env == 'SuperMarioBros-1-1-v0') + assert(trainer.frame_width == 240) + assert(trainer.frame_height == 256) + +def test_train_train(): + model = Model(100,50) + trainer = Train(env='SuperMarioBros-1-1-v0') + trainer.train() + assert(os.path.exists('./ckpts')) + + \ No newline at end of file From d9428916bd55ef15b278ba63c3d46a313685f5b4 Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Mon, 25 Oct 2021 10:39:03 -0400 Subject: [PATCH 6/7] why no worky --- src/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/train.py b/src/train.py index ffe9751..4fb05b2 100644 --- a/src/train.py +++ b/src/train.py @@ -12,10 +12,10 @@ import numpy as np import time import random -import utils -import ReplayMemory # doesn't exist --------------------------------------------------------- +import ReplayMemory import StateBuffer import Model +import utils class Train(): @@ -90,7 +90,7 @@ def update_target_network(from_scope, to_scope): #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, self.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) DQN = Model(240, 256) - DQN_predict_op = DQN.predict() #talk to rithvik -------------------------------------------------- + DQN_predict_op = DQN.predict() DQN_train_step_op = DQN.train_step() # Instantiate DQN target network From 6d7897ddbd95dabd3c02db223c769276b9ea0ba5 Mon Sep 17 00:00:00 2001 From: Timothy Clifford Date: Fri, 5 Nov 2021 12:15:57 -0400 Subject: [PATCH 7/7] why not --- src/train.py | 456 +++++++++++++++++++++++++++------------------------ 1 file changed, 238 insertions(+), 218 deletions(-) diff --git a/src/train.py b/src/train.py index 4fb05b2..41ef59b 100644 --- a/src/train.py +++ b/src/train.py @@ -1,253 +1,273 @@ ''' ## Train ## -# Adapted from code to train Deep Q Network on OpenAI Gym environments +# Code to train Deep Q Network on OpenAI Gym environments @author: Mark Sinton (msinto93@gmail.com) ''' import os import sys import argparse -import gym_super_mario_bros +import gym import tensorflow as tf import numpy as np import time import random -import ReplayMemory -import StateBuffer -import Model -import utils - -class Train(): - def __init__(env='SuperMarioBros-1-1-v0', render=False, random_seed=1234, frame_width=240, frame_height=256, - frames_per_state=4, num_steps_train = 50000000, train_frequency=4, max_ep_steps=2000, batch_size=32, - learning_rate=.00025, replay_mem_size=1000000, intitial_replay_mem_size=50000, epsilon_start=1.0, - epsilon_end=0.1, epsilon_step_end=1000000, discount_rate=0.99, update_target_step=10000, - save_ckpt_step=250000, save_log_step=1000, ckpt_dir='./ckpts', ckpt_file=None, log_dir='./logs/train'): - self.env = env - self.render = render - self.random_seed = random_seed - self.frame_width = frame_width - self.frame_height = frame_height - self.frames_per_state = frames_per_state - self.num_steps_train = num_steps_train - self.train_frequency = train_frequency - self.max_ep_steps = max_ep_steps - self.batch_size = batch_size - self.learning_rate = learning_rate - self.replay_mem_size = replay_mem_size - self.initial_replay_mem_size = intitial_replay_mem_size - self.epsilon_start = epsilon_start - self.epsilon_end = epsilon_end - self.epsilon_step_end 
= epsilon_step_end - self.discount_rate = discount_rate - self.update_target_step = update_target_step - self.save_ckpt_step = save_ckpt_step - self.save_log_step = save_log_step - self.ckpt_dir = ckpt_dir - self.ckpt_file = ckpt_file - self.log_dir = log_dir - - def train(self): - - # Function to return exploration rate based on current step - def exploration_rate(current_step, exp_rate_start, exp_rate_end, exp_step_end): - if current_step < exp_step_end: - exploration_rate = current_step * ((exp_rate_end-exp_rate_start)/(float(exp_step_end))) + 1 - else: - exploration_rate = exp_rate_end - - return exploration_rate - - # Function to update target network parameters with main network parameters - def update_target_network(from_scope, to_scope): - from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) - to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) - - op_holder = [] - - # Update old network parameters with new network parameters - for from_var,to_var in zip(from_vars,to_vars): - op_holder.append(to_var.assign(from_var)) +from nes_py.wrappers import JoypadSpace +import gym_super_mario_bros +from gym_super_mario_bros.actions import SIMPLE_MOVEMENT + +from utils.utils import preprocess_image, reset_env_and_state_buffer +from utils.experience_replay import ReplayMemory +from utils.state_buffer import StateBuffer +from utils.network import DeepQNetwork + +def get_train_args(args=None): + train_params = argparse.ArgumentParser() + + # Environment parameters + train_params.add_argument("--env", type=str, default='SuperMarioBros-1-1-v0', help="Environment to use (must have RGB image state space and discrete action space)") + train_params.add_argument("--render", type=bool, default=False, help="Whether or not to display the environment on the screen during training") + train_params.add_argument("--random_seed", type=int, default=1234, help="Random seed for reproducability") + train_params.add_argument("--frame_width", type=int, default=105, help="Frame width after resize.") + train_params.add_argument("--frame_height", type=int, default=80, help="Frame height after resize.") + train_params.add_argument("--frames_per_state", type=int, default=4, help="Sequence of frames which constitutes a single state.") + + # Training parameters + train_params.add_argument("--num_steps_train", type=int, default=50000000, help="Number of steps to train for") + train_params.add_argument("--train_frequency", type=int, default=4, help="Perform training step every N game steps.") + train_params.add_argument("--max_ep_steps", type=int, default=2000, help="Maximum number of steps per episode") + train_params.add_argument("--batch_size", type=int, default=32) + train_params.add_argument("--learning_rate", type=float, default=0.00025) + train_params.add_argument("--replay_mem_size", type=int, default=1000000, help="Maximum size of replay memory buffer") + train_params.add_argument("--initial_replay_mem_size", type=int, default=50000, help="Initial size of replay memory (populated by random actions) before learning can start") + train_params.add_argument("--epsilon_start", type=float, default=1.0, help="Exploration rate at the beginning of training.") + train_params.add_argument("--epsilon_end", type=float, default=0.1, help="Exploration rate at the end of decay.") + train_params.add_argument("--epsilon_step_end", type=int, default=1000000, help="After how many steps to stop decaying the exploration rate.") + train_params.add_argument("--discount_rate", type=float, 
default=0.99, help="Discount rate (gamma) for future rewards.") + train_params.add_argument("--update_target_step", type=float, default=10000, help="Copy current network parameters to target network every N steps.") + train_params.add_argument("--save_ckpt_step", type=float, default=250000, help="Save checkpoint every N steps") + train_params.add_argument("--save_log_step", type=int, default=1000, help="Save logs every N steps") + + # Files/directories + train_params.add_argument("--ckpt_dir", type=str, default='./ckpts', help="Directory for saving/loading checkpoints") + train_params.add_argument("--ckpt_file", type=str, default=None, help="Checkpoint file to load and resume training from (if None, train from scratch)") + train_params.add_argument("--log_dir", type=str, default='./logs/train', help="Directory for saving logs") + + return train_params.parse_args(args) + + +def train(args): + + # Function to return exploration rate based on current step + def exploration_rate(current_step, exp_rate_start, exp_rate_end, exp_step_end): + if current_step < exp_step_end: + exploration_rate = current_step * ((exp_rate_end-exp_rate_start)/(float(exp_step_end))) + 1 + else: + exploration_rate = exp_rate_end - return op_holder - - - # Create environment - env = gym_super_mario_bros.make(self.env) - num_actions = 7 + return exploration_rate + + # Function to update target network parameters with main network parameters + def update_target_network(from_scope, to_scope): + from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) + to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) + + op_holder = [] - # Initialise replay memory and state buffer - replay_mem = ReplayMemory(self) # don't exist yet ---------------------------------- - state_buf = StateBuffer(self) + # Update old network parameters with new network parameters + for from_var,to_var in zip(from_vars,to_vars): + op_holder.append(to_var.assign(from_var)) - # Define input placeholders - state_ph = tf.placeholder(tf.uint8, (None, self.frame_height, self.frame_width, self.frames_per_state)) - action_ph = tf.placeholder(tf.int32, (None)) - target_ph = tf.placeholder(tf.float32, (None)) + return op_holder + + + # Create environment + env = gym_super_mario_bros.make(args.env) + env = JoypadSpace(env, SIMPLE_MOVEMENT) + num_actions = env.action_space.n + + # Initialise replay memory and state buffer + replay_mem = ReplayMemory(args) + state_buf = StateBuffer(args) + + # Define input placeholders + state_ph = tf.placeholder(tf.uint8, (None, args.frame_height, args.frame_width, args.frames_per_state)) + action_ph = tf.placeholder(tf.int32, (None)) + target_ph = tf.placeholder(tf.float32, (None)) + + # Instantiate DQN network + DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, args.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and + # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) + DQN_predict_op = DQN.predict() + DQN_train_step_op = DQN.train_step() + + # Instantiate DQN target network + DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') + + update_target_op = update_target_network('DQN_main', 'DQN_target') - # Instantiate DQN network - #DQN = DeepQNetwork(num_actions, state_ph, action_ph, target_ph, self.learning_rate, scope='DQN_main') # Note: One scope cannot be the prefix of another scope (e.g. 
cannot name this scope 'DQN' and - # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars) - DQN = Model(240, 256) - DQN_predict_op = DQN.predict() - DQN_train_step_op = DQN.train_step() + # Create session + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + sess = tf.Session(config=config) - # Instantiate DQN target network - #DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target') + # Add summaries for Tensorboard visualisation + tf.summary.scalar('Loss', DQN.loss) + reward_var = tf.Variable(0.0, trainable=False) + tf.summary.scalar("Episode_Reward", reward_var) + epsilon_var = tf.Variable(args.epsilon_start, trainable=False) + tf.summary.scalar("Exploration_Rate", epsilon_var) + summary_op = tf.summary.merge_all() - #update_target_op = update_target_network('DQN_main', 'DQN_target') - - # Create session - config = tf.ConfigProto(allow_soft_placement=True) - config.gpu_options.allow_growth = True - sess = tf.Session(config=config) - - # Add summaries for Tensorboard visualisation - tf.summary.scalar('Loss', DQN.loss) - reward_var = tf.Variable(0.0, trainable=False) - tf.summary.scalar("Episode Reward", reward_var) - epsilon_var = tf.Variable(self.epsilon_start, trainable=False) - tf.summary.scalar("Exploration Rate", epsilon_var) - summary_op = tf.summary.merge_all() - - # Define saver for saving model ckpts - model_name = 'model.ckpt' - checkpoint_path = os.path.join(self.ckpt_dir, model_name) - if not os.path.exists(self.ckpt_dir): - os.makedirs(self.ckpt_dir) - saver = tf.train.Saver(max_to_keep=201) + # Define saver for saving model ckpts + model_name = 'model.ckpt' + checkpoint_path = os.path.join(args.ckpt_dir, model_name) + if not os.path.exists(args.ckpt_dir): + os.makedirs(args.ckpt_dir) + saver = tf.train.Saver(max_to_keep=201) + + # Create summary writer to write summaries to disk + if not os.path.exists(args.log_dir): + os.makedirs(args.log_dir) + summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph) + + # Load ckpt file if given + if args.ckpt_file is not None: + loader = tf.train.Saver() #Restore all variables from ckpt + ckpt = args.ckpt_dir + '/' + args.ckpt_file + ckpt_split = ckpt.split('-') + step_str = ckpt_split[-1] + start_step = int(step_str) + loader.restore(sess, ckpt) + else: + start_step = 0 + sess.run(tf.global_variables_initializer()) + sess.run(update_target_op) + - # Create summary writer to write summaries to disk - if not os.path.exists(self.log_dir): - os.makedirs(self.log_dir) - summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph) + ## Begin training + + env.reset() + + ep_steps = 0 + episode_reward = 0 + episode_rewards = [] + duration_values = [] + + # Initially populate replay memory by taking random actions + sys.stdout.write('\nPopulating replay memory with random actions...\n') + sys.stdout.flush() + + for random_step in range(1, args.initial_replay_mem_size+1): - # Load ckpt file if given - if self.ckpt_file is not None: - loader = tf.train.Saver() #Restore all variables from ckpt - ckpt = self.ckpt_dir + '/' + self.ckpt_file - ckpt_split = ckpt.split('-') - step_str = ckpt_split[-1] - start_step = int(step_str) - loader.restore(sess, ckpt) + if args.render: + env.render() else: - start_step = 0 - sess.run(tf.global_variables_initializer()) - sess.run(update_target_op) - - - ## Begin training - - env.reset() + env.render(mode='rgb_array') - ep_steps = 0 - episode_reward = 0 - episode_rewards = [] - duration_values = [] 
- - # Initially populate replay memory by taking random actions + action = env.action_space.sample() + frame, reward, terminal, _ = env.step(action) + frame = preprocess_image(frame, args.frame_width, args.frame_height) + replay_mem.add(action, reward, frame, terminal) - for random_step in range(1, self.initial_replay_mem_size+1): + if terminal: + env.reset() + + sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, args.initial_replay_mem_size)) + sys.stdout.flush() + + # Begin training process + reset_env_and_state_buffer(env, state_buf, args) + sys.stdout.write('\n\nTraining...\n\n') + sys.stdout.flush() + + for train_step in range(start_step+1, args.num_steps_train+1): + start_time = time.time() + # Run 'train_frequency' iterations in the game for every training step + for _ in range(0, args.train_frequency): + ep_steps += 1 - if self.render: + if args.render: env.render() else: env.render(mode='rgb_array') - action = env.action_space.sample() #get an action ------------------------------------------------------ + # Use an epsilon-greedy policy to select action + epsilon = exploration_rate(train_step, args.epsilon_start, args.epsilon_end, args.epsilon_step_end) + if random.random() < epsilon: + #print("random :(") + #Choose random action + action = env.action_space.sample() + else: + #print("greedy :)") + #Choose action with highest Q-value according to network's current policy + current_state = np.expand_dims(state_buf.get_state(), 0) + action = sess.run(DQN_predict_op, {state_ph:current_state})[0] + + # Take action and store experience + #print(action) frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, self.frame_width, self.frame_height) #should be function from utils --------------- - replay_mem.add(action, reward, frame, terminal) + + frame = preprocess_image(frame, args.frame_width, args.frame_height) + state_buf.add(frame) + replay_mem.add(action, reward, frame, terminal) + episode_reward += reward - if terminal: - env.reset() - - #sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, self.initial_replay_mem_size)) - #sys.stdout.flush() + if terminal or ep_steps == args.max_ep_steps: + # Collect total reward of episode + episode_rewards.append(episode_reward) + # Reset episode reward and episode steps counters + episode_reward = 0 + ep_steps = 0 + # Reset environment and state buffer for next episode + reset_env_and_state_buffer(env, state_buf, args) - # Begin training process - reset_env_and_state_buffer(env, state_buf, None) #should be function from utils ----------------------------- REPLACE NONE LATER - #sys.stdout.write('\n\nTraining...\n\n') - #sys.stdout.flush() + ## Training step + # Get minibatch from replay mem + states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() + # Calculate target by passing next states through the target network and finding max future Q + future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) + max_future_Q = np.max(future_Q, axis=1) + # Q values of the terminal states is 0 by definition + max_future_Q[terminals_batch] = 0 + targets = rewards_batch + (max_future_Q*args.discount_rate) - for train_step in range(start_step+1, self.num_steps_train+1): - start_time = time.time() - # Run 'train_frequency' iterations in the game for every training step - for _ in range(0, self.train_frequency): - ep_steps += 1 - - if self.render: - env.render() - else: - env.render(mode='rgb_array') - - # Use an epsilon-greedy policy to select action - 
epsilon = exploration_rate(train_step, self.epsilon_start, self.epsilon_end, self.epsilon_step_end) - if random.random() < epsilon: - #Choose random action - action = env.action_space.sample() - else: - #Choose action with highest Q-value according to network's current policy - current_state = np.expand_dims(state_buf.get_state(), 0) - action = sess.run(DQN_predict_op, {state_ph:current_state}) - - # Take action and store experience - frame, reward, terminal, _ = env.step(action) - frame = preprocess_image(frame, self.frame_width, self.frame_height) # again utils ------------------- - state_buf.add(frame) - replay_mem.add(action, reward, frame, terminal) - episode_reward += reward - - if terminal or ep_steps == self.max_ep_steps: - # Collect total reward of episode - episode_rewards.append(episode_reward) - # Reset episode reward and episode steps counters - episode_reward = 0 - ep_steps = 0 - # Reset environment and state buffer for next episode - reset_env_and_state_buffer(env, state_buf, self) #utilsssss -------------------- - - ## Training step - # Get minibatch from replay mem - states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch() - # Calculate target by passing next states through the target network and finding max future Q - #future_Q = sess.run(DQN_target.output, {state_ph:next_states_batch}) - max_future_Q = np.max(future_Q, axis=1) #actually this one i don't know if should be utils or not ------------------ - # Q values of the terminal states is 0 by definition - max_future_Q[terminals_batch] = 0 - targets = rewards_batch + (max_future_Q*self.discount_rate) - - # Execute training step - if train_step % self.save_log_step == 0: - # Train and save logs - average_reward = sum(episode_rewards)/len(episode_rewards) - summary_str, _ = sess.run([summary_op, DQN_train_step_op], {state_ph:states_batch, action_ph:actions_batch, target_ph:targets, reward_var: average_reward, epsilon_var: epsilon}) - summary_writer.add_summary(summary_str, train_step) - # Reset rewards buffer - episode_rewards = [] - else: - # Just train - _ = sess.run(DQN_train_step_op, {state_ph:states_batch, action_ph:actions_batch, target_ph:targets}) - - # Update target networks - if train_step % self.update_target_step == 0: - sess.run(update_target_op) # i'm not sure where this comes from-------------------------- - - # Calculate time per step and display progress to console - duration = time.time() - start_time - duration_values.append(duration) - ave_duration = sum(duration_values)/float(len(duration_values)) + # Execute training step + if train_step % args.save_log_step == 0: + # Train and save logs + average_reward = sum(episode_rewards)/len(episode_rewards) + summary_str, _ = sess.run([summary_op, DQN_train_step_op], {state_ph:states_batch, action_ph:actions_batch, target_ph:targets, reward_var: average_reward, epsilon_var: epsilon}) + summary_writer.add_summary(summary_str, train_step) + # Reset rewards buffer + episode_rewards = [] + else: + # Just train + _ = sess.run(DQN_train_step_op, {state_ph:states_batch, action_ph:actions_batch, target_ph:targets}) + + # Update target networks + if train_step % args.update_target_step == 0: + sess.run(update_target_op) + + # Calculate time per step and display progress to console + duration = time.time() - start_time + duration_values.append(duration) + ave_duration = sum(duration_values)/float(len(duration_values)) + + sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, 
args.num_steps_train, ave_duration)) + sys.stdout.flush() + + # Save checkpoint + if train_step % args.save_ckpt_step == 0: + saver.save(sess, checkpoint_path, global_step=train_step) + sys.stdout.write('\n Checkpoint saved\n') + sys.stdout.flush() - #sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(train_step, self.num_steps_train, ave_duration)) - #sys.stdout.flush() + # Reset time calculation + duration_values = [] - # Save checkpoint - if train_step % self.save_ckpt_step == 0: - saver.save(sess, checkpoint_path, global_step=train_step) - sys.stdout.write('\n Checkpoint saved\n') - sys.stdout.flush() - - # Reset time calculation - duration_values = [] - \ No newline at end of file + + +if __name__ == '__main__': + train_args = get_train_args() + train(train_args) \ No newline at end of file
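
For quick reference, this is the environment setup the final patch lands on, pulled out as a standalone sketch: gym_super_mario_bros wrapped in JoypadSpace with SIMPLE_MOVEMENT, which is why env.action_space.n now yields the 7 actions that the intermediate patches hardcoded. Only the random-step loop around it is illustrative.

import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

# Same construction as in [PATCH 7/7]: the raw env wrapped so the action
# space is the 7 SIMPLE_MOVEMENT button combinations.
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
print(env.action_space.n)  # 7

# Illustrative smoke test: a few random steps, resetting on terminal.
env.reset()
for _ in range(10):
    frame, reward, terminal, info = env.step(env.action_space.sample())
    if terminal:
        env.reset()
env.close()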
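
The two pieces of arithmetic the training loop relies on — the linear epsilon decay and the Bellman targets built from the target network's Q-values — also work as a plain NumPy sketch, independent of the TensorFlow graph. The function names here are illustrative rather than taken from the patches, and the decay intercept is written as eps_start instead of the hardcoded 1, which is equivalent only while epsilon_start is 1.0.

import numpy as np

def exploration_rate(step, eps_start, eps_end, eps_step_end):
    # Linear decay from eps_start to eps_end over eps_step_end steps,
    # then held constant at eps_end.
    if step < eps_step_end:
        return eps_start + step * (eps_end - eps_start) / float(eps_step_end)
    return eps_end

def q_targets(rewards, future_q, terminals, discount_rate=0.99):
    # Bellman targets r + gamma * max_a' Q_target(s', a'); terminal
    # transitions contribute no future value, as in the training step.
    max_future_q = np.max(future_q, axis=1)
    max_future_q[terminals] = 0.0
    return rewards + discount_rate * max_future_q

# Example with a minibatch of 2 transitions and 7 actions.
print(exploration_rate(500000, 1.0, 0.1, 1000000))          # 0.55
print(q_targets(np.array([1.0, 0.0]),
                np.arange(14, dtype=float).reshape(2, 7),
                np.array([False, True])))                   # [6.94 0.  ]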