diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index 6949414..f45ef80 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -33,8 +33,8 @@ import numpy as np # pylint: disable=import-error import tensorflow as tf # pylint: disable=import-error import tensorflow_probability as tfp # pylint: disable=import-error -from trajectory_buffer import Buffer -from networks import Networks +from trajectory_buffer import Memory +from networks import Models ################################################################################ # Variables @@ -57,16 +57,16 @@ MODE_CHANNEL_NAME = "MODE" CMD_ID_SET_READY_STATE = 1 CMD_ID_SET_TRAINING_STATE = 2 -POSITION_DATA = [-0.24713614078815466, 0.01, 0.013994298332013683] +POSITION_DATA = [-0.24713614078815466, -0.04863962992854465, 0.013994298332013683] ORIENTATION_DATA = [ -1.0564747468923541e-06, 8.746699709178704e-07, 0.9999999999990595, - 1.5880805820884731, + 1.5880805820884731 ] MAX_SENSOR_VALUE = 1000 -MIN_STD_DEV = 0.1 # Minimum standard deviation -STD_DEV_FACTOR = 0.9995 # Discounter standard deviation factor +MIN_STD_DEV = 0.01 # Minimum standard deviation +STD_DEV_FACTOR = 0.995 # Discounter standard deviation factor ################################################################################ # Classes @@ -74,15 +74,18 @@ class Agent: # pylint: disable=too-many-instance-attributes - """The Agent class represents an intelligent agent that makes decisions to - control motors based on the position of the robot.""" + """ + The Agent class represents an intelligent agent that makes decisions to + control motors based on the position of the robot. + """ # pylint: disable=too-many-arguments def __init__( self, smp_server, gamma=0.99, - alpha=0.0003, + actor_alpha=0.0001, + critic_alpha=0.0003, gae_lambda=0.95, policy_clip=0.2, batch_size=64, @@ -91,17 +94,16 @@ def __init__( max_buffer_length=65536, ): self.__serialmux = smp_server - self.__alpha = alpha self.__policy_clip = policy_clip self.__chkpt_dir = chkpt_dir self.train_mode = False self.__top_speed = top_speed - self.__buffer = Buffer(batch_size, max_buffer_length, gamma, gae_lambda) - self.__neural_network = Networks(self.__alpha) + self.__memory = Memory(batch_size, max_buffer_length, gamma, gae_lambda) + self.__neural_network = Models(actor_alpha, critic_alpha) self.__training_index = 0 # Track batch index during training self.__current_batch = None # Saving of the current batch which is in process - self.__std_dev = 0.9 - self.n_epochs = 10 + self.__std_dev = 0.05 + self.n_epochs = 3 self.done = False self.action = None self.value = None @@ -139,7 +141,7 @@ def store_transition( reward: The reward received. done: Indicating whether the target sequence has been reached. """ - self.__buffer.store_memory(state, action, probs, value, reward, done) + self.__memory.store_memory(state, action, probs, value, reward, done) def save_models(self): """Saves the models in the specified file.""" @@ -250,7 +252,7 @@ def update(self, robot_node): # Checks whether the sequence has ended if it is set to Training mode. if (self.train_mode is True) and ( - (self.done is True) or (self.__buffer.is_memory_full() is True) + (self.done is True) or (self.__memory.is_memory_full() is True) ): cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE) self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) @@ -277,11 +279,13 @@ def update(self, robot_node): if self.data_sent is False: self.unsent_data.append(("SPEED_SET", motorcontrol)) + self.reinitialize(robot_node) self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) # Failed to send data. Appends the data to unsent_data List if self.data_sent is False: self.unsent_data.append(("CMD", cmd_payload)) + self.state = "IDLE" def normalize_sensor_data(self, sensor_data): """ @@ -314,29 +318,26 @@ def determine_reward(self, sensor_data): float: the Resulting Reward """ - reward = self.__buffer.calculate_reward(sensor_data) + reward = self.__memory.calculate_reward(sensor_data) return reward # pylint: disable=too-many-arguments # pylint: disable=too-many-locals - def learn(self, states, actions, old_probs, values, rewards, dones): + def learn(self, states, actions, old_probs, values, rewards, advantages): """ Perform training to optimize model weights. Parameters ---------- - states: The saved states observed during interactions with the environment. - actions: The saved actions taken in response to the observed states. - old_probs: The saved probabilities of the actions taken, based on the previous policy. - values: The saved estimated values of the observed states. - rewards: The saved rewards received for taking the actions. - dones: The saved flags indicating whether the target sequence or episode has - been completed. + states: The saved states observed during interactions with the environment. + actions: The saved actions taken in response to the observed states. + old_probs: The saved probabilities of the actions taken, based on the previous policy. + values: The saved estimated values of the observed states. + rewards: The saved rewards received for taking the actions. + advantages: the computed advantage values for each state in a given Data size. """ - for _ in range(self.n_epochs): - #the computed advantage values for each state in a given batch of experiences. - advantages = self.__buffer.calculate_advantages(rewards, values, dones) + for _ in range(self.n_epochs): # optimize Actor Network weights with tf.GradientTape() as tape: @@ -407,8 +408,8 @@ def learn(self, states, actions, old_probs, values, rewards, dones): self.critic_loss_history.append(critic_loss.numpy()) self.reward_history.append(sum(rewards)) - # saving logs in a CSV file - self.save_logs_to_csv() + # saving logs in a CSV file + self.save_logs_to_csv() def save_logs_to_csv(self): """Function for saving logs in a CSV file""" @@ -436,7 +437,7 @@ def perform_training(self): if self.__current_batch is None: # Grab sample from memory - self.__current_batch = self.__buffer.generate_batches() + self.__current_batch = self.__memory.generate_batches() # Perform training with mini batches. if self.__training_index < len(self.__current_batch[-1]): @@ -446,18 +447,19 @@ def perform_training(self): old_prob_arr, vals_arr, reward_arr, - dones_arr, + advatage_arr, batches, ) = self.__current_batch batch = batches[self.__training_index] + # pylint: disable=too-many-arguments self.learn( state_arr[batch], action_arr[batch], old_prob_arr[batch], vals_arr[batch], reward_arr[batch], - dones_arr[batch], + advatage_arr[batch] ) self.__training_index += 1 @@ -466,7 +468,7 @@ def perform_training(self): self.__training_index = 0 self.__current_batch = None self.done = False - self.__buffer.clear_memory() + self.__memory.clear_memory() self.state = "IDLE" self.num_episodes += 1 cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE) diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py index f362459..1535330 100644 --- a/webots/controllers/RL_Supervisor/networks.py +++ b/webots/controllers/RL_Supervisor/networks.py @@ -28,9 +28,9 @@ # Imports ################################################################################ -import tensorflow as tf # pylint: disable=import-error -from tensorflow.keras import layers # pylint: disable=import-error -from tensorflow.keras.regularizers import l2 # pylint: disable=import-error +from tensorflow import keras # pylint: disable=import-error +from keras import layers # pylint: disable=import-error +from keras.regularizers import l2 # pylint: disable=import-error ################################################################################ # Variables @@ -42,15 +42,16 @@ ################################################################################ -class Networks: - """Class for building networks of actors and critics""" +class Models: + """Class for building networks of actors and critics.""" - def __init__(self, alpha): - self.__learning_rate = alpha + def __init__(self, actor_alpha, critic_alpha): + self.__actor_learning_rate = actor_alpha + self.__critic_learning_rate = critic_alpha self.actor_network = self.build_actor_network() self.critic_network = self.build_critic_network() - self.actor_optimizer = tf.keras.optimizers.Adam(self.__learning_rate) - self.critic_optimizer = tf.keras.optimizers.Adam(self.__learning_rate) + self.actor_optimizer = keras.optimizers.Adam(self.__actor_learning_rate) + self.critic_optimizer = keras.optimizers.Adam(self.__critic_learning_rate) def build_actor_network(self): """Build Actor Network.""" @@ -84,7 +85,7 @@ def build_actor_network(self): bias_initializer="zeros", )(fc3) - return tf.keras.models.Model(inputs=state_input, outputs=mean) + return keras.models.Model(inputs=state_input, outputs=mean) def build_critic_network(self): """Build Critic Network""" @@ -113,7 +114,7 @@ def build_critic_network(self): )(fc2) value = layers.Dense(1)(fc3) # Value output - return tf.keras.models.Model(inputs=state_input, outputs=value) + return keras.models.Model(inputs=state_input, outputs=value) ################################################################################ diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py index f5f7ac8..b620956 100644 --- a/webots/controllers/RL_Supervisor/rl_supervisor.py +++ b/webots/controllers/RL_Supervisor/rl_supervisor.py @@ -64,7 +64,7 @@ MODE_CHANNEL_NAME = "MODE" -MIN_NUMBER_OF_STEPS = 200 +MIN_NUMBER_OF_STEPS = 400 SENSOR_ID_MOST_LEFT = 0 SENSOR_ID_MOST_RIGHT = 4 @@ -76,8 +76,8 @@ ################################################################################ -class RobotController: - """Class for data flow control logic""" +class RobotController: # pylint: disable=too-many-instance-attributes + """Class for data flow control logic.""" def __init__(self, smp_server, tick_size, agent): self.__smp_server = smp_server @@ -86,6 +86,7 @@ def __init__(self, smp_server, tick_size, agent): self.__no_line_detection_count = 0 self.__timestamp = 0 # Elapsed time since reset [ms] self.last_sensor_data = None + self.start_stop_line_detected = False self.steps = 0 def callback_status(self, payload: bytearray) -> None: @@ -94,8 +95,7 @@ def callback_status(self, payload: bytearray) -> None: # perform action on robot status feedback if payload[0] == STATUS_CHANNEL_ERROR_VAL: print("robot has reached error-state (max. lap time passed in robot)") - - self.__agent.done = 1 + self.__agent.done = True def callback_line_sensors(self, payload: bytearray) -> None: """Callback LINE_SENS Channel.""" @@ -109,21 +109,24 @@ def callback_line_sensors(self, payload: bytearray) -> None: self.__no_line_detection_count = 0 # Detect start/stop line - is_start_stop = all( - value >= LINE_SENSOR_ON_TRACK_MIN_VALUE for value in sensor_data - ) + if ((sensor_data[SENSOR_ID_MOST_LEFT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE) and + (sensor_data[SENSOR_ID_MOST_RIGHT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE)): + self.start_stop_line_detected = True + # Detect Start/Stop Line before Finish Trajectories - if (is_start_stop is True) and (self.steps < MIN_NUMBER_OF_STEPS): + if (self.start_stop_line_detected is True) and (self.steps < MIN_NUMBER_OF_STEPS): sensor_data = list(sensor_data) sensor_data[SENSOR_ID_MOST_LEFT] = 0 sensor_data[SENSOR_ID_MOST_RIGHT] = 0 + self.start_stop_line_detected = False # sequence stop criterion debounce no line detection and start/stop line detected if self.__no_line_detection_count >= 30 or ( - is_start_stop and (self.steps >= MIN_NUMBER_OF_STEPS) + (self.start_stop_line_detected is True) and (self.steps >= MIN_NUMBER_OF_STEPS) ): self.__agent.done = True self.__no_line_detection_count = 0 + self.steps = 0 # The sequence of states and actions is stored in memory for the training phase. if self.__agent.train_mode: @@ -136,7 +139,7 @@ def callback_line_sensors(self, payload: bytearray) -> None: # Start storage The data after the second received sensor data if self.last_sensor_data is not None: - normalized_sensor_data = self.__agent.normalize_sensor_data(sensor_data) + normalized_sensor_data = self.__agent.normalize_sensor_data(self.last_sensor_data) self.__agent.store_transition( normalized_sensor_data, self.__agent.action, @@ -278,7 +281,6 @@ def main_loop(): # Start the training elif agent.state == "TRAINING": supervisor.last_sensor_data = None - controller.steps = 0 agent.perform_training() print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f}," diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py index 44f17fc..151dcf5 100644 --- a/webots/controllers/RL_Supervisor/trajectory_buffer.py +++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py @@ -40,8 +40,8 @@ ################################################################################ -class Buffer: # pylint: disable=too-many-instance-attributes - """Class for store and manage experience tuples during reinforcement learning""" +class Memory: # pylint: disable=too-many-instance-attributes + """Class for store and manage experience tuples during Reinforcement learning.""" # pylint: disable=too-many-arguments def __init__(self, batch_size, max_length, gamma, gae_lambda): @@ -51,6 +51,7 @@ def __init__(self, batch_size, max_length, gamma, gae_lambda): self.__actions = [] self.__rewards = [] self.__dones = [] + self.__advatages = [] self.__batch_size = batch_size self.__max_length = max_length self.__batch_size = batch_size @@ -85,14 +86,18 @@ def generate_batches(self): # Create batches by dividing the indices into groups of the batch_size batches = [indices[indx : indx + self.__batch_size] for indx in batch_start] + # the computed advantage values for each state in a given Data size. + self.__advatages = self.calculate_advantages(self.__rewards, + self.__vals, self.__dones) return ( np.array(self.__states), np.array(self.__actions), np.array(self.__probs), np.array(self.__vals), np.array(self.__rewards), - np.array(self.__dones), + np.array(self.__advatages), batches, + ) def get_sum_rewards(self) -> float: @@ -139,6 +144,8 @@ def clear_memory(self): self.__actions = [] self.__rewards = [] self.__dones = [] + self.__advatages = [] + self.__current_index = 0 def is_memory_full(self): """ @@ -148,8 +155,11 @@ def is_memory_full(self): ---------- - Bool: Memory is full or not """ + is_full = False + + if self.__current_index >= self.__max_length: + is_full = True - is_full = self.__current_index >= self.__max_length return is_full def calculate_advantages(self, rewards, values, dones): @@ -167,19 +177,19 @@ def calculate_advantages(self, rewards, values, dones): Returns ---------- NumPy array of float32: the computed advantage values for each - state in a given batch of experiences. + state in a given Data size. """ - mini_batch_size = rewards.shape[0] + data_size = len(rewards) # Create empty advantages array - advantages = np.zeros(mini_batch_size, dtype=np.float32) + advantages = np.zeros(data_size, dtype=np.float32) - for start_index in range(mini_batch_size): + for start_index in range(data_size-1): discount = 1 advantage = 0 - for future_index in range(start_index, mini_batch_size - 1): + for future_index in range(start_index, data_size - 1): # Calculate the temporal difference (TD) delta = ( @@ -198,10 +208,6 @@ def calculate_advantages(self, rewards, values, dones): # Update the discount factor for the next step discount *= self.__gamma * self.__gae_lambda - # Stop if a terminal state is reached - if dones[future_index]: - break - # Save the calculated advantage for the current state advantages[start_index] = advantage