Rename classes, fix bugs in calculate_advantages, and update hyperparameter values
Akram authored and Akram committed Sep 3, 2024
1 parent d39d111 commit 812fbb5
Showing 4 changed files with 81 additions and 70 deletions.
70 changes: 36 additions & 34 deletions webots/controllers/RL_Supervisor/agent.py
@@ -33,8 +33,8 @@
import numpy as np # pylint: disable=import-error
import tensorflow as tf # pylint: disable=import-error
import tensorflow_probability as tfp # pylint: disable=import-error
from trajectory_buffer import Buffer
from networks import Networks
from trajectory_buffer import Memory
from networks import Models

################################################################################
# Variables
@@ -57,32 +57,35 @@
MODE_CHANNEL_NAME = "MODE"
CMD_ID_SET_READY_STATE = 1
CMD_ID_SET_TRAINING_STATE = 2
POSITION_DATA = [-0.24713614078815466, 0.01, 0.013994298332013683]
POSITION_DATA = [-0.24713614078815466, -0.04863962992854465, 0.013994298332013683]
ORIENTATION_DATA = [
-1.0564747468923541e-06,
8.746699709178704e-07,
0.9999999999990595,
1.5880805820884731,
1.5880805820884731
]
MAX_SENSOR_VALUE = 1000
MIN_STD_DEV = 0.1 # Minimum standard deviation
STD_DEV_FACTOR = 0.9995 # Standard deviation discount factor
MIN_STD_DEV = 0.01 # Minimum standard deviation
STD_DEV_FACTOR = 0.995 # Standard deviation discount factor

################################################################################
# Classes
################################################################################


class Agent: # pylint: disable=too-many-instance-attributes
"""The Agent class represents an intelligent agent that makes decisions to
control motors based on the position of the robot."""
"""
The Agent class represents an intelligent agent that makes decisions to
control motors based on the position of the robot.
"""

# pylint: disable=too-many-arguments
def __init__(
self,
smp_server,
gamma=0.99,
alpha=0.0003,
actor_alpha=0.0001,
critic_alpha=0.0003,
gae_lambda=0.95,
policy_clip=0.2,
batch_size=64,
@@ -91,17 +94,16 @@ def __init__(
max_buffer_length=65536,
):
self.__serialmux = smp_server
self.__alpha = alpha
self.__policy_clip = policy_clip
self.__chkpt_dir = chkpt_dir
self.train_mode = False
self.__top_speed = top_speed
self.__buffer = Buffer(batch_size, max_buffer_length, gamma, gae_lambda)
self.__neural_network = Networks(self.__alpha)
self.__memory = Memory(batch_size, max_buffer_length, gamma, gae_lambda)
self.__neural_network = Models(actor_alpha, critic_alpha)
self.__training_index = 0 # Track batch index during training
self.__current_batch = None # Saving of the current batch which is in process
self.__std_dev = 0.9
self.n_epochs = 10
self.__std_dev = 0.05
self.n_epochs = 3
self.done = False
self.action = None
self.value = None
@@ -139,7 +141,7 @@ def store_transition(
reward: The reward received.
done: Indicating whether the target sequence has been reached.
"""
self.__buffer.store_memory(state, action, probs, value, reward, done)
self.__memory.store_memory(state, action, probs, value, reward, done)

def save_models(self):
"""Saves the models in the specified file."""
@@ -250,7 +252,7 @@ def update(self, robot_node):

# Checks whether the sequence has ended if it is set to Training mode.
if (self.train_mode is True) and (
(self.done is True) or (self.__buffer.is_memory_full() is True)
(self.done is True) or (self.__memory.is_memory_full() is True)
):
cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE)
self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)
@@ -277,11 +279,13 @@ def update(self, robot_node):
if self.data_sent is False:
self.unsent_data.append(("SPEED_SET", motorcontrol))

self.reinitialize(robot_node)
self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)

# Failed to send data. Append the data to the unsent_data list.
if self.data_sent is False:
self.unsent_data.append(("CMD", cmd_payload))
self.state = "IDLE"

def normalize_sensor_data(self, sensor_data):
"""
@@ -314,29 +318,26 @@ def determine_reward(self, sensor_data):
float: The resulting reward.
"""
reward = self.__buffer.calculate_reward(sensor_data)
reward = self.__memory.calculate_reward(sensor_data)
return reward

# pylint: disable=too-many-arguments
# pylint: disable=too-many-locals
def learn(self, states, actions, old_probs, values, rewards, dones):
def learn(self, states, actions, old_probs, values, rewards, advantages):
"""
Perform training to optimize model weights.
Parameters
----------
states: The saved states observed during interactions with the environment.
actions: The saved actions taken in response to the observed states.
old_probs: The saved probabilities of the actions taken, based on the previous policy.
values: The saved estimated values of the observed states.
rewards: The saved rewards received for taking the actions.
dones: The saved flags indicating whether the target sequence or episode has
been completed.
states: The saved states observed during interactions with the environment.
actions: The saved actions taken in response to the observed states.
old_probs: The saved probabilities of the actions taken, based on the previous policy.
values: The saved estimated values of the observed states.
rewards: The saved rewards received for taking the actions.
advantages: The computed advantage values for each state in the given batch.
"""
for _ in range(self.n_epochs):

#the computed advantage values for each state in a given batch of experiences.
advantages = self.__buffer.calculate_advantages(rewards, values, dones)
for _ in range(self.n_epochs):

# optimize Actor Network weights
with tf.GradientTape() as tape:
@@ -407,8 +408,8 @@ def learn(self, states, actions, old_probs, values, rewards, dones):
self.critic_loss_history.append(critic_loss.numpy())
self.reward_history.append(sum(rewards))

# saving logs in a CSV file
self.save_logs_to_csv()
# saving logs in a CSV file
self.save_logs_to_csv()

def save_logs_to_csv(self):
"""Function for saving logs in a CSV file"""
@@ -436,7 +437,7 @@ def perform_training(self):
if self.__current_batch is None:

# Grab sample from memory
self.__current_batch = self.__buffer.generate_batches()
self.__current_batch = self.__memory.generate_batches()

# Perform training with mini batches.
if self.__training_index < len(self.__current_batch[-1]):
@@ -446,18 +447,19 @@
old_prob_arr,
vals_arr,
reward_arr,
dones_arr,
advantage_arr,
batches,
) = self.__current_batch
batch = batches[self.__training_index]

# pylint: disable=too-many-arguments
self.learn(
state_arr[batch],
action_arr[batch],
old_prob_arr[batch],
vals_arr[batch],
reward_arr[batch],
dones_arr[batch],
advantage_arr[batch]
)
self.__training_index += 1

@@ -466,7 +468,7 @@
self.__training_index = 0
self.__current_batch = None
self.done = False
self.__buffer.clear_memory()
self.__memory.clear_memory()
self.state = "IDLE"
self.num_episodes += 1
cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE)
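Note on the calculate_advantages fix: learn() now receives advantages that the Memory buffer computes once per stored trajectory, instead of recomputing them from rewards and dones inside the epoch loop. The buffer implementation itself is not part of this diff; the sketch below shows one common way such a calculation looks under generalized advantage estimation (GAE), using the gamma and gae_lambda defaults from Agent.__init__. The function name and signature are illustrative, not taken from trajectory_buffer.py.

```python
import numpy as np


def calculate_advantages(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """Illustrative GAE(lambda) advantage computation over one stored trajectory."""
    num_steps = len(rewards)
    advantages = np.zeros(num_steps, dtype=np.float32)
    last_advantage = 0.0

    # Walk the trajectory backwards, accumulating discounted TD errors.
    # A done flag cuts the bootstrap so no value leaks across episodes.
    for step in reversed(range(num_steps)):
        next_value = values[step + 1] if step + 1 < num_steps else 0.0
        non_terminal = 1.0 - float(dones[step])
        delta = rewards[step] + gamma * next_value * non_terminal - values[step]
        last_advantage = delta + gamma * gae_lambda * non_terminal * last_advantage
        advantages[step] = last_advantage

    return advantages
```

Computing the advantages once per trajectory also keeps them fixed across the n_epochs PPO updates, which is the usual PPO convention.

On the hyperparameter side, __std_dev now starts at 0.05 and decays towards MIN_STD_DEV = 0.01 with STD_DEV_FACTOR = 0.995. Assuming one multiplicative decay step per update (the decay site is outside the shown hunks), the exploration noise reaches its floor after roughly 320 updates instead of about 4,400 with the previous values:

```python
import math

# Updates needed for the standard deviation to decay from its start value
# to the configured floor, assuming one multiplicative step per update.
old_updates = math.log(0.1 / 0.9) / math.log(0.9995)   # previous values, ~4394
new_updates = math.log(0.01 / 0.05) / math.log(0.995)  # new values, ~321
print(round(old_updates), round(new_updates))
```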
23 changes: 12 additions & 11 deletions webots/controllers/RL_Supervisor/networks.py
@@ -28,9 +28,9 @@
# Imports
################################################################################

import tensorflow as tf # pylint: disable=import-error
from tensorflow.keras import layers # pylint: disable=import-error
from tensorflow.keras.regularizers import l2 # pylint: disable=import-error
from tensorflow import keras # pylint: disable=import-error
from keras import layers # pylint: disable=import-error
from keras.regularizers import l2 # pylint: disable=import-error

################################################################################
# Variables
@@ -42,15 +42,16 @@
################################################################################


class Networks:
"""Class for building networks of actors and critics"""
class Models:
"""Class for building networks of actors and critics."""

def __init__(self, alpha):
self.__learning_rate = alpha
def __init__(self, actor_alpha, critic_alpha):
self.__actor_learning_rate = actor_alpha
self.__critic_learning_rate = critic_alpha
self.actor_network = self.build_actor_network()
self.critic_network = self.build_critic_network()
self.actor_optimizer = tf.keras.optimizers.Adam(self.__learning_rate)
self.critic_optimizer = tf.keras.optimizers.Adam(self.__learning_rate)
self.actor_optimizer = keras.optimizers.Adam(self.__actor_learning_rate)
self.critic_optimizer = keras.optimizers.Adam(self.__critic_learning_rate)

def build_actor_network(self):
"""Build Actor Network."""
Expand Down Expand Up @@ -84,7 +85,7 @@ def build_actor_network(self):
bias_initializer="zeros",
)(fc3)

return tf.keras.models.Model(inputs=state_input, outputs=mean)
return keras.models.Model(inputs=state_input, outputs=mean)

def build_critic_network(self):
"""Build Critic Network"""
@@ -113,7 +114,7 @@
)(fc2)
value = layers.Dense(1)(fc3) # Value output

return tf.keras.models.Model(inputs=state_input, outputs=value)
return keras.models.Model(inputs=state_input, outputs=value)


################################################################################
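With the rename from Networks to Models, the single alpha is split into actor_alpha and critic_alpha, giving each network its own Adam optimizer. A minimal usage sketch of the critic side with dummy data follows; the actor_optimizer is driven the same way with the PPO actor loss. The 5-element input shape is an assumption based on the five line sensors (the input layers sit outside the shown hunks), and the script assumes it runs next to networks.py.

```python
import numpy as np
import tensorflow as tf

from networks import Models

models = Models(actor_alpha=0.0001, critic_alpha=0.0003)

# Dummy batch: 8 states of 5 normalized sensor readings and 8 value targets.
states = tf.convert_to_tensor(np.random.rand(8, 5), dtype=tf.float32)
targets = tf.convert_to_tensor(np.random.rand(8, 1), dtype=tf.float32)

with tf.GradientTape() as tape:
    predicted = models.critic_network(states)
    critic_loss = tf.reduce_mean(tf.square(targets - predicted))

grads = tape.gradient(critic_loss, models.critic_network.trainable_variables)
models.critic_optimizer.apply_gradients(
    zip(grads, models.critic_network.trainable_variables)
)
```

Keeping the critic learning rate higher than the actor's (0.0003 vs. 0.0001) is a common PPO choice: the value function can be fitted faster than the policy is allowed to move.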
26 changes: 14 additions & 12 deletions webots/controllers/RL_Supervisor/rl_supervisor.py
@@ -64,7 +64,7 @@

MODE_CHANNEL_NAME = "MODE"

MIN_NUMBER_OF_STEPS = 200
MIN_NUMBER_OF_STEPS = 400
SENSOR_ID_MOST_LEFT = 0
SENSOR_ID_MOST_RIGHT = 4

@@ -76,8 +76,8 @@
################################################################################


class RobotController:
"""Class for data flow control logic"""
class RobotController: # pylint: disable=too-many-instance-attributes
"""Class for data flow control logic."""

def __init__(self, smp_server, tick_size, agent):
self.__smp_server = smp_server
@@ -86,6 +86,7 @@ def __init__(self, smp_server, tick_size, agent):
self.__no_line_detection_count = 0
self.__timestamp = 0 # Elapsed time since reset [ms]
self.last_sensor_data = None
self.start_stop_line_detected = False
self.steps = 0

def callback_status(self, payload: bytearray) -> None:
@@ -94,8 +95,7 @@ def callback_status(self, payload: bytearray) -> None:
# perform action on robot status feedback
if payload[0] == STATUS_CHANNEL_ERROR_VAL:
print("robot has reached error-state (max. lap time passed in robot)")

self.__agent.done = 1
self.__agent.done = True

def callback_line_sensors(self, payload: bytearray) -> None:
"""Callback LINE_SENS Channel."""
@@ -109,21 +109,24 @@ def callback_line_sensors(self, payload: bytearray) -> None:
self.__no_line_detection_count = 0

# Detect start/stop line
is_start_stop = all(
value >= LINE_SENSOR_ON_TRACK_MIN_VALUE for value in sensor_data
)
if ((sensor_data[SENSOR_ID_MOST_LEFT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE) and
(sensor_data[SENSOR_ID_MOST_RIGHT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE)):
self.start_stop_line_detected = True

# Detect Start/Stop Line before Finish Trajectories
if (is_start_stop is True) and (self.steps < MIN_NUMBER_OF_STEPS):
if (self.start_stop_line_detected is True) and (self.steps < MIN_NUMBER_OF_STEPS):
sensor_data = list(sensor_data)
sensor_data[SENSOR_ID_MOST_LEFT] = 0
sensor_data[SENSOR_ID_MOST_RIGHT] = 0
self.start_stop_line_detected = False

# Sequence stop criterion: no line detected (debounced) or start/stop line detected
if self.__no_line_detection_count >= 30 or (
is_start_stop and (self.steps >= MIN_NUMBER_OF_STEPS)
(self.start_stop_line_detected is True) and (self.steps >= MIN_NUMBER_OF_STEPS)
):
self.__agent.done = True
self.__no_line_detection_count = 0
self.steps = 0

# The sequence of states and actions is stored in memory for the training phase.
if self.__agent.train_mode:
@@ -136,7 +139,7 @@ def callback_line_sensors(self, payload: bytearray) -> None:

# Start storing data once the second sensor reading has been received
if self.last_sensor_data is not None:
normalized_sensor_data = self.__agent.normalize_sensor_data(sensor_data)
normalized_sensor_data = self.__agent.normalize_sensor_data(self.last_sensor_data)
self.__agent.store_transition(
normalized_sensor_data,
self.__agent.action,
@@ -278,7 +281,6 @@ def main_loop():
# Start the training
elif agent.state == "TRAINING":
supervisor.last_sensor_data = None
controller.steps = 0
agent.perform_training()

print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f},"
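The normalize_sensor_data change in callback_line_sensors stores the previous reading (self.last_sensor_data), i.e. the state the pending action and value estimate were actually produced for, rather than the reading that arrived after the action was applied. A self-contained sketch of that alignment, with a hypothetical normalize() helper standing in for Agent.normalize_sensor_data (assumed here to divide by MAX_SENSOR_VALUE; the real body is not part of this diff):

```python
MAX_SENSOR_VALUE = 1000


def normalize(sensor_data):
    """Hypothetical stand-in for Agent.normalize_sensor_data: scale into [0, 1]."""
    return [value / MAX_SENSOR_VALUE for value in sensor_data]


# Three successive raw line-sensor readings (dummy values).
readings = [[0, 120, 800, 130, 0], [0, 90, 760, 180, 0], [0, 60, 700, 240, 0]]

transitions = []
last_sensor_data = None

for current in readings:
    if last_sensor_data is not None:
        # Store the reading the previous action was chosen for,
        # not the reading produced after that action was executed.
        transitions.append(normalize(last_sensor_data))
    last_sensor_data = current

assert len(transitions) == 2  # the first reading only seeds last_sensor_data
```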