added comments, renamed variables, and updated values
Akram authored and Akram committed Aug 19, 2024
1 parent 849771d commit 0a5001c
Showing 3 changed files with 70 additions and 38 deletions.
64 changes: 34 additions & 30 deletions webots/controllers/RL_Supervisor/agent.py
@@ -102,7 +102,7 @@ def __init__(
self.__neural_network = Networks(self.__alpha)
self.__training_index = 0 # Track batch index during training
self.__current_batch = None # Stores the batch that is currently being processed
self.__std_dev = 1
self.__std_dev = 0.9
self.done = False
self.action = None
self.value = None
@@ -156,13 +156,14 @@ def predict_action(self, state):
m_state = self.normalize_sensor_data(state)
state = tf.convert_to_tensor([m_state], dtype=tf.float32)

# Calculation of probabilities by the Actor neural network
probs = self.__neural_network.actor_network(state)
# Output of the Actor network: the mean of the action distribution.
action_mean = self.__neural_network.actor_network(state)

# Training mode is set.
if self.train_mode is True:
# Create a normal distribution with the calculated probabilities
# and the standard deviation
dist = tfp.distributions.Normal(probs, self.__std_dev)

# Create a normal distribution
dist = tfp.distributions.Normal(action_mean, self.__std_dev)

# Sampling an action from the normal distribution
sampled_action = dist.sample()
@@ -185,8 +186,10 @@ def predict_action(self, state):
self.action = transformed_action.numpy()[0]
self.value = value.numpy()[0]
self.adjusted_log_prob = adjusted_log_prob.numpy()[0]

# Driving mode is set
else:
self.action = probs.numpy()[0]
self.action = action_mean.numpy()[0]

return self.action
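
The hunk above renames probs to action_mean: the Actor network's output is now treated as the mean of a continuous action distribution rather than as probabilities, with the fixed __std_dev (now 0.9) as the spread. A minimal sketch of the sampling pattern this suggests, assuming a SAC-style tanh squash for the log-probability adjustment (the function name, the epsilon, and the squash itself are illustrative, not taken from this commit):

import tensorflow as tf
import tensorflow_probability as tfp

def sample_squashed_action(action_mean, std_dev=0.9):
    """Sample an action from N(action_mean, std_dev) and squash it to [-1, 1]."""
    dist = tfp.distributions.Normal(loc=action_mean, scale=std_dev)
    raw_action = dist.sample()               # unbounded Gaussian sample
    squashed_action = tf.tanh(raw_action)    # bounded action in [-1, 1]
    # Change-of-variables correction for the tanh transform (SAC-style assumption).
    adjusted_log_prob = dist.log_prob(raw_action) - tf.math.log(
        1.0 - tf.square(squashed_action) + 1e-6
    )
    return squashed_action, adjusted_log_prob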

Expand Down Expand Up @@ -240,6 +243,7 @@ def update(self, robot_node):
self.data_sent = self.__serialmux.send_data(
"SPEED_SET", motorcontrol
) # stop the motors immediately

# Failed to send data. Append it to the unsent_data list.
if self.data_sent is False:
self.unsent_data.append(("SPEED_SET", motorcontrol))
@@ -253,29 +257,9 @@ def update(self, robot_node):
def normalize_sensor_data(self, sensor_data):
"""The normalize_sensor_data function scales the sensor data to a range between 0 and 1."""

# Normalized sensor data
sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE

return sensor_data

def calculate_reward(self, sensor_data):
"""
The calculate_reward function evaluates the consequences of a certain
action performed in a certain state by calculating the resulting reward
"""
estimated_pos = self.calculate_position(sensor_data)

# Return reward between 0 and 10
if 500 <= estimated_pos <= 2000:
reward = ((1 / 150) * estimated_pos) - (10 / 3)
return reward

if 2000 < estimated_pos <= 3500:
reward = ((-1 / 150) * estimated_pos) + (70 / 3)
return reward

return 0

def calculate_position(self, sensor_data):
"""
Determines the deviation and returns an estimated position of the robot
@@ -339,6 +323,26 @@ def calculate_position(self, sensor_data):

return estimated_pos

def calculate_reward(self, sensor_data):
"""
The calculate_reward function evaluates the consequences of a certain
action performed in a certain state by calculating the resulting reward.
A reward of 1 means that the robot is in the center of the line.
"""
estimated_pos = self.calculate_position(sensor_data)

# Reward scaled from 0 up to 1 if the robot is on the line.
if 500 <= estimated_pos <= 2000:
reward = (((1 / 150) * estimated_pos) - (10 / 3)) / 10
return reward

# Reward scaled from 1 down to 0 if the robot is on the line.
if 2000 < estimated_pos <= 3500:
reward = (((-1 / 150) * estimated_pos) + (70 / 3)) / 10
return reward

return 0
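
The relocated calculate_reward above rescales the old 0 to 10 reward into the range 0 to 1. The two branches together form a triangular profile: the reward peaks at 1.0 when the estimated position is 2000 (the line center, per the docstring) and falls linearly to 0 at positions 500 and 3500. An equivalent single-branch sketch, purely illustrative and not the repository's code:

def reward_from_position(estimated_pos: float) -> float:
    """Triangular reward: 1.0 at position 2000, falling to 0.0 at 500 and 3500."""
    if 500 <= estimated_pos <= 3500:
        return 1.0 - abs(estimated_pos - 2000) / 1500
    return 0.0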

def calculate_advantages(self, rewards, values, dones):
"""Calculate advantages for each state in a mini-batch."""

@@ -386,7 +390,7 @@ def learn(self, states, actions, old_probs, values, rewards, dones):

advantages = self.calculate_advantages(rewards, values, dones)

# optimize Actor Network weights
# optimize Actor Network weights
with tf.GradientTape() as tape:
states = tf.convert_to_tensor(states)
actions = tf.convert_to_tensor(actions)
@@ -426,7 +430,7 @@ def learn(self, states, actions, old_probs, values, rewards, dones):
# optimize Critic Network weights
with tf.GradientTape() as tape:

# The critic value represents the expected return from state 𝑠𝑡.
# The critic value represents the expected return from state 𝑠𝑡.
# It provides an estimate of how good it is to be in a given state.
critic_value = self.__neural_network.critic_network(states)
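
The loss computations inside these GradientTape blocks are collapsed in this view. Given the learn(states, actions, old_probs, values, rewards, dones) signature and the advantage calculation, the actor update is presumably a PPO-style clipped surrogate; the generic form of that objective is sketched below as an assumption, not as code taken from this commit (clip_ratio is a hypothetical parameter):

import tensorflow as tf
import tensorflow_probability as tfp

def clipped_surrogate_loss(new_mean, std_dev, actions, old_log_probs,
                           advantages, clip_ratio=0.2):
    """Negative PPO clipped surrogate, ready to be minimized by an optimizer."""
    dist = tfp.distributions.Normal(loc=new_mean, scale=std_dev)
    new_log_probs = dist.log_prob(actions)
    ratio = tf.exp(new_log_probs - old_log_probs)  # pi_new(a|s) / pi_old(a|s)
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    surrogate = tf.minimum(ratio * advantages, clipped_ratio * advantages)
    return -tf.reduce_mean(surrogate)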

@@ -479,7 +483,7 @@ def perform_training(self):
# Grab sample from memory
self.__current_batch = self.__memory.generate_batches()

# Perform training with mini batchtes.
# Perform training with mini batches.
if self.__training_index < len(self.__current_batch[-1]):
(
state_arr,
27 changes: 23 additions & 4 deletions webots/controllers/RL_Supervisor/networks.py
@@ -30,6 +30,7 @@

import tensorflow as tf # pylint: disable=import-error
from tensorflow.keras import layers # pylint: disable=import-error
from tensorflow.keras.regularizers import l2 # pylint: disable=import-error

################################################################################
# Variables
@@ -59,22 +60,31 @@ def build_actor_network(self):
64,
activation="relu",
kernel_initializer="he_normal",
kernel_regularizer=l2(0.01),
bias_initializer="zeros",
)(state_input)
fc2 = layers.Dense(
64,
activation="relu",
kernel_initializer="he_normal",
kernel_regularizer=l2(0.01),
bias_initializer="zeros",
)(fc1)
mu = layers.Dense(
fc3 = layers.Dense(
32,
activation="relu",
kernel_initializer="he_normal",
kernel_regularizer=l2(0.01),
bias_initializer="zeros",
)(fc2)
mean = layers.Dense(
1,
activation="tanh",
kernel_initializer="glorot_uniform",
bias_initializer="zeros",
)(fc2)
)(fc3)

return tf.keras.models.Model(inputs=state_input, outputs=mu)
return tf.keras.models.Model(inputs=state_input, outputs=mean)

def build_critic_network(self):
"""Build Critic Network"""
@@ -84,15 +94,24 @@ def build_critic_network(self):
64,
activation="relu",
kernel_initializer="he_normal",
kernel_regularizer=l2(0.01),
bias_initializer="zeros",
)(state_input)
fc2 = layers.Dense(
64,
activation="relu",
kernel_initializer="he_normal",
kernel_regularizer=l2(0.01),
bias_initializer="zeros",
)(fc1)
value = layers.Dense(1)(fc2) # Value output
fc3 = layers.Dense(
32,
activation="relu",
kernel_initializer="he_normal",
kernel_regularizer=l2(0.01),
bias_initializer="zeros",
)(fc2)
value = layers.Dense(1)(fc3) # Value output

return tf.keras.models.Model(inputs=state_input, outputs=value)
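
Every hidden Dense layer in both networks now carries kernel_regularizer=l2(0.01), and each network gains a third 32-unit hidden layer. In Keras, l2(0.01) adds 0.01 times the sum of squared kernel weights for each regularized layer to model.losses, which a custom training loop must fold into the total loss. A minimal illustration of that mechanism (an assumed standalone example, not taken from this repository; the input shape is arbitrary):

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2

inputs = tf.keras.Input(shape=(5,))
hidden = layers.Dense(64, activation="relu", kernel_regularizer=l2(0.01))(inputs)
model = tf.keras.Model(inputs=inputs, outputs=hidden)

# model.losses holds one penalty tensor per regularized kernel; add
# tf.reduce_sum(model.losses) to the task loss inside a GradientTape loop.
print(model.losses)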

17 changes: 13 additions & 4 deletions webots/controllers/RL_Supervisor/rl_supervisor.py
@@ -101,13 +101,14 @@ def callback_line_sensors(self, payload: bytearray) -> None:
"""Callback LINE_SENS Channel."""
sensor_data = struct.unpack("5H", payload)
self.steps += 1
# determine lost line condition

# Determine lost line condition
if all(value == 0 for value in sensor_data):
self.__no_line_detection_count += 1
else:
self.__no_line_detection_count = 0

# detect start/stop line
# Detect start/stop line
is_start_stop = all(
value >= LINE_SENSOR_ON_TRACK_MIN_VALUE for value in sensor_data
)
@@ -126,7 +127,12 @@ def callback_line_sensors(self, payload: bytearray) -> None:

# The sequence of states and actions is stored in memory for the training phase.
if self.__agent.train_mode:
reward = self.__agent.calculate_reward(sensor_data)

# receive a -1 punishment if the robot leaves the line
if self.__no_line_detection_count > 0:
reward = -1
else:
reward = self.__agent.calculate_reward(sensor_data)

# Start storing data once the second sensor reading has been received
if self.last_sensor_data is not None:
@@ -159,6 +165,8 @@ def load_models(self, path) -> None:
"""Load Model if exist"""
if os.path.exists(path):
self.__agent.load_models()
else:
print("No model available")

def retry_unsent_data(self, unsent_data: list) -> bool:
"""Resent any unsent Data"""
@@ -259,6 +267,7 @@ def main_loop():
if status != -1:

controller.load_models(PATH)

# simulation loop
while supervisor.step(timestep) != -1:
controller.process()
@@ -273,7 +282,7 @@ def main_loop():
agent.perform_training()

if 1000 <= agent.num_episodes:
print(f"Episodes: {agent.num_episodes}")
print(f"The number of episodes:{agent.num_episodes}")

# Resend any unsent data
if agent.unsent_data:
