From 25ef8ea66c1686946c94a71b2ed86a39d40981bb Mon Sep 17 00:00:00 2001
From: Kaya Celebi
Date: Sun, 8 Sep 2024 14:05:49 -0400
Subject: [PATCH] Implemented episode stops for learning agent module

---
 sim                                      | 76 -------------------------
 src/machine_learning/learning_agents.py  | 73 +++++++++++++++++-------
 2 files changed, 52 insertions(+), 97 deletions(-)
 delete mode 100644 sim

diff --git a/sim b/sim
deleted file mode 100644
index 7af8dfd..0000000
--- a/sim
+++ /dev/null
@@ -1,76 +0,0 @@
-import numpy as np
-
-'''
-    Automata is state by state
-    Note: might be better to do this in C for speedup
-
-    idea -- create reinforcement learning model
-        - we have state, action
-        - model learns to add points to the CA state to achieve some outcome
-        - consider: ask agent to create a certain state, or some qualification using k moves
-            - how does it do it
-
-    - we need pattern detection
-    - we need cycle detection
-    - can we transform each image to a graph...
-        - each state to a graph?
-
-    - some web server where you can pkay / analyze differnet automata setup
-        - have the interactive code etc
-        - need to port it all to JS
-'''
-
-
-'''
-    Brute force update -- O(n^2)
-
-    NOTE: this is *not* updating in-place
-'''
-def update(state, rule):
-    state = state_fix(state)
-
-    x, y = state.shape
-    new_state = np.zeros(state.shape, dtype = int)
-
-    for i in range(x):
-        for j in range(y):
-            neighbors = get_neighbors(i, j, shape = state.shape)
-            new_state[i, j] = rule(neighbors, cell = state[i, j], state = state)
-
-    return new_state
-
-def np_update(state):
-    x, y = state.shape
-    new_state = np.zeros(state.shape)
-
-    # write a lambda to generate the update more efficiently
-    return
-
-'''
-    Returns all valid neighbors
-'''
-def get_neighbors(i, j, shape):
-    # list comp generates all neighbors including center. If center or invalid neighbor,
-    # does i-10, j-10 as coord to remove in next step
-    neighbors = np.reshape([[[i + u, j + v] if in_range(i + u, j + v, shape = shape) else [i - 10, j - 10] for u in [-1, 0, 1]] for v in [-1, 0, 1]], (-1, 2))
-    return neighbors[~np.all(np.logical_or(neighbors == [i, j], neighbors == [i - 10, j - 10]), axis = 1)] # make sure to exlude center and not in-range values
-
-'''
-    Check the provided coord is in range of the matrix
-'''
-def in_range(i, j, shape):
-    if i < shape[0] and i > -1 and j > -1 and j < shape[1]:
-        return True
-    return False
-
-def get_random_state(shape):
-    return np.random.randint(0, 2, size = shape)
-
-def state_fix(state):
-    if type(state) != np.array:
-        state = np.array(state, dtype = int)
-    if len(state.shape) == 1:
-        state = state.reshape((1, -1))
-
-    return state
\ No newline at end of file
diff --git a/src/machine_learning/learning_agents.py b/src/machine_learning/learning_agents.py
index 09a50f8..5429b9e 100644
--- a/src/machine_learning/learning_agents.py
+++ b/src/machine_learning/learning_agents.py
@@ -20,28 +20,28 @@ def get_action(self, state):
 
 class ValueEstimationAgent(Agent):
     """
-        Abstract agent which assigns values to (state,action)
-        Q-Values for an environment. As well as a value to a
-        state and a policy given respectively by,
-
-        V(s) = max_{a in actions} Q(s,a)
-        policy(s) = arg_max_{a in actions} Q(s,a)
-
-        Both ValueIterationAgent and QLearningAgent inherit
-        from this agent. While a ValueIterationAgent has
-        a model of the environment via a MarkovDecisionProcess
-        (see mdp.py) that is used to estimate Q-Values before
-        ever actually acting, the QLearningAgent estimates
-        Q-Values while acting in the environment.
- """ + Abstract agent which assigns values to (state,action) + Q-Values for an environment. As well as a value to a + state and a policy given respectively by, + + V(s) = max_{a in actions} Q(s,a) + policy(s) = arg_max_{a in actions} Q(s,a) + + Both ValueIterationAgent and QLearningAgent inherit + from this agent. While a ValueIterationAgent has + a model of the environment via a MarkovDecisionProcess + (see mdp.py) that is used to estimate Q-Values before + ever actually acting, the QLearningAgent estimates + Q-Values while acting in the environment. + """ def __init__(self, alpha, epsilon, gamma, num_training): ''' - alpha - learning rate - epsilon - exploration rate - gamma - discount factor - num_training - number of training episodes, i.e. no learning after these many episodes - ''' + alpha - learning rate + epsilon - exploration rate + gamma - discount factor + num_training - number of training episodes, i.e. no learning after these many episodes + ''' self.alpha = alpha self.epsilon = epsilon self.gamma = gamma @@ -71,12 +71,43 @@ def observe_transition(self, state, action, next_state, delta_reward): self.episode_rewards += delta_reward self.update(state, action, next_state, delta_reward) + ''' + Called by env when new episode is starting + ''' def start_episode(self): - ... + self.last_state = None + self.last_action = None + self.episode_rewards = 0.0 def stop_episode(self): - ... + if self.is_in_training(): + self.accum_train_rewards += self.episode_rewards + else: + self.accum_test_rewards += self.episode_rewards + + self.episodes_so_far += 1 + + # set vars for testing + if self.is_in_testing: + self.epsilon = 0.0 # no exploration + self.alpha = 0.0 # no learning + + + def is_in_training(self): + return self.episodes_so_far < self.num_training + + def is_in_testing(self): + return not self.is_in_training + + + ''' + actionFn: Function which takes a state and returns the list of legal actions + alpha - learning rate + epsilon - exploration rate + gamma - discount factor + numTraining - number of training episodes, i.e. no learning after these many episodes + ''' def __init__(self, action_func = None, num_training = 100, epsilon = 0.5, alpha = 0.5, gamma = 1): # we should never be in this position, overwrite this later if action_func is None: