From 25ef8ea66c1686946c94a71b2ed86a39d40981bb Mon Sep 17 00:00:00 2001
From: Kaya Celebi
Date: Sun, 8 Sep 2024 14:05:49 -0400
Subject: [PATCH] Implemented episode stops for learning agent module

---
 sim                                      | 76 -------------------------
 src/machine_learning/learning_agents.py  | 73 +++++++++++++++++-------
 2 files changed, 52 insertions(+), 97 deletions(-)
 delete mode 100644 sim

diff --git a/sim b/sim
deleted file mode 100644
index 7af8dfd..0000000
--- a/sim
+++ /dev/null
@@ -1,76 +0,0 @@
-import numpy as np
-
-'''
-    Automata is state by state
-    Note: might be better to do this in C for speedup
-
-    idea -- create reinforcement learning model
-        - we have state, action
-        - model learns to add points to the CA state to achieve some outcome
-        - consider: ask agent to create a certain state, or some qualification using k moves
-            - how does it do it
-
-    - we need pattern detection
-    - we need cycle detection
-    - can we transform each image to a graph...
-        - each state to a graph?
-
-    - some web server where you can pkay / analyze differnet automata setup
-        - have the interactive code etc
-        - need to port it all to JS
-'''
-
-
-'''
-    Brute force update -- O(n^2)
-
-    NOTE: this is *not* updating in-place
-'''
-def update(state, rule):
-    state = state_fix(state)
-
-    x, y = state.shape
-    new_state = np.zeros(state.shape, dtype = int)
-
-    for i in range(x):
-        for j in range(y):
-            neighbors = get_neighbors(i, j, shape = state.shape)
-            new_state[i, j] = rule(neighbors, cell = state[i, j], state = state)
-
-    return new_state
-
-def np_update(state):
-    x, y = state.shape
-    new_state = np.zeros(state.shape)
-
-    # write a lambda to generate the update more efficiently
-    return
-
-'''
-    Returns all valid neighbors
-'''
-def get_neighbors(i, j, shape):
-    # list comp generates all neighbors including center. If center or invalid neighbor,
-    # does i-10, j-10 as coord to remove in next step
-    neighbors = np.reshape([[[i + u, j + v] if in_range(i + u, j + v, shape = shape) else [i - 10, j - 10] for u in [-1, 0, 1]] for v in [-1, 0, 1]], (-1, 2))
-    return neighbors[~np.all(np.logical_or(neighbors == [i, j], neighbors == [i - 10, j - 10]), axis = 1)] # make sure to exlude center and not in-range values
-
-'''
-    Check the provided coord is in range of the matrix
-'''
-def in_range(i, j, shape):
-    if i < shape[0] and i > -1 and j > -1 and j < shape[1]:
-        return True
-    return False
-
-def get_random_state(shape):
-    return np.random.randint(0, 2, size = shape)
-
-def state_fix(state):
-    if type(state) != np.array:
-        state = np.array(state, dtype = int)
-    if len(state.shape) == 1:
-        state = state.reshape((1, -1))
-
-    return state
\ No newline at end of file
diff --git a/src/machine_learning/learning_agents.py b/src/machine_learning/learning_agents.py
index 09a50f8..5429b9e 100644
--- a/src/machine_learning/learning_agents.py
+++ b/src/machine_learning/learning_agents.py
@@ -20,28 +20,28 @@ def get_action(self, state):
 
 class ValueEstimationAgent(Agent):
     """
-        Abstract agent which assigns values to (state,action)
-        Q-Values for an environment. As well as a value to a
-        state and a policy given respectively by,
-
-        V(s) = max_{a in actions} Q(s,a)
-        policy(s) = arg_max_{a in actions} Q(s,a)
-
-        Both ValueIterationAgent and QLearningAgent inherit
-        from this agent. While a ValueIterationAgent has
-        a model of the environment via a MarkovDecisionProcess
-        (see mdp.py) that is used to estimate Q-Values before
-        ever actually acting, the QLearningAgent estimates
-        Q-Values while acting in the environment.
- """ + Abstract agent which assigns values to (state,action) + Q-Values for an environment. As well as a value to a + state and a policy given respectively by, + + V(s) = max_{a in actions} Q(s,a) + policy(s) = arg_max_{a in actions} Q(s,a) + + Both ValueIterationAgent and QLearningAgent inherit + from this agent. While a ValueIterationAgent has + a model of the environment via a MarkovDecisionProcess + (see mdp.py) that is used to estimate Q-Values before + ever actually acting, the QLearningAgent estimates + Q-Values while acting in the environment. + """ def __init__(self, alpha, epsilon, gamma, num_training): ''' - alpha - learning rate - epsilon - exploration rate - gamma - discount factor - num_training - number of training episodes, i.e. no learning after these many episodes - ''' + alpha - learning rate + epsilon - exploration rate + gamma - discount factor + num_training - number of training episodes, i.e. no learning after these many episodes + ''' self.alpha = alpha self.epsilon = epsilon self.gamma = gamma @@ -71,12 +71,43 @@ def observe_transition(self, state, action, next_state, delta_reward): self.episode_rewards += delta_reward self.update(state, action, next_state, delta_reward) + ''' + Called by env when new episode is starting + ''' def start_episode(self): - ... + self.last_state = None + self.last_action = None + self.episode_rewards = 0.0 def stop_episode(self): - ... + if self.is_in_training(): + self.accum_train_rewards += self.episode_rewards + else: + self.accum_test_rewards += self.episode_rewards + + self.episodes_so_far += 1 + + # set vars for testing + if self.is_in_testing: + self.epsilon = 0.0 # no exploration + self.alpha = 0.0 # no learning + + + def is_in_training(self): + return self.episodes_so_far < self.num_training + + def is_in_testing(self): + return not self.is_in_training + + + ''' + actionFn: Function which takes a state and returns the list of legal actions + alpha - learning rate + epsilon - exploration rate + gamma - discount factor + numTraining - number of training episodes, i.e. no learning after these many episodes + ''' def __init__(self, action_func = None, num_training = 100, epsilon = 0.5, alpha = 0.5, gamma = 1): # we should never be in this position, overwrite this later if action_func is None: