Reinforcement Learning for CA implemented #5

Merged
merged 8 commits into from
Sep 9, 2024
215 changes: 215 additions & 0 deletions nb/q-learning-v0.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion requirements.txt
@@ -2,4 +2,5 @@ pytest
numpy
pandas
matplotlib
pathlib
pathlib
tqdm
76 changes: 0 additions & 76 deletions sim

This file was deleted.

5 changes: 3 additions & 2 deletions src/analysis/analysis.py
@@ -1,4 +1,5 @@
import numpy as np
import matplotlib.pyplot as plt

import sys
sys.path.append('../')
@@ -35,9 +36,9 @@ def plot_state(state):

fig, axs = plt.subplots()
if state.shape[0] > 1:
axs.imshow(~state[0], cmap = 'gray')
axs.imshow(state, cmap = 'gray')
else:
axs.imshow(~state, cmap = 'gray')
axs.imshow(state[0], cmap = 'gray')

axs.set_xticks(np.arange(len(state))+0.5)
axs.set_yticks(np.arange(len(state))+0.5)
127 changes: 99 additions & 28 deletions src/machine_learning/learning_agents.py
@@ -20,28 +20,28 @@ def get_action(self, state):
class ValueEstimationAgent(Agent):

"""
Abstract agent which assigns values to (state,action)
Q-Values for an environment. As well as a value to a
state and a policy given respectively by,

V(s) = max_{a in actions} Q(s,a)
policy(s) = arg_max_{a in actions} Q(s,a)

Both ValueIterationAgent and QLearningAgent inherit
from this agent. While a ValueIterationAgent has
a model of the environment via a MarkovDecisionProcess
(see mdp.py) that is used to estimate Q-Values before
ever actually acting, the QLearningAgent estimates
Q-Values while acting in the environment.
"""

def __init__(self, alpha, epsilon, gamma, num_training):
Abstract agent which assigns values to (state,action)
Q-Values for an environment. As well as a value to a
state and a policy given respectively by,

V(s) = max_{a in actions} Q(s,a)
policy(s) = arg_max_{a in actions} Q(s,a)

Both ValueIterationAgent and QLearningAgent inherit
from this agent. While a ValueIterationAgent has
a model of the environment via a MarkovDecisionProcess
(see mdp.py) that is used to estimate Q-Values before
ever actually acting, the QLearningAgent estimates
Q-Values while acting in the environment.
"""

def __init__(self, alpha = 1.0, epsilon = 0.05, gamma = 0.8, num_training = 10):
'''
alpha - learning rate
epsilon - exploration rate
gamma - discount factor
num_training - number of training episodes, i.e. no learning after these many episodes
'''
alpha - learning rate
epsilon - exploration rate
gamma - discount factor
num_training - number of training episodes, i.e. no learning after these many episodes
'''
self.alpha = alpha
self.epsilon = epsilon
self.gamma = gamma
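
The docstring above pins down the greedy value and policy, V(s) = max_a Q(s,a) and policy(s) = argmax_a Q(s,a). A minimal sketch of both, assuming a plain dict q_values keyed by (state, action) pairs and a legal_actions list (the names here are illustrative, not part of the PR):

```python
# Sketch only: greedy value and policy derived from a Q-table.
def value(q_values, state, legal_actions):
    if not legal_actions:  # terminal state
        return 0.0
    return max(q_values.get((state, a), 0.0) for a in legal_actions)

def policy(q_values, state, legal_actions):
    if not legal_actions:
        return None
    return max(legal_actions, key=lambda a: q_values.get((state, a), 0.0))
```
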
@@ -64,33 +64,104 @@ class ReinforcementAgent(ValueEstimationAgent):
def update(self, state, action, next_state, reward):
raise NotImplementedError()


'''
Uses the custom action_func provided at construction -- intended to be
overridden in a subclass.

Returns the possible legal actions for the CA agent.
'''
def get_legal_actions(self, state):
return self.action_func(state)

'''
Called by observe_function after we have actually transitioned to the next state
-- then we have to record it
'''
def observe_transition(self, state, action, next_state, delta_reward):
self.episode_rewards += delta_reward
self.update(state, action, next_state, delta_reward)


'''
At each point in the game, we observe the state we have just arrived at
and assess how that affects our score.
'''
def observe_function(self, state):

# ensure we don't update on the first observation of an episode (no previous state yet)
if self.last_state is not None:
delta_reward = self.reward_func(state) - self.reward_func(self.last_state)

self.observe_transition(
state = self.last_state,
action = self.last_action,
next_state = state,
delta_reward = delta_reward
)

return state

def do_action(self, state, action):
self.last_state = state
self.last_action = action

'''
Called by env when new episode is starting
'''
def start_episode(self):
...
self.last_state = None
self.last_action = None
self.episode_rewards = 0.0

'''
Called by env at the end of an episode
'''
def stop_episode(self):
...
if self.is_in_training():
self.accum_train_rewards += self.episode_rewards
else:
self.accum_test_rewards += self.episode_rewards

self.episodes_so_far += 1

# stop the learning for testing stage
if self.is_in_testing():
self.epsilon = 0.0 # no exploration
self.alpha = 0.0 # no learning
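
Taken together, start_episode, observe_function, do_action and stop_episode suggest an episode loop driven from the environment side. A rough sketch, where env.reset, env.step and env.is_done are hypothetical helpers that are not part of this PR:

```python
# Sketch of an environment-side driver for the agent's episode hooks.
def run_episode(agent, env):
    agent.start_episode()
    state = env.reset()                        # hypothetical
    while not env.is_done(state):              # hypothetical
        state = agent.observe_function(state)  # records reward for the previous transition
        action = agent.get_action(state)
        agent.do_action(state, action)
        state = env.step(state, action)        # hypothetical
    agent.stop_episode()
```
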

def __init__(self, action_func = None, num_training = 100, epsilon = 0.5, alpha = 0.5, gamma = 1):
# we should never be in this position, overwrite this later
if action_func is None:
action_func = lambda state: state.get_legal_actions() # not possible, state not an obj

def is_in_training(self):
return self.episodes_so_far < self.num_training

def is_in_testing(self):
return not self.is_in_training()


'''
action_func - function which takes a state and returns the list of legal actions
reward_func - function which takes a state and returns its score/reward

alpha - learning rate
epsilon - exploration rate
gamma - discount factor
num_training - number of training episodes, i.e. no learning after these many episodes
'''
def __init__(self, action_func = None, reward_func = None, num_training = 100, epsilon = 0.5, alpha = 0.5, gamma = 1):

self.action_func = action_func
self.reward_func = reward_func
self.episodes_so_far = 0
self.accum_train_rewards = 0.0
self.accum_test_rewards = 0.0
self.num_training = int(num_training)
self.epsilon = float(epsilon)
self.alpha = float(alpha)
self.discount = float(gamma)


def final(self, state):
delta_reward = state.get_score() - self.last_state.get_score()
# ... finish implementing later (get_score is not defined on State yet)




34 changes: 27 additions & 7 deletions src/machine_learning/q_learning_agents.py
@@ -4,6 +4,7 @@
sys.path.append('../')

from machine_learning.learning_agents import *
import machine_learning.util as util
from path_handler import PathHandler as PH
import sim.automata as atm
from sim.rules import Rules
@@ -14,32 +15,33 @@ class QLearningAgent(ReinforcementAgent):
def __init__(self, **args):
ReinforcementAgent.__init__(self, **args)

self.q_values = util.Counter()?
# how are we going to implement this counter obj
self.q_values = util.Counter()


def get_Q_value(self, state, action):
"""
'''
Returns Q(state,action)
Should return 0.0 if we have never seen a state
or the Q node value otherwise
"""
'''
return self.q_values[(state, action)]
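
get_Q_value relies on util.Counter returning 0.0 for (state, action) pairs it has never seen. The util module is not shown in this diff; a minimal sketch of a Counter with that behaviour, assuming it is simply a dict subclass defaulting to zero:

```python
# Sketch only: a dict that yields 0.0 for missing keys, matching the
# "return 0.0 if we have never seen a state" contract above.
class Counter(dict):
    def __getitem__(self, key):
        return dict.__getitem__(self, key) if key in self else 0.0
```
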

def compute_value_from_Q_values(self, state):
"""
'''
Returns max_action Q(state,action)
where the max is over legal actions. Note that if
there are no legal actions, which is the case at the
terminal state, you should return a value of 0.0.
"""
'''

legal_actions = self.get_legal_actions(state)

#Terminal
if len(legal_actions) == 0:
return 0.0

return max([self.getQValue(state, a) for a in legal_actions])
return max([self.get_Q_value(state, a) for a in legal_actions])


def compute_action_from_Q_values(self, state):
@@ -54,7 +56,10 @@ def compute_action_from_Q_values(self, state):
if len(legal_actions) == 0:
return None

best = sorted([(a, self.get_Q_value(state, a)) for a in legal_actions], key = lambda x: x[1], reverse = True)
best = sorted(
[(a, self.get_Q_value(state, a)) for a in legal_actions],
key = lambda x: x[1], reverse = True
)
return random.choice([b[0] for b in best if b[1] == best[0][1]])

def get_action(self, state):
@@ -68,3 +73,18 @@ def get_action(self, state):
return random.choice(legal_actions)
return self.compute_action_from_Q_values(state)


# update our Q-values here
def update(self, state, action, next_state, reward):
# get_value (declared on the ValueEstimationAgent grandparent) gives V(s') = max_a Q(s', a)
NSQ = self.get_value(next_state)
self.q_values[(state, action)] = self.get_Q_value(state, action) + self.alpha * (reward + self.discount*NSQ - self.get_Q_value(state, action))
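
In equation form, the assignment above is the standard temporal-difference Q-learning update, with learning rate alpha and discount gamma (get_value supplies the max over next actions):

$$ Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right] $$
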


def get_policy(self, state):
return self.compute_action_from_Q_values(state)

def get_value(self, state):
return self.compute_value_from_Q_values(state)


45 changes: 45 additions & 0 deletions src/machine_learning/state.py
@@ -0,0 +1,45 @@
import numpy as np

import sys
sys.path.append('../')

from path_handler import PathHandler as PH
import sim.automata as atm
from sim.rules import Rules

class State:

def __init__(self, state, rule):
self.values = state
self.rule = rule

self._shape = self.values.shape

# TODO: replace with a sturdier hash; tostring() is deprecated, tobytes() is its replacement
def __hash__(self):
return hash(self.values.tobytes())
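
State defines __hash__ but not __eq__, so two State objects wrapping identical grids would still compare unequal when used as Q-table keys. A possible companion __eq__ (an illustrative addition, not part of this PR):

```python
# Sketch only: an __eq__ consistent with __hash__, so equal grids
# look up the same Q-table entry instead of comparing by identity.
def __eq__(self, other):
    return isinstance(other, State) and np.array_equal(self.values, other.values)
```
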

def __repr__(self):
return self.values.__repr__()

def copy(self):
return State(self.values, self.rule)

@property
def shape(self):
return self.values.shape

@property
def flat(self):
return self.values.flat


'''
Generate the successor state that results from taking the given action
'''
def get_successor(self, action):
new_state = self.values.copy()
new_state[action // self.values.shape[1], action % self.values.shape[1]] = 1
new_state = atm.update(new_state, rule = self.rule)

return State(new_state, rule = self.rule)
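
A short usage sketch of get_successor: a flat action index k maps to cell (k // width, k % width), which is switched on before the CA rule is applied. The grid size and Rules member below are illustrative assumptions:

```python
# Illustrative usage only; the specific Rules member name is an assumption.
initial = State(np.zeros((3, 3), dtype=int), rule=Rules.GAME_OF_LIFE)
successor = initial.get_successor(action=4)  # activates the centre cell (1, 1)
print(successor.shape)                       # (3, 3)
```
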