Merge pull request #5 from kcelebi/dev
Reinforcement Learning for CA implemented
kcelebi authored Sep 9, 2024
2 parents 7782f16 + 456fe71 commit a0575a1
Showing 10 changed files with 640 additions and 115 deletions.
215 changes: 215 additions & 0 deletions nb/q-learning-v0.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion requirements.txt
@@ -2,4 +2,5 @@ pytest
numpy
pandas
matplotlib
pathlib
pathlib
tqdm
76 changes: 0 additions & 76 deletions sim

This file was deleted.

5 changes: 3 additions & 2 deletions src/analysis/analysis.py
@@ -1,4 +1,5 @@
import numpy as np
import matplotlib.pyplot as plt

import sys
sys.path.append('../')
@@ -35,9 +36,9 @@ def plot_state(state):

fig, axs = plt.subplots()
if state.shape[0] > 1:
axs.imshow(~state[0], cmap = 'gray')
axs.imshow(state, cmap = 'gray')
else:
axs.imshow(~state, cmap = 'gray')
axs.imshow(state[0], cmap = 'gray')

axs.set_xticks(np.arange(len(state))+0.5)
axs.set_yticks(np.arange(len(state))+0.5)
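For orientation, here is a minimal usage sketch of the updated plot_state; the import path and the random 8x8 grid are assumptions for illustration, not part of this commit.

import numpy as np
import matplotlib.pyplot as plt

# Assumes src/ is on sys.path so that analysis.analysis is importable.
from analysis.analysis import plot_state

# A small binary CA state; shape[0] > 1, so the first branch above displays it directly.
state = np.random.randint(0, 2, size=(8, 8))
plot_state(state)
plt.show()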
127 changes: 99 additions & 28 deletions src/machine_learning/learning_agents.py
@@ -20,28 +20,28 @@ def get_action(self, state):
class ValueEstimationAgent(Agent):

"""
Abstract agent which assigns values to (state,action)
Q-Values for an environment. As well as a value to a
state and a policy given respectively by,
V(s) = max_{a in actions} Q(s,a)
policy(s) = arg_max_{a in actions} Q(s,a)
Both ValueIterationAgent and QLearningAgent inherit
from this agent. While a ValueIterationAgent has
a model of the environment via a MarkovDecisionProcess
(see mdp.py) that is used to estimate Q-Values before
ever actually acting, the QLearningAgent estimates
Q-Values while acting in the environment.
"""

def __init__(self, alpha, epsilon, gamma, num_training):
Abstract agent which assigns values to (state,action)
Q-Values for an environment. As well as a value to a
state and a policy given respectively by,
V(s) = max_{a in actions} Q(s,a)
policy(s) = arg_max_{a in actions} Q(s,a)
Both ValueIterationAgent and QLearningAgent inherit
from this agent. While a ValueIterationAgent has
a model of the environment via a MarkovDecisionProcess
(see mdp.py) that is used to estimate Q-Values before
ever actually acting, the QLearningAgent estimates
Q-Values while acting in the environment.
"""

def __init__(self, alpha = 1.0, epsilon = 0.05, gamma = 0.8, num_training = 10):
'''
alpha - learning rate
epsilon - exploration rate
gamma - discount factor
num_training - number of training episodes, i.e. no learning after these many episodes
'''
alpha - learning rate
epsilon - exploration rate
gamma - discount factor
num_training - number of training episodes, i.e. no learning after these many episodes
'''
self.alpha = alpha
self.epsilon = epsilon
self.gamma = gamma
@@ -64,33 +64,104 @@ class ReinforcementAgent(ValueEstimationAgent):
def update(self, state, action, next_state, reward):
raise NotImplementedError()


'''
Provided custom action_func -- intended to be overridden in a subclass.
Returns the possible legal actions for the CA agent.
'''
def get_legal_actions(self, state):
return self.action_func(state)

'''
Called by observe_function after we have actually transitioned to the next state
-- then we record the transition
'''
def observe_transition(self, state, action, next_state, delta_reward):
self.episode_rewards += delta_reward
self.update(state, action, next_state, delta_reward)


'''
At each point in the game, we observe the state we have just arrived at
and assess how that affects our score.
'''
def observe_function(self, state):

# skip the first observation of an episode: there is no previous state to compare against
if self.last_state is not None:
delta_reward = self.reward_func(state) - self.reward_func(self.last_state)

self.observe_transition(
state = self.last_state,
action = self.last_action,
next_state = state,
delta_reward = delta_reward
)

return state

def do_action(self, state, action):
self.last_state = state
self.last_action = action

'''
Called by env when new episode is starting
'''
def start_episode(self):
...
self.last_state = None
self.last_action = None
self.episode_rewards = 0.0

'''
Called by env at the end of an episode
'''
def stop_episode(self):
...
if self.is_in_training():
self.accum_train_rewards += self.episode_rewards
else:
self.accum_test_rewards += self.episode_rewards

self.episodes_so_far += 1

# stop learning once we enter the testing stage
if self.is_in_testing():
self.epsilon = 0.0 # no exploration
self.alpha = 0.0 # no learning

def __init__(self, action_func = None, num_training = 100, epsilon = 0.5, alpha = 0.5, gamma = 1):
# we should never be in this position, overwrite this later
if action_func is None:
action_func = lambda state: state.get_legal_actions() # not possible, state not an obj

def is_in_training(self):
return self.episodes_so_far < self.num_training

def is_in_testing(self):
return not self.is_in_training()


'''
action_func - function which takes a state and returns the list of legal actions
reward_func - function which takes a state and returns its scalar score/reward
alpha - learning rate
epsilon - exploration rate
gamma - discount factor
num_training - number of training episodes, i.e. no learning after these many episodes
'''
def __init__(self, action_func = None, reward_func = None, num_training = 100, epsilon = 0.5, alpha = 0.5, gamma = 1):

self.action_func = action_func
self.reward_func = reward_func
self.episodes_so_far = 0
self.accum_train_rewards = 0.0
self.accum_train_rewards = 0.0
self.accum_test_rewards = 0.0
self.num_training = int(num_training)
self.epsilon = float(epsilon)
self.alpha = float(alpha)
self.discount = float(gamma)


def final(self, state):
delta_reward = state.get_score() - self.last_state.get_score()
#... finish implementing later




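As a minimal sketch of how these hooks appear to fit together, the loop below drives one episode; run_episode, max_steps, and the use of State.get_successor are assumptions for illustration and are not part of this commit.

# Hypothetical driver loop for one episode, using the hooks defined above.
def run_episode(agent, initial_state, max_steps=50):
    agent.start_episode()                      # reset last_state/last_action and episode_rewards
    state = initial_state
    for _ in range(max_steps):
        state = agent.observe_function(state)  # records reward_func(state) - reward_func(last_state)
        action = agent.get_action(state)       # get_action comes from a concrete agent, e.g. QLearningAgent
        if action is None:                     # no legal actions: treat as terminal
            break
        agent.do_action(state, action)         # remember (state, action) for the next observation
        state = state.get_successor(action)    # assumes the State class added in state.py below
    agent.stop_episode()                       # accumulate rewards; learning stops after num_training episodes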
34 changes: 27 additions & 7 deletions src/machine_learning/q_learning_agents.py
@@ -4,6 +4,7 @@
sys.path.append('../')

from machine_learning.learning_agents import *
import machine_learning.util as util
from path_handler import PathHandler as PH
import sim.automata as atm
from sim.rules import Rules
@@ -14,32 +15,33 @@ class QLearningAgent(ReinforcementAgent):
def __init__(self, **args):
ReinforcementAgent.__init__(self, **args)

self.q_values = util.Counter()?
# util.Counter acts like a dict that defaults missing (state, action) keys to 0.0
self.q_values = util.Counter()


def get_Q_value(self, state, action):
"""
'''
Returns Q(state,action)
Should return 0.0 if we have never seen a state
or the Q node value otherwise
"""
'''
return self.q_values[(state, action)]

def compute_value_from_Q_values(self, state):
"""
'''
Returns max_action Q(state,action)
where the max is over legal actions. Note that if
there are no legal actions, which is the case at the
terminal state, you should return a value of 0.0.
"""
'''

legal_actions = self.get_legal_actions(state)

#Terminal
if len(legal_actions) == 0:
return 0.0

return max([self.getQValue(state, a) for a in legal_actions])
return max([self.get_Q_value(state, a) for a in legal_actions])


def compute_action_from_Q_values(self, state):
Expand All @@ -54,7 +56,10 @@ def compute_action_from_Q_values(self, state):
if len(legal_actions) == 0:
return None

best = sorted([(a, self.get_Q_value(state, a)) for a in legal_actions], key = lambda x: x[1], reverse = True)
best = sorted(
[(a, self.get_Q_value(state, a)) for a in legal_actions],
key = lambda x: x[1], reverse = True
)
return random.choice([b[0] for b in best if b[1] == best[0][1]])

def get_action(self, state):
@@ -68,3 +73,18 @@ def get_action(self, state):
return random.choice(legal_actions)
return self.compute_action_from_Q_values(state)


# update our q values here with the standard rule: Q(s,a) += alpha * (reward + gamma * V(s') - Q(s,a))
def update(self, state, action, next_state, reward):
# V(s') = max_a' Q(s', a'), via the ValueEstimationAgent interface
next_state_value = self.get_value(next_state)
old_q = self.get_Q_value(state, action)
self.q_values[(state, action)] = old_q + self.alpha * (reward + self.discount * next_state_value - old_q)


def get_policy(self, state):
return self.compute_action_from_Q_values(state)

def get_value(self, state):
return self.compute_value_from_Q_values(state)
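
The update method above applies the standard temporal-difference rule Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a)). A small hand-checked instance with made-up numbers:

# Hand-checked instance of the Q-learning update with made-up numbers.
alpha, gamma = 0.5, 0.8
q_sa = 1.0          # current Q(s, a)
reward = 2.0        # reward observed for the transition
next_value = 3.0    # max over a' of Q(s', a'), i.e. get_value(next_state)

td_target = reward + gamma * next_value        # 2.0 + 0.8 * 3.0 = 4.4
new_q_sa = q_sa + alpha * (td_target - q_sa)   # 1.0 + 0.5 * 3.4 = 2.7
assert abs(new_q_sa - 2.7) < 1e-9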


45 changes: 45 additions & 0 deletions src/machine_learning/state.py
@@ -0,0 +1,45 @@
import numpy as np

import sys
sys.path.append('../')

from path_handler import PathHandler as PH
import sim.automata as atm
from sim.rules import Rules

class State:

def __init__(self, state, rule):
self.values = state
self.rule = rule

self._shape = self.values.shape

# TODO: improve this hash; ndarray.tostring() is deprecated in favor of tobytes()
def __hash__(self):
return hash(self.values.tobytes())

def __repr__(self):
return self.values.__repr__()

def copy(self):
return State(self.values, self.rule)

@property
def shape(self):
return self.values.shape

@property
def flat(self):
return self.values.flat


'''
Generate the successor state resulting from a given action
'''
def get_successor(self, action):
new_state = self.values.copy()
new_state[action // self.values.shape[1], action % self.values.shape[1]] = 1
new_state = atm.update(new_state, rule = self.rule)

return State(new_state, rule = self.rule)
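
Note that the action here is a flat cell index: row = action // n_cols, col = action % n_cols. A short usage sketch follows; the rule name and the 3x3 grid are assumptions for illustration only.

import numpy as np

# Assumes src/ is on sys.path; the rule member below is hypothetical.
from machine_learning.state import State
from sim.rules import Rules

grid = np.zeros((3, 3), dtype=int)
s = State(grid, rule=Rules.GAME_OF_LIFE)  # substitute a real member of Rules

# Action 4 sets row 4 // 3 = 1, column 4 % 3 = 1 (the center cell) before applying the CA update.
successor = s.get_successor(4)
print(successor)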
