Reinforcement Learning for CA implemented #5

Merged
merged 8 commits into from
Sep 9, 2024
215 changes: 215 additions & 0 deletions nb/q-learning-v0.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion requirements.txt
@@ -2,4 +2,5 @@ pytest
numpy
pandas
matplotlib
pathlib
pathlib
tqdm
76 changes: 0 additions & 76 deletions sim

This file was deleted.

5 changes: 3 additions & 2 deletions src/analysis/analysis.py
@@ -1,4 +1,5 @@
import numpy as np
import matplotlib.pyplot as plt

import sys
sys.path.append('../')
@@ -35,9 +36,9 @@ def plot_state(state):

fig, axs = plt.subplots()
if state.shape[0] > 1:
axs.imshow(~state[0], cmap = 'gray')
axs.imshow(state, cmap = 'gray')
else:
axs.imshow(~state, cmap = 'gray')
axs.imshow(state[0], cmap = 'gray')

axs.set_xticks(np.arange(len(state))+0.5)
axs.set_yticks(np.arange(len(state))+0.5)
127 changes: 99 additions & 28 deletions src/machine_learning/learning_agents.py
@@ -20,28 +20,28 @@ def get_action(self, state):
class ValueEstimationAgent(Agent):

"""
Abstract agent which assigns values to (state,action)
Q-Values for an environment. As well as a value to a
state and a policy given respectively by,

V(s) = max_{a in actions} Q(s,a)
policy(s) = arg_max_{a in actions} Q(s,a)

Both ValueIterationAgent and QLearningAgent inherit
from this agent. While a ValueIterationAgent has
a model of the environment via a MarkovDecisionProcess
(see mdp.py) that is used to estimate Q-Values before
ever actually acting, the QLearningAgent estimates
Q-Values while acting in the environment.
"""

def __init__(self, alpha, epsilon, gamma, num_training):
Abstract agent which assigns values to (state,action)
Q-Values for an environment. As well as a value to a
state and a policy given respectively by,

V(s) = max_{a in actions} Q(s,a)
policy(s) = arg_max_{a in actions} Q(s,a)

Both ValueIterationAgent and QLearningAgent inherit
from this agent. While a ValueIterationAgent has
a model of the environment via a MarkovDecisionProcess
(see mdp.py) that is used to estimate Q-Values before
ever actually acting, the QLearningAgent estimates
Q-Values while acting in the environment.
"""

def __init__(self, alpha = 1.0, epsilon = 0.05, gamma = 0.8, num_training = 10):
'''
alpha - learning rate
epsilon - exploration rate
gamma - discount factor
num_training - number of training episodes, i.e. no learning after these many episodes
'''
alpha - learning rate
epsilon - exploration rate
gamma - discount factor
num_training - number of training episodes, i.e. no learning after these many episodes
'''
self.alpha = alpha
self.epsilon = epsilon
self.gamma = gamma
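
The docstring above pins down the greedy value and policy, V(s) = max_a Q(s,a) and policy(s) = argmax_a Q(s,a). A minimal sketch of both, assuming a plain dict q_values keyed by (state, action) pairs and a legal_actions list (the names here are illustrative, not part of the PR):

```python
# Sketch only: greedy value and policy derived from a Q-table.
def value(q_values, state, legal_actions):
    if not legal_actions:  # terminal state
        return 0.0
    return max(q_values.get((state, a), 0.0) for a in legal_actions)

def policy(q_values, state, legal_actions):
    if not legal_actions:
        return None
    return max(legal_actions, key=lambda a: q_values.get((state, a), 0.0))
```
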
@@ -64,33 +64,104 @@ class ReinforcementAgent(ValueEstimationAgent):
def update(self, state, action, next_state, reward):
raise NotImplementedError()


'''
Uses the custom action_func provided at construction -- intended to be
overridden in a subclass.

Returns the possible legal actions for the CA agent.
'''
def get_legal_actions(self, state):
return self.action_func(state)

'''
Called by observe_function after we have actually transitioned to the next state
-- then we have to record it
'''
def observe_transition(self, state, action, next_state, delta_reward):
self.episode_rewards += delta_reward
self.update(state, action, next_state, delta_reward)


'''
At each point in the game, we observe the state we have just arrived at
and assess how that affects our score.
'''
def observe_function(self, state):

# ensure we don't update on the first observation of an episode (no previous state yet)
if self.last_state is not None:
delta_reward = self.reward_func(state) - self.reward_func(self.last_state)

self.observe_transition(
state = self.last_state,
action = self.last_action,
next_state = state,
delta_reward = delta_reward
)

return state

def do_action(self, state, action):
self.last_state = state
self.last_action = action

'''
Called by env when new episode is starting
'''
def start_episode(self):
...
self.last_state = None
self.last_action = None
self.episode_rewards = 0.0

'''
Called by env at the end of an episode
'''
def stop_episode(self):
...
if self.is_in_training():
self.accum_train_rewards += self.episode_rewards
else:
self.accum_test_rewards += self.episode_rewards

self.episodes_so_far += 1

# stop the learning for testing stage
if self.is_in_testing():
self.epsilon = 0.0 # no exploration
self.alpha = 0.0 # no learning
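
Taken together, start_episode, observe_function, do_action and stop_episode suggest an episode loop driven from the environment side. A rough sketch, where env.reset, env.step and env.is_done are hypothetical helpers that are not part of this PR:

```python
# Sketch of an environment-side driver for the agent's episode hooks.
def run_episode(agent, env):
    agent.start_episode()
    state = env.reset()                        # hypothetical
    while not env.is_done(state):              # hypothetical
        state = agent.observe_function(state)  # records reward for the previous transition
        action = agent.get_action(state)
        agent.do_action(state, action)
        state = env.step(state, action)        # hypothetical
    agent.stop_episode()
```
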

def __init__(self, action_func = None, num_training = 100, epsilon = 0.5, alpha = 0.5, gamma = 1):
# we should never be in this position, overwrite this later
if action_func is None:
action_func = lambda state: state.get_legal_actions() # not possible, state not an obj

def is_in_training(self):
return self.episodes_so_far < self.num_training

def is_in_testing(self):
return not self.is_in_training()


'''
action_func - function which takes a state and returns the list of legal actions
reward_func - function which takes a state and returns its score/reward

alpha - learning rate
epsilon - exploration rate
gamma - discount factor
num_training - number of training episodes, i.e. no learning after these many episodes
'''
def __init__(self, action_func = None, reward_func = None, num_training = 100, epsilon = 0.5, alpha = 0.5, gamma = 1):

self.action_func = action_func
self.reward_func = reward_func
self.episodes_so_far = 0
self.accum_train_rewards = 0.0
self.accum_test_rewards = 0.0
self.num_training = int(num_training)
self.epsilon = float(epsilon)
self.alpha = float(alpha)
self.discount = float(gamma)


def final(self, state):
delta_reward = state.get_score() - self.last_state.get_score()
# ... finish implementing later (get_score is not defined on State yet)




34 changes: 27 additions & 7 deletions src/machine_learning/q_learning_agents.py
@@ -4,6 +4,7 @@
sys.path.append('../')

from machine_learning.learning_agents import *
import machine_learning.util as util
from path_handler import PathHandler as PH
import sim.automata as atm
from sim.rules import Rules
@@ -14,32 +15,33 @@ class QLearningAgent(ReinforcementAgent):
def __init__(self, **args):
ReinforcementAgent.__init__(self, **args)

self.q_values = util.Counter()?
# how are we going to implement this counter obj
self.q_values = util.Counter()


def get_Q_value(self, state, action):
"""
'''
Returns Q(state,action)
Should return 0.0 if we have never seen a state
or the Q node value otherwise
"""
'''
return self.q_values[(state, action)]
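
get_Q_value relies on util.Counter returning 0.0 for (state, action) pairs it has never seen. The util module is not shown in this diff; a minimal sketch of a Counter with that behaviour, assuming it is simply a dict subclass defaulting to zero:

```python
# Sketch only: a dict that yields 0.0 for missing keys, matching the
# "return 0.0 if we have never seen a state" contract above.
class Counter(dict):
    def __getitem__(self, key):
        return dict.__getitem__(self, key) if key in self else 0.0
```
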

def compute_value_from_Q_values(self, state):
"""
'''
Returns max_action Q(state,action)
where the max is over legal actions. Note that if
there are no legal actions, which is the case at the
terminal state, you should return a value of 0.0.
"""
'''

legal_actions = self.get_legal_actions(state)

#Terminal
if len(legal_actions) == 0:
return 0.0

return max([self.getQValue(state, a) for a in legal_actions])
return max([self.get_Q_value(state, a) for a in legal_actions])


def compute_action_from_Q_values(self, state):
@@ -54,7 +56,10 @@ def compute_action_from_Q_values(self, state):
if len(legal_actions) == 0:
return None

best = sorted([(a, self.get_Q_value(state, a)) for a in legal_actions], key = lambda x: x[1], reverse = True)
best = sorted(
[(a, self.get_Q_value(state, a)) for a in legal_actions],
key = lambda x: x[1], reverse = True
)
return random.choice([b[0] for b in best if b[1] == best[0][1]])

def get_action(self, state):
@@ -68,3 +73,18 @@ def get_action(self, state):
return random.choice(legal_actions)
return self.compute_action_from_Q_values(state)


# update our Q-values here
def update(self, state, action, next_state, reward):
# get_value (declared on the ValueEstimationAgent grandparent) gives V(s') = max_a Q(s', a)
NSQ = self.get_value(next_state)
self.q_values[(state, action)] = self.get_Q_value(state, action) + self.alpha * (reward + self.discount*NSQ - self.get_Q_value(state, action))
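
In equation form, the assignment above is the standard temporal-difference Q-learning update, with learning rate alpha and discount gamma (get_value supplies the max over next actions):

$$ Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right] $$
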


def get_policy(self, state):
return self.compute_action_from_Q_values(state)

def get_value(self, state):
return self.compute_value_from_Q_values(state)


45 changes: 45 additions & 0 deletions src/machine_learning/state.py
@@ -0,0 +1,45 @@
import numpy as np

import sys
sys.path.append('../')

from path_handler import PathHandler as PH
import sim.automata as atm
from sim.rules import Rules

class State:

def __init__(self, state, rule):
self.values = state
self.rule = rule

self._shape = self.values.shape

# TODO: replace with a sturdier hash; tostring() is deprecated, tobytes() is its replacement
def __hash__(self):
return hash(self.values.tobytes())
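
State defines __hash__ but not __eq__, so two State objects wrapping identical grids would still compare unequal when used as Q-table keys. A possible companion __eq__ (an illustrative addition, not part of this PR):

```python
# Sketch only: an __eq__ consistent with __hash__, so equal grids
# look up the same Q-table entry instead of comparing by identity.
def __eq__(self, other):
    return isinstance(other, State) and np.array_equal(self.values, other.values)
```
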

def __repr__(self):
return self.values.__repr__()

def copy(self):
return State(self.values, self.rule)

@property
def shape(self):
return self.values.shape

@property
def flat(self):
return self.values.flat


'''
Generate the successor state that results from taking the given action
'''
def get_successor(self, action):
new_state = self.values.copy()
new_state[action // self.values.shape[1], action % self.values.shape[1]] = 1
new_state = atm.update(new_state, rule = self.rule)

return State(new_state, rule = self.rule)
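
A short usage sketch of get_successor: a flat action index k maps to cell (k // width, k % width), which is switched on before the CA rule is applied. The grid size and Rules member below are illustrative assumptions:

```python
# Illustrative usage only; the specific Rules member name is an assumption.
initial = State(np.zeros((3, 3), dtype=int), rule=Rules.GAME_OF_LIFE)
successor = initial.get_successor(action=4)  # activates the centre cell (1, 1)
print(successor.shape)                       # (3, 3)
```
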