Move agents from rlberry #4

Merged 2 commits on Nov 6, 2023
4 changes: 4 additions & 0 deletions rlberry_scool/agents/__init__.py
@@ -0,0 +1,4 @@
# Interfaces
from .dynprog import ValueIterationAgent
from .mbqvi import MBQVIAgent
from .ucbvi import UCBVIAgent
1 change: 1 addition & 0 deletions rlberry_scool/agents/dynprog/__init__.py
@@ -0,0 +1 @@
from .value_iteration import ValueIterationAgent
272 changes: 272 additions & 0 deletions rlberry_scool/agents/dynprog/utils.py
@@ -0,0 +1,272 @@
import numpy as np
from rlberry.utils.jit_setup import numba_jit


@numba_jit
def backward_induction(R, P, horizon, gamma=1.0, vmax=np.inf):
"""Backward induction to compute Q and V functions in the finite horizon
setting.

Parameters
----------
R : numpy.ndarray
array of shape (S, A) contaning the rewards, where S is the number
of states and A is the number of actions
P : numpy.ndarray
array of shape (S, A, S) such that P[s,a,ns] is the probability of
arriving at ns by taking action a in state s.
horizon : int
problem horizon
gamma : double, default: 1.0
discount factor
vmax : double, default: np.inf
maximum possible value in V

Returns
--------
tuple (Q, V) containing the Q and V functions, of shapes (horizon, S, A)
and (horizon, S), respectively.
"""
S, A = R.shape
V = np.zeros((horizon, S))
Q = np.zeros((horizon, S, A))
for hh in range(horizon - 1, -1, -1):
for ss in range(S):
max_q = -np.inf
for aa in range(A):
q_aa = R[ss, aa]
if hh < horizon - 1:
# not using .dot instead of loop to avoid scipy dependency
# (numba seems to require scipy for linear
# algebra operations in numpy)
for ns in range(S):
q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns]
if q_aa > max_q:
max_q = q_aa
Q[hh, ss, aa] = q_aa
V[hh, ss] = max_q
if V[hh, ss] > vmax:
V[hh, ss] = vmax
return Q, V


@numba_jit
def backward_induction_reward_sd(Q, V, R, P, gamma=1.0, vmax=np.inf):
"""
Backward induction to compute Q and V functions in
the finite horizon setting.

Assumes R is stage-dependent, but P is stage-independent.

Takes as input the arrays where to store Q and V.

Parameters
----------
Q: numpy.ndarray
array of shape (horizon, S, A) where to store the Q function
V: numpy.ndarray
array of shape (horizon, S) where to store the V function
R : numpy.ndarray
array of shape (horizon, S, A) contaning the rewards, where S is the number
of states and A is the number of actions
P : numpy.ndarray
array of shape (S, A, S) such that P[s,a,ns] is the probability of
arriving at ns by taking action a in state s.
horizon : int
problem horizon
gamma : double
discount factor, default = 1.0
vmax : double
maximum possible value in V
default = np.inf
"""
H, S, A = R.shape
horizon = H
for hh in range(horizon - 1, -1, -1):
for ss in range(S):
max_q = -np.inf
for aa in range(A):
q_aa = R[hh, ss, aa]
if hh < horizon - 1:
# not using .dot instead of loop to avoid scipy dependency
# (numba seems to require scipy for linear algebra
# operations in numpy)
for ns in range(S):
q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns]
if q_aa > max_q:
max_q = q_aa
Q[hh, ss, aa] = q_aa
V[hh, ss] = max_q
if V[hh, ss] > vmax:
V[hh, ss] = vmax


@numba_jit
def backward_induction_in_place(Q, V, R, P, horizon, gamma=1.0, vmax=np.inf):
"""
Backward induction to compute Q and V functions in
the finite horizon setting.
Takes as input the arrays where to store Q and V.

Parameters
----------
Q: numpy.ndarray
array of shape (horizon, S, A) where to store the Q function
V: numpy.ndarray
array of shape (horizon, S) where to store the V function
R : numpy.ndarray
array of shape (S, A) contaning the rewards, where S is the number
of states and A is the number of actions
P : numpy.ndarray
array of shape (S, A, S) such that P[s,a,ns] is the probability of
arriving at ns by taking action a in state s.
horizon : int
problem horizon
gamma : double
discount factor, default = 1.0
vmax : double
maximum possible value in V
default = np.inf
"""
S, A = R.shape
for hh in range(horizon - 1, -1, -1):
for ss in range(S):
max_q = -np.inf
for aa in range(A):
q_aa = R[ss, aa]
if hh < horizon - 1:
# not using .dot instead of loop to avoid scipy dependency
# (numba seems to require scipy for linear algebra
# operations in numpy)
for ns in range(S):
q_aa += gamma * P[ss, aa, ns] * V[hh + 1, ns]
if q_aa > max_q:
max_q = q_aa
Q[hh, ss, aa] = q_aa
V[hh, ss] = max_q
if V[hh, ss] > vmax:
V[hh, ss] = vmax


@numba_jit
def backward_induction_sd(Q, V, R, P, gamma=1.0, vmax=np.inf):
"""
In-place implementation of backward induction to compute Q and V functions
in the finite horizon setting.

Assumes R and P are stage-dependent.

Parameters
----------
Q: numpy.ndarray
array of shape (H, S, A) where to store the Q function
V: numpy.ndarray
array of shape (H, S) where to store the V function
R : numpy.ndarray
array of shape (H, S, A) contaning the rewards, where S is the number
of states and A is the number of actions
P : numpy.ndarray
array of shape (H, S, A, S) such that P[h, s, a, ns] is the probability of
arriving at ns by taking action a in state s at stage h.
gamma : double, default: 1.0
discount factor
vmax : double, default: np.inf
maximum possible value in V

"""
H, S, A = R.shape
for hh in range(H - 1, -1, -1):
for ss in range(S):
max_q = -np.inf
for aa in range(A):
q_aa = R[hh, ss, aa]
if hh < H - 1:
# not using .dot instead of loop to avoid scipy dependency
# (numba seems to require scipy for linear
# algebra operations in numpy)
for ns in range(S):
q_aa += gamma * P[hh, ss, aa, ns] * V[hh + 1, ns]
if q_aa > max_q:
max_q = q_aa
Q[hh, ss, aa] = q_aa
V[hh, ss] = max_q
# clip V
if V[hh, ss] > vmax:
V[hh, ss] = vmax


@numba_jit
def value_iteration(R, P, gamma, epsilon=1e-6):
"""
Value iteration for discounted problems.

Parameters
----------
R : numpy.ndarray
array of shape (S, A) contaning the rewards, where S is the number
of states and A is the number of actions
P : numpy.ndarray
array of shape (S, A, S) such that P[s,a,ns] is the probability of
arriving at ns by taking action a in state s.
gamma : double
discount factor
epsilon : double
precision

Returns
--------
tuple (Q, V, n_it) containing the epsilon-optimal Q and V functions,
of shapes (S, A) and (S,), respectively, and n_it, the number of iterations
"""
S, A = R.shape
Q = np.zeros((S, A))
Q_aux = np.full((S, A), np.inf)
n_it = 0
while np.abs(Q - Q_aux).max() > epsilon:
Q_aux = Q
Q = bellman_operator(Q, R, P, gamma)
n_it += 1
V = np.zeros(S)
# numba does not support np.max(Q, axis=1)
for ss in range(S):
V[ss] = Q[ss, :].max()
return Q, V, n_it


@numba_jit
def bellman_operator(Q, R, P, gamma):
"""
Bellman optimality operator for Q functions

Parameters
----------
Q : numpy.ndarray
array of shape (S, A) containing the Q function to which apply
the operator
R : numpy.ndarray
array of shape (S, A) contaning the rewards, where S is the number
of states and A is the number of actions
P : numpy.ndarray
array of shape (S, A, S) such that P[s,a,ns] is the probability of
arriving at ns by taking action a in state s.
gamma : double
discount factor

Returns
--------
TQ, array of shape (S, A) containing the result of the Bellman operator
applied to the input Q
"""
S, A = Q.shape
TQ = np.zeros((S, A))
V = np.zeros(S)
# numba does not support np.max(Q, axis=1)
for ss in range(S):
V[ss] = Q[ss, :].max()
#
for ss in range(S):
for aa in range(A):
TQ[ss, aa] = R[ss, aa]
for ns in range(S):
TQ[ss, aa] += gamma * P[ss, aa, ns] * V[ns]
return TQ
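
For reference, here is a minimal usage sketch of the routines above on a small randomly generated MDP. It assumes rlberry is installed (for the numba_jit decorator used by these utils); the toy arrays, seed, and variable names are illustrative only and not part of the package.

import numpy as np

from rlberry_scool.agents.dynprog.utils import (
    backward_induction,
    backward_induction_sd,
    value_iteration,
)

rng = np.random.default_rng(0)
S, A, H = 5, 2, 10  # number of states, number of actions, horizon

# Random rewards in [0, 1] and a random transition kernel with rows summing to 1.
R = rng.uniform(size=(S, A))
P = rng.uniform(size=(S, A, S))
P = P / P.sum(axis=-1, keepdims=True)

# Finite-horizon solution: Q has shape (H, S, A), V has shape (H, S).
Q_fh, V_fh = backward_induction(R, P, H)

# Discounted solution: Q has shape (S, A), V has shape (S,).
Q_disc, V_disc, n_it = value_iteration(R, P, 0.95, 1e-6)

# Stage-dependent, in-place variant: preallocate Q and V and pass
# stage-dependent rewards and transitions.
R_sd = rng.uniform(size=(H, S, A))
P_sd = rng.uniform(size=(H, S, A, S))
P_sd = P_sd / P_sd.sum(axis=-1, keepdims=True)
Q_sd = np.zeros((H, S, A))
V_sd = np.zeros((H, S))
backward_induction_sd(Q_sd, V_sd, R_sd, P_sd)
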
82 changes: 82 additions & 0 deletions rlberry_scool/agents/dynprog/value_iteration.py
@@ -0,0 +1,82 @@
from rlberry.agents.agent import AgentWithSimplePolicy
from rlberry_scool.agents.dynprog.utils import backward_induction, value_iteration
from rlberry.envs.finite.finite_mdp import FiniteMDP


class ValueIterationAgent(AgentWithSimplePolicy):
"""
Value iteration for enviroments of type FiniteMDP
(rlberry.envs.finite.finite_mdp.FiniteMDP)

Important: the discount gamma is also used if the problem is
finite horizon, but, in this case, gamma can be set to 1.0.

Parameters
-----------
env : rlberry.envs.finite.finite_mdp.FiniteMDP
Environment used to fit the agent.
gamma : double
Discount factor in [0, 1]
horizon : int
Horizon, if the problem is finite-horizon. if None, the discounted
problem is solved
default = None
epsilon : double
Precision of value iteration, only used in discounted problems
(when horizon is None).

"""

name = "ValueIteration"

def __init__(self, env, gamma=0.95, horizon=None, epsilon=1e-6, **kwargs):
AgentWithSimplePolicy.__init__(self, env, **kwargs)

# initialize base class
assert isinstance(
self.env, FiniteMDP
), "Value iteration requires a FiniteMDP model."
#

self.gamma = gamma # attribute gamma

self.horizon = horizon
self.epsilon = epsilon

# value functions
self.Q = None
self.V = None

def fit(self, budget=None, **kwargs):
"""
Run value iteration.

Parameters
----------
budget: None
Not used. Only defined for compatibility purpose with rlberry.
Changing `budget` value has no effect.
"""
del kwargs
info = {}
if self.horizon is None:
assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0"
self.Q, self.V, n_it = value_iteration(
self.env.R, self.env.P, self.gamma, self.epsilon
)
info["n_iterations"] = n_it
info["precision"] = self.epsilon
else:
self.Q, self.V = backward_induction(
self.env.R, self.env.P, self.horizon, self.gamma
)
info["n_iterations"] = self.horizon
info["precision"] = 0.0
return info

def policy(self, observation):
state = observation
if self.horizon is None:
return self.Q[state, :].argmax()
else:
return self.Q[0, state, :].argmax()
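
A short usage sketch of the agent, assuming rlberry is installed; building a FiniteMDP directly from reward and transition arrays as below is an assumption for illustration (constructor signature not confirmed here), as are the toy arrays and seed.

import numpy as np

from rlberry.envs.finite.finite_mdp import FiniteMDP
from rlberry_scool.agents import ValueIterationAgent

rng = np.random.default_rng(42)
S, A = 4, 2

# Toy MDP: rewards in [0, 1], transition kernel rows summing to 1.
R = rng.uniform(size=(S, A))
P = rng.uniform(size=(S, A, S))
P = P / P.sum(axis=-1, keepdims=True)

env = FiniteMDP(R, P)  # assumed constructor: FiniteMDP(R, P)

# Discounted setting (horizon=None requires gamma < 1.0).
agent = ValueIterationAgent(env, gamma=0.95, epsilon=1e-6)
info = agent.fit()
print(info["n_iterations"], agent.policy(0))

# Finite-horizon setting: gamma may be set to 1.0 when a horizon is given.
agent_fh = ValueIterationAgent(env, gamma=1.0, horizon=10)
agent_fh.fit()
print(agent_fh.policy(0))
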
1 change: 1 addition & 0 deletions rlberry_scool/agents/mbqvi/__init__.py
@@ -0,0 +1 @@
from .mbqvi import MBQVIAgent