# thompson.py
import numpy as np


class ThompsonBandit:
    """Linear Thompson sampling over a fixed set of three discrete actions."""

    def __init__(self, feature_dim, T):
        # Exploration scale from Agrawal & Goyal's linear Thompson sampling:
        # v = R * sqrt(24 * d * ln(1/delta) / epsilon). These constants could
        # be moved into a config.
        R = 1
        epsilon = 1 / np.log(T)
        delta = 0.99995
        self.v = R * np.sqrt(24 * feature_dim * np.log(1 / delta) / epsilon)

        # Per-action posterior state: mean mu, reward-weighted feature sum f,
        # and regularized design matrix B = I + sum of x x^T.
        self.mu = [np.zeros(feature_dim) for _ in range(3)]
        self.f = [np.zeros(feature_dim) for _ in range(3)]
        self.B = [np.identity(feature_dim) for _ in range(3)]

    def _get_context(self, context):
        # Flatten the context to a 1-D vector; no normalization is applied.
        return np.squeeze(context)

    def update(self, context, action, reward):
        # Rank-one update of the chosen arm's design matrix and targets,
        # then recompute the posterior mean mu = B^-1 f.
        context = self._get_context(context)
        self.B[action] = self.B[action] + np.outer(context, context)
        self.f[action] = self.f[action] + context * reward
        self.mu[action] = np.linalg.inv(self.B[action]) @ self.f[action]

    def print_weights(self):
        print(self.mu)
        print(self.f)
        print(self.B)

    def predict_no_update(self, context, history):
        # Thompson step: for each arm, draw a parameter sample from the
        # posterior N(mu, v^2 * B^-1) and pick the arm whose sampled
        # parameters score the context highest.
        context = self._get_context(context)
        results = []
        for a in range(3):
            mu_samp = np.random.multivariate_normal(
                self.mu[a], (self.v ** 2) * np.linalg.inv(self.B[a])
            )
            results.append(np.dot(context, mu_samp))
        return np.argmax(results)

    def predict(self, context, history):
        # Given the current context vector and the past history as a list of
        # (context, action, reward) tuples, fold in the most recent
        # observation and return an action.
        if len(history) > 0:
            self.update(*history[-1])
        return self.predict_no_update(context, history)


class WarfarinThompsonSeparate(ThompsonBandit):
    def __init__(self):
        # 8 context features; T = 4386 is the horizon passed to the base class.
        super().__init__(8, 4386)
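

# A minimal usage sketch, not part of the original module: it simulates a
# three-armed linear bandit with hypothetical ground-truth weights
# (`true_theta` is an assumption made up for illustration) to smoke-test
# the sampler end to end via the same predict/history interface used above.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    feature_dim, T = 8, 500
    true_theta = rng.normal(size=(3, feature_dim))  # hypothetical arm weights

    bandit = ThompsonBandit(feature_dim, T)
    history = []
    optimal = 0
    for t in range(T):
        context = rng.normal(size=feature_dim)
        action = bandit.predict(context, history)
        # Reward: the chosen arm's linear score plus Gaussian noise.
        reward = true_theta[action] @ context + 0.1 * rng.normal()
        history.append((context, action, reward))
        optimal += int(action == np.argmax(true_theta @ context))
    print(f"fraction of optimal actions: {optimal / T:.3f}")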