import numpy as np
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from tf_utils import create_checkpoint


class BasePolicy(object):

    def __init__(self, num_action):
        self._num_action = num_action

    def update(self, x, y, epochs=300, batch_size=32, verbose=False):
        """ Learn to select an action given a context. No-op by default; subclasses may override. """
        del x, y, epochs, batch_size, verbose

    def select_action(self, context):
        """ Take an action based on the computed scores given the context

        :param context: contexts
        :return action: one-hot vector representing an action (num_data x num_action)
        :return score: probability assigned to each action (num_data x num_action)
        """
        raise NotImplementedError


class UniformPolicy(BasePolicy):

    def __init__(self, num_action):
        super(UniformPolicy, self).__init__(num_action=num_action)

    def select_action(self, context):
        """ Returns uniformly sampled actions

        p(a | x) = 1 / K

        where K is the number of arms (actions)

        :param context: contexts
        :return action: one-hot vector representing an action (num_data x num_action)
        :return score: uniform dist prob over each action (num_data x num_action)
        """
        # get the number of samples to draw
        num_data = context.shape[0]
        # sample one action index per context uniformly at random, then one-hot encode it
        action = np.eye(self._num_action)[np.random.randint(self._num_action, size=num_data)]
        # every action gets the same score 1 / K
        score = np.tile(np.ones(self._num_action) / self._num_action, (num_data, 1))
        return action, score
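
# Illustrative example (shapes only; assumes num_action=8 and a batch of 4 contexts):
# UniformPolicy.select_action returns
#   action: (4, 8), one-hot rows such as [0, 0, 1, 0, 0, 0, 0, 0]
#   score:  (4, 8), every row equal to [0.125, 0.125, ..., 0.125]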


class DeterministicPolicy(BasePolicy):

    def __init__(self, num_action, weight_path="./model"):
        """ Deterministic Policy backed by a small neural network """
        super(DeterministicPolicy, self).__init__(num_action=num_action)

        class Model(tf.keras.Model):
            def __init__(self, num_action):
                super(Model, self).__init__()
                self.dense1 = tf.keras.layers.Dense(16, activation='relu')
                self.dense2 = tf.keras.layers.Dense(16, activation='relu')
                self.pred = tf.keras.layers.Dense(num_action, activation='softmax')

            def call(self, inputs):
                x = self.dense1(inputs)
                x = self.dense2(x)
                return self.pred(x)

        self._model = Model(num_action=num_action)
        self._optimizer = tf.compat.v1.train.AdamOptimizer()
        self._loss_fn = tf.keras.losses.CategoricalCrossentropy()
        self._global_ts = tf.compat.v1.train.get_or_create_global_step().assign(0)
        self._manager = create_checkpoint(model=self._model,
                                          optimizer=self._optimizer,
                                          model_dir=weight_path)

    def update(self, x, y, epochs=300, batch_size=32, verbose=False):
        """ Learn to select an action given a context """
        # Train only if the model hasn't been trained yet.
        # The task is easy enough that a single round of training suffices.
        if self._global_ts == 0:
            for epoch in range(epochs):
                # sample a random mini-batch (high is exclusive, so every row is reachable)
                _idx = np.random.randint(low=0, high=x.shape[0], size=batch_size)
                _x, _y = np.array(x[_idx, :], dtype=np.float32), np.array(y[_idx, :], dtype=np.float32)
                loss = self._update(x=_x, y=_y)
                if verbose:
                    print("Epoch: [{}/{}] | Loss: {}".format(epoch, epochs, loss.numpy()))
            # mark the policy as trained and persist the weights
            self._global_ts.assign_add(1)
            self._manager.save()

    @tf.function
    def _update(self, x, y):
        with tf.GradientTape() as tape:
            pred = self._model(x)
            loss = self._loss_fn(y_pred=pred, y_true=y)
        # get gradients
        grads = tape.gradient(loss, self._model.trainable_variables)
        # apply the gradients to the network
        self._optimizer.apply_gradients(zip(grads, self._model.trainable_variables))
        return loss

    def select_action(self, context):
        """ Get scores for each arm (action) given a context and compute an action """
        # get scores of each action given a context (state in RL literature)
        score = np.asarray(self._model(context))
        # get an action vector: num_action-dim vector with a 1 at the predicted label
        # TODO: think if we really need to one-hot vectorise the action vector
        action = np.eye(score.shape[1])[np.argmax(score, axis=-1)]
        return action, score
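
# Worked example of the np.eye one-hot trick above (values are made up):
# if a score row is [0.1, 0.7, 0.2], argmax gives 1, and np.eye(3)[1] yields
# the one-hot action [0., 1., 0.].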


class DeterministicPolicy2(BasePolicy):

    def __init__(self, num_action):
        """ Deterministic Policy backed by multinomial logistic regression """
        super(DeterministicPolicy2, self).__init__(num_action=num_action)
        self._model = LogisticRegression(fit_intercept=True, max_iter=2000, multi_class="multinomial")

    def update(self, x, y, epochs=300, batch_size=32, verbose=False):
        """ Learn to select an action given a context """
        del epochs, batch_size, verbose  # unused: sklearn handles the optimisation internally
        self._model.fit(x, np.argmax(y, axis=-1))

    def select_action(self, context):
        """ Get scores for each arm (action) given a context and compute an action """
        # get scores of each action given a context (state in RL literature)
        score = self._model.predict_proba(context)
        # get the predicted labels (integer indices, not one-hot)
        # TODO: think if we really need to one-hot vectorise the action vector
        action = self._model.predict(context)
        return action, score
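
# Note: predict_proba column order follows self._model.classes_; the columns line up
# with action indices 0..num_action-1 on the assumption (not checked in this script)
# that every label appears in the training split.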


if __name__ == '__main__':
    from sklearn.model_selection import train_test_split
    from data.data_manager import load_ecoli
    from tf_utils import eager_setup

    eager_setup()

    data = load_ecoli()  # [ecoli] x: (336, 7) y: (336, 8) num_label: 8
    x_train, x_test, y_train, y_test = train_test_split(data.x, data.y_onehot, test_size=0.5, random_state=1)
    label = np.argmax(y_test, axis=-1)

    print("=== Test UniformPolicy ===")
    policy = UniformPolicy(num_action=data.num_label)
    action, score = policy.select_action(context=x_test)
    pred = np.argmax(score, axis=1)
    print("Accuracy: {}".format(np.mean(pred == label)))
    print("Taken action: {} Score: {}".format(action.shape, score.shape))

    # print("=== Test DeterministicPolicy ===")
    # policy = DeterministicPolicy(num_action=data.num_label, weight_path="./model/{}".format("ecoli"))
    # policy.update(x=x_train, y=y_train, epochs=1000, batch_size=64, verbose=False)
    # action, score = policy.select_action(context=x_test)
    # pred = np.argmax(score, axis=1)
    # print("Accuracy: {}".format(np.mean(pred == label)))
    # print("Taken action: {} Score: {}".format(action.shape, score.shape))

    print("=== Test DeterministicPolicy2 ===")
    policy = DeterministicPolicy2(num_action=data.num_label)
    policy.update(x=x_train, y=y_train, epochs=1000, batch_size=64, verbose=False)
    action, score = policy.select_action(context=x_test)
    pred = np.argmax(score, axis=1)
    print("Accuracy: {}".format(np.mean(pred == label)))
    print("Taken action: {} Score: {}".format(action.shape, score.shape))