# train.py
from mnist_env import MNISTEnv
from actor_critic_agent import MNISTNet, ActorCriticNNAgent
import numpy as np
import time
import argparse
import sys
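
# Example invocation (a minimal sketch; the flag values below are
# illustrative, not defaults defined in this repo):
#
#   python train.py --iters 100 --batch_size 10 --verbose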

def main():
    '''Parse command-line arguments, then train, test, and evaluate an agent.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, required=True)
    parser.add_argument('--iters', type=int, required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args(sys.argv[1:])

    print("Training...")
    trained_agent = train(args.iters, args.batch_size, verbose=args.verbose)
    test_agent = trained_agent.copy()

    print("Testing...")
    test(test_agent)

    print("Evaluating...")
    for _ in range(5):
        eval(test_agent)

def train(iterations, episodes, verbose=False):
    '''Train a model for `iterations` iterations of `episodes` episodes each.
    Currently fixed on using ActorCritic; will change when updated to use the
    modular DeepRL env.'''

    def obs_to_input(obs):
        # reshape observation to (1, 28, 28)
        return obs[np.newaxis, ...]

    # initialize agent
    agent = ActorCriticNNAgent(MNISTNet, obs_to_input=obs_to_input, df=0.1)

    # initialize environment
    env = MNISTEnv(type='train', seed=None)

    # training loop
    start = time.time()
    for iter in range(iterations):
        if iter % 10 == 0: print("Starting iteration %d" % iter)
        rewards = []

        # play out each episode
        for ep in range(episodes):
            # render the first episode of every 10th iteration when verbose
            if verbose and iter % 10 == 0 and ep == 0:
                display = True
            else:
                display = False

            observation = env.reset()
            agent.new_episode()
            total_reward = 0
            done = False

            while not done:
                action = agent.act(observation, env, display=display)
                observation, reward, done, info = env.step(action)
                if display: print("Actual reward:", reward)
                agent.store_reward(reward)
                total_reward += reward

            rewards.append(total_reward)
            if display: env.render()

        # adjust agent parameters based on played episodes
        agent.update()

        # print performance for this iteration
        if iter % 10 == 0:
            print("Mean total reward / episode: %.3f" % np.mean(rewards))

    end = time.time()
    print("Completed %d iterations of %d episodes in %.3f s" %
          (iterations, episodes, end - start))

    # return trained agent
    return agent

def eval(agent, n_test=1000):
    # evaluate a trained agent on a single rendered test episode
    env = MNISTEnv(type='test', seed=None)

    observation = env.reset()
    done = False
    while not done:
        action = agent.act(observation, env, display=True)
        observation, reward, done, info = env.step(action)
        print("Received reward %.1f on step %d" % (reward, env.steps))
        env.render()

def test(agent, n_test=1000):
    # calculate test average reward
    env = MNISTEnv(type='test', seed=None)

    rewards = []
    for _ in range(n_test):
        observation = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = agent.act(observation, env, display=False)
            observation, reward, done, info = env.step(action)
            total_reward += reward
        rewards.append(total_reward)

    print("Mean total reward / episode: %.3f" % np.mean(rewards))

if __name__ == '__main__':
    main()