# main.py -- forked from vy007vikas/PyTorch-ActorCriticRL
# (GitHub page chrome and the line-number gutter captured with this copy were
# removed; the page listed the file as 75 lines (59 loc), 1.64 KB.)
from __future__ import division
import gym
import numpy as np
import torch
from torch.autograd import Variable
import os
import psutil
import gc
import train
import buffer
# ---------------------------------------------------------------------------
# Environment, hyper-parameters and learner construction.
# ---------------------------------------------------------------------------

# Gym environment the agent interacts with; the alternative task is kept
# below for quick switching.
env = gym.make('BipedalWalker-v2')
# env = gym.make('Pendulum-v0')

MAX_EPISODES = 5000      # number of episodes the training loop runs
MAX_STEPS = 1000         # upper bound on environment steps per episode
MAX_BUFFER = 1000000     # size argument handed to buffer.MemoryBuffer
MAX_TOTAL_REWARD = 300   # not referenced by the code shown in this script

# Sizes are read from the first axis of the observation/action spaces.
S_DIM = env.observation_space.shape[0]
A_DIM = env.action_space.shape[0]
# Upper bound of the first action component; handed to the trainer below.
# NOTE(review): presumably every action component shares this bound - confirm.
A_MAX = env.action_space.high[0]

# Each message is pre-formatted into ONE string so the call prints the same
# text under the Python 2 print statement and the Python 3 print function
# (the original `print a, b` form is a SyntaxError on Python 3).
print(' State Dimensions :-  {}'.format(S_DIM))
print(' Action Dimensions :-  {}'.format(A_DIM))
print(' Action Max :-  {}'.format(A_MAX))

# Replay memory and the trainer that reads from it.
ram = buffer.MemoryBuffer(MAX_BUFFER)
trainer = train.Trainer(S_DIM, A_DIM, A_MAX, ram)
# ---------------------------------------------------------------------------
# Training loop: run MAX_EPISODES episodes of at most MAX_STEPS steps each,
# storing transitions in `ram` and calling trainer.optimize() after each step.
#
# NOTE(review): the nesting below was reconstructed from the statement order,
# because the indentation was lost in this copy of the file. Two placements
# should be confirmed against the upstream script:
#   * ram.add(...) sits inside the `else` branch, so the terminal transition
#     (new_state is None) is never stored in the buffer;
#   * gc.collect() runs once per episode, after the step loop.
# ---------------------------------------------------------------------------
try:
    for _ep in range(MAX_EPISODES):
        observation = env.reset()
        # Single pre-formatted argument: same output on Python 2 and 3.
        print('EPISODE :-  {}'.format(_ep))

        for _step in range(MAX_STEPS):
            env.render()
            state = np.float32(observation)

            action = trainer.get_exploration_action(state)
            # if _ep%5 == 0:
            #     # validate every 5th episode
            #     action = trainer.get_exploitation_action(state)
            # else:
            #     # get action based on observation, use exploration policy here
            #     action = trainer.get_exploration_action(state)

            new_observation, reward, done, _info = env.step(action)

            # # dont update if this is validation
            # if _ep%50 == 0 or _ep>450:
            #     continue

            if done:
                new_state = None
            else:
                new_state = np.float32(new_observation)
                # push this exp in ram
                ram.add(state, action, reward, new_state)

            observation = new_observation

            # perform optimization
            trainer.optimize()
            if done:
                break

        # check memory consumption and clear memory
        gc.collect()
        # process = psutil.Process(os.getpid())
        # print(process.memory_info().rss)

        # Checkpoint every 100th episode (this includes episode 0).
        if _ep % 100 == 0:
            trainer.save_models(_ep)
finally:
    # Close the environment (and its render window) even when training is
    # interrupted or raises.
    env.close()

print('Completed episodes')