main_DDPG.py
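# DDPG training script for a microgrid energy-management task (MicrogridEnv).
# The overall structure closely follows the OpenAI Spinning Up DDPG reference
# implementation, adapted here to log microgrid-specific quantities such as
# energy bought/sold and prices.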
from copy import deepcopy
import numpy as np
import torch
from torch.optim import Adam
import gym
import time
import Code.core_DDPG as core
from Code.enviroment import MicrogridEnv
from Code.utils.logx import EpochLogger
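# A simple fixed-size FIFO experience replay buffer for DDPG: store() overwrites
# the oldest transitions once the buffer is full, and sample_batch() draws a
# uniformly random minibatch and returns it as GPU tensors.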
class ReplayBuffer:

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        # Advance the write pointer, wrapping around so the oldest entries get overwritten.
        self.ptr = (self.ptr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        # Sampled batches are moved to the GPU; this script assumes CUDA is available.
        device = 'cuda'
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32).to(device) for k, v in batch.items()}
def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
         update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10,
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.cuda.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Per-dimension action bounds, used later for clipping actions.
    act_limit = [env.action_space.low, env.action_space.high]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
    logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts)

    # Set up function for computing DDPG Q-loss
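    # The target is the Bellman backup y = r + gamma * (1 - d) * Q_targ(s', pi_targ(s')),
    # computed without tracking gradients; the loss is the mean squared error
    # (Q(s, a) - y)^2 over the sampled batch.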
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q = ac.q(o, a)

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q - backup)**2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.detach().cpu().numpy())

        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
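    # The policy is trained to maximize the expected Q-value of its own actions,
    # i.e. to minimize -E[Q(s, pi(s))] over the sampled batch.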
    def compute_loss_pi(data):
        o = data['obs']
        q_pi = ac.q(o, ac.pi(o))
        return -q_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in ac.q.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
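        # Each target parameter is moved slowly towards its online counterpart:
        #   p_targ <- polyak * p_targ + (1 - polyak) * p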
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use the in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
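    # Action selection: deterministic policy output plus Gaussian exploration noise,
    # clipped to the per-dimension action bounds:
    #   a = clip(pi(o) + noise_scale * N(0, 1), low, high)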
    def get_action(o, noise_scale):
        # The observation is moved to the GPU; this script assumes CUDA is available.
        a = ac.act(torch.as_tensor(o, dtype=torch.float32).to('cuda'))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, act_limit[0], act_limit[1])

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Print the actor-critic architecture once, shortly after training starts.
        if t == 10:
            print(ac)

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
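        # Collect update_every environment steps between update phases, then do
        # update_every gradient steps in a row, so the overall ratio of gradient
        # steps to environment steps stays at one-to-one.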
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=False)
            logger.log_tabular('TestEpRet', with_min_and_max=False)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('AvgEnergyBought', np.mean(env.energy_bought))
            logger.log_tabular('AvgEnergySold', np.mean(env.energy_sold))
            logger.log_tabular('AvgPriceBought', np.mean(env.prices))
            logger.log_tabular('QVals', with_min_and_max=False)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='HalfCheetah-v2')  # unused: the script always trains on MicrogridEnv
    parser.add_argument('--hid', type=int, default=256)
    parser.add_argument('--l', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--exp_name', type=str, default='ddpg')
    args = parser.parse_args()

    from Code.utils.run_utils import setup_logger_kwargs
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

    env = MicrogridEnv()

    # A simple hand-coded baseline run, kept here commented out for reference.
    '''
    o = env.reset()
    rewards = []
    t = 0
    t1 = time.time()
    while True:
        amount = o[1] - (o[2] + o[0])
        if amount == 0:
            o, r, d, _ = env.step([0, 0, 0, 0])
        elif amount > 0:
            o, r, d, _ = env.step([0, 0, amount, 10])
        else:
            o, r, d, _ = env.step([1, 0, abs(amount), 19])
        rewards.append(r)
        t += 1
        if t > 4000:
            break
    #for i in range(99):
    #    print("Bought, sold, prices, rewards tot_sold", (env.energy_bought[i], env.energy_sold[i], env.prices[i], rewards[i], env.tot[i]))
    t2 = time.time()
    print("time:", t2 - t1)
    print("sum rewards: ", sum(rewards))
    print("avg reward: ", np.mean(rewards))
    '''

    ddpg(lambda: env, actor_critic=core.MLPActorCritic,
         ac_kwargs=dict(hidden_sizes=[args.hid]*args.l),
         gamma=args.gamma, seed=args.seed, epochs=args.epochs,
         logger_kwargs=logger_kwargs)
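# Example invocation (hypothetical; assumes the script is run from the repository
# root so that the `Code` package is importable):
#   python main_DDPG.py --hid 256 --l 2 --gamma 0.99 --seed 0 --epochs 50 --exp_name ddpg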