forked from mrahtz/learning-from-human-preferences
enduro_wrapper.py
"""
An environment wrapper for Enduro which blanks out the speedometer (so that the
agent doesn't inadvertently learn reward-related information from it) and
signals 'done' once weather begins to change (so that the observations don't
change so much and therefore the reward predictor can learn more easily).
"""
from gym import Wrapper


class EnduroWrapper(Wrapper):
    def __init__(self, env):
        super(EnduroWrapper, self).__init__(env)
        # The pixel coordinates and step cutoff below are specific to this
        # exact environment, so insist on it.
        assert str(env) == '<TimeLimit<AtariEnv<EnduroNoFrameskip-v4>>>'
        self._steps = None

    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        # Blank out all the speedometer stuff at the bottom of the frame
        observation[160:] = 0
        self._steps += 1
        # Done once the weather starts to change
        if self._steps == 3000:
            done = True
        return observation, reward, done, info

    def reset(self):
        self._steps = 0
        return self.env.reset()
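A minimal usage sketch (not part of the file, and assuming the old four-tuple gym step API that this wrapper targets, where gym.make('EnduroNoFrameskip-v4') returns the TimeLimit-wrapped AtariEnv the assertion expects):

import gym

# Wrap the raw env before any frame preprocessing: the speedometer
# blanking above indexes rows 160+ of the full 210x160 Atari frame.
env = EnduroWrapper(gym.make('EnduroNoFrameskip-v4'))

obs = env.reset()
done = False
while not done:
    # Random actions, just to exercise the wrapper; the episode ends
    # after 3,000 steps, when the weather would start to change.
    obs, reward, done, info = env.step(env.action_space.sample())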