diff --git a/config.yaml b/config.yaml
index 7f7212e..138cfb5 100644
--- a/config.yaml
+++ b/config.yaml
@@ -33,7 +33,7 @@ env:
   state_dir: pyboy_states
   init_state: Bulbasaur
   action_freq: 24
-  max_steps: 1_000_000
+  max_steps: 100_000_000
   save_video: False
   fast_video: False
   frame_stacks: 1
@@ -80,6 +80,9 @@ train:
   cpu_offload: True
   pool_kernel: [0]
+  events_maxlen: 800
+  ent_coef_adj: 0.01015
+
 
 wrappers:
   baseline:
     - stream_wrapper.StreamWrapper:
diff --git a/pokemonred_puffer/cleanrl_puffer.py b/pokemonred_puffer/cleanrl_puffer.py
index 36e08b3..22e8704 100644
--- a/pokemonred_puffer/cleanrl_puffer.py
+++ b/pokemonred_puffer/cleanrl_puffer.py
@@ -311,14 +311,13 @@ def __init__(
         self.losses = Losses()
         self.performance = Performance()
-        self.reward_buffer = deque(maxlen=1_000)
         self.exploration_map_agg = np.zeros((config.num_envs, *GLOBAL_MAP_SHAPE), dtype=np.float32)
         self.taught_cut = False
         self.infos = {}
         self.log = False
         self.ent_coef = self.config.ent_coef
-        self.events_avg = deque(maxlen=500)
+        self.events_avg = deque(maxlen=self.config.events_maxlen)
 
     @pufferlib.utils.profile
     def evaluate(self):
@@ -435,12 +434,22 @@ def evaluate(self):
         with env_profiler:
             self.pool.send(actions)
 
+        eval_profiler.stop()
+
+        self.total_agent_steps += padded_steps_collected
+        new_step = np.mean(self.infos["learner"]["stats/step"])
+        if new_step > self.global_step:
+            self.global_step = new_step
+            self.log = True
+        self.reward = torch.mean(self.rewards).float().item()
+        self.SPS = int(padded_steps_collected / eval_profiler.elapsed)
         self.events_avg.append(np.mean(self.infos["learner"]["stats/event"]))
+
         if (
             len(self.events_avg) == self.events_avg.maxlen
             and abs(self.events_avg[-1] - self.events_avg[0]) < 3
         ):
-            self.ent_coef = self.config.ent_coef * 1.25
+            self.ent_coef = self.config.ent_coef_adj
         else:
             self.ent_coef = self.config.ent_coef
         if self.log and self.wandb is not None:
@@ -451,16 +460,6 @@ def evaluate(self):
                 },
             )
 
-        eval_profiler.stop()
-
-        self.total_agent_steps += padded_steps_collected
-        new_step = np.mean(self.infos["learner"]["stats/step"])
-        if new_step > self.global_step:
-            self.global_step = new_step
-            self.log = True
-        self.reward = torch.mean(self.rewards).float().item()
-        self.SPS = int(padded_steps_collected / eval_profiler.elapsed)
-
         perf = self.performance
         perf.total_uptime = int(time.time() - self.start_time)
         perf.total_agent_steps = self.total_agent_steps
diff --git a/pokemonred_puffer/environment.py b/pokemonred_puffer/environment.py
index e26d064..c19b4a5 100644
--- a/pokemonred_puffer/environment.py
+++ b/pokemonred_puffer/environment.py
@@ -904,7 +904,7 @@ def update_tm_hm_moves_obtained(self):
             if self.pyboy.get_memory_value(i) != 0:
                 for j in range(4):
                     move_id = self.pyboy.get_memory_value(i + j + 8)
-                    if move_id != 0:  # and move_id in TM_HM_MOVES:
+                    if move_id != 0 and move_id in TM_HM_MOVES:
                         self.moves_obtained[move_id] = 1
         """
         # Scan current box (since the box doesn't auto increment in pokemon red)
diff --git a/pokemonred_puffer/rewards/baseline.py b/pokemonred_puffer/rewards/baseline.py
index 8f13d28..da00aaf 100644
--- a/pokemonred_puffer/rewards/baseline.py
+++ b/pokemonred_puffer/rewards/baseline.py
@@ -27,7 +27,7 @@ def get_game_state_reward(self):
             "explore_npcs": sum(self.seen_npcs.values()) * 0.02,
             # "seen_pokemon": sum(self.seen_pokemon) * 0.0000010,
             # "caught_pokemon": sum(self.caught_pokemon) * 0.0000010,
-            "moves_obtained": sum(self.moves_obtained) * 0.00010,
+            "moves_obtained": sum(self.moves_obtained) * 0.0010,
             "explore_hidden_objs": sum(self.seen_hidden_objs.values()) * 0.02,
             # "level": self.get_levels_reward(),
             # "opponent_level": self.max_opponent_level,
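
Taken together, the config.yaml and cleanrl_puffer.py hunks turn the plateau-triggered entropy bump into two config knobs: events_maxlen sizes the rolling window of per-evaluation mean event counts, and ent_coef_adj is the coefficient used while progress has stalled. Below is a minimal standalone sketch of that logic, restated outside the trainer so the interaction of the two new keys is easier to see; the EntCoefScheduler class and its method names are hypothetical and are not part of the patch.

# Standalone sketch (assumption: not part of the patch) of the plateau check
# that the cleanrl_puffer.py hunk wires into evaluate().
from collections import deque

import numpy as np


class EntCoefScheduler:
    """Switch the entropy coefficient when event progress stalls."""

    def __init__(self, ent_coef: float, ent_coef_adj: float, events_maxlen: int):
        self.base_ent_coef = ent_coef          # train.ent_coef from config.yaml
        self.ent_coef_adj = ent_coef_adj       # train.ent_coef_adj (0.01015 in the diff)
        self.events_avg = deque(maxlen=events_maxlen)  # train.events_maxlen (800 in the diff)

    def update(self, event_means) -> float:
        # Track the mean event count reported by the learner envs this eval step.
        self.events_avg.append(float(np.mean(event_means)))
        # Once the window is full, a spread of fewer than 3 events between its
        # oldest and newest entries is treated as a plateau and the adjusted
        # coefficient is returned; otherwise fall back to the base value.
        if (
            len(self.events_avg) == self.events_avg.maxlen
            and abs(self.events_avg[-1] - self.events_avg[0]) < 3
        ):
            return self.ent_coef_adj
        return self.base_ent_coef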