diff --git a/curriculum_generation/sample_evaluation_task.py b/curriculum_generation/sample_evaluation_task.py new file mode 100644 index 0000000..13cb9a5 --- /dev/null +++ b/curriculum_generation/sample_evaluation_task.py @@ -0,0 +1,54 @@ +"""Manually created learning curriculum for evaluation""" +# pylint: disable=invalid-name,redefined-outer-name,bad-builtin +# pylint: disable=wildcard-import,unused-wildcard-import +from typing import List + +from nmmo.task import constraint as c +from nmmo.task.base_predicates import AttainSkill, CountEvent, EarnGold, TickGE +from nmmo.task.task_spec import TaskSpec + +curriculum: List[TaskSpec] = [] + +# Stay alive as long as possible +curriculum.append( + TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024}) +) + +# Perform each of these essential events 10 times +essential_skills = [ + "EAT_FOOD", + "DRINK_WATER", + "SCORE_HIT", + "PLAYER_KILL", + "HARVEST_ITEM", + "EQUIP_ITEM", + "CONSUME_ITEM", + "LEVEL_UP", + "EARN_GOLD", + "LIST_ITEM", + "BUY_ITEM", + "GIVE_ITEM", + "DESTROY_ITEM", + "GIVE_GOLD", +] +for event_code in essential_skills: + curriculum.append( + TaskSpec( + eval_fn=CountEvent, + eval_fn_kwargs={"event": event_code, "N": 10}, + ) + ) + +# Reach skill level 10 +for skill in c.combat_skills + c.harvest_skills: + curriculum.append( + TaskSpec( + eval_fn=AttainSkill, + eval_fn_kwargs={"skill": skill, "level": 10, "num_agent": 1}, + ) + ) + +# Earn 50 gold +curriculum.append( + TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": 50}) +) diff --git a/environment.py b/environment.py index 36452ab..a921a65 100644 --- a/environment.py +++ b/environment.py @@ -33,14 +33,15 @@ def __init__(self, args: Namespace): class Postprocessor(StatPostprocessor): def __init__(self, env, is_multiagent, agent_id, - early_stop_agent_num=0, - sqrt_achievement_rewards=False, - heal_bonus_weight=0, - meander_bonus_weight=0, - explore_bonus_weight=0, - clip_unique_event=3, + eval_mode=False, + early_stop_agent_num=0, + sqrt_achievement_rewards=False, + heal_bonus_weight=0, + meander_bonus_weight=0, + explore_bonus_weight=0, + clip_unique_event=3, ): - super().__init__(env, agent_id) + super().__init__(env, agent_id, eval_mode) self.early_stop_agent_num = early_stop_agent_num self.sqrt_achievement_rewards = sqrt_achievement_rewards self.heal_bonus_weight = heal_bonus_weight @@ -117,6 +118,7 @@ def env_creator(): env = pufferlib.emulation.PettingZooPufferEnv(env, postprocessor_cls=Postprocessor, postprocessor_kwargs={ + 'eval_mode': args.eval_mode, 'early_stop_agent_num': args.early_stop_agent_num, 'sqrt_achievement_rewards': args.sqrt_achievement_rewards, 'heal_bonus_weight': args.heal_bonus_weight, diff --git a/evaluate.py b/evaluate.py index 99e9c9b..c9dc617 100644 --- a/evaluate.py +++ b/evaluate.py @@ -5,7 +5,11 @@ import time from types import SimpleNamespace from pathlib import Path +from collections import defaultdict +from dataclasses import asdict +from itertools import cycle + +import numpy as np import torch import pandas as pd @@ -54,11 +58,11 @@ def save_replays(policy_store_dir, save_dir): args.early_stop_agent_num = 0 # run the full episode args.resilient_population = 0 # no resilient agents - # TODO: custom models will require different policy creation functions + # NOTE: This creates a dummy learner agent. Is it necessary? 
from reinforcement_learning import policy # import your policy def make_policy(envs): learner_policy = policy.Baseline( - envs, + envs._driver_env, input_size=args.input_size, hidden_size=args.hidden_size, task_size=args.task_size @@ -83,8 +87,10 @@ def make_policy(envs): # Load the policies into the policy pool evaluator.policy_pool.update_policies({ - p.name: p.policy(make_policy, evaluator.buffers[0], evaluator.device) - for p in policy_store._all_policies().values() + p.name: p.policy( + policy_args=[evaluator.buffers[0]], + device=evaluator.device + ) for p in list(policy_store._all_policies().values()) }) # Set up the replay helper @@ -121,7 +127,7 @@ def create_policy_ranker(policy_store_dir, ranker_file="openskill.pickle"): file = os.path.join(policy_store_dir, ranker_file) if os.path.exists(file): if os.path.exists(file + ".lock"): - raise ValueError("Policy ranker file is locked.") + raise ValueError("Policy ranker file is locked. Delete the lock file.") logging.info("Using policy ranker from %s", file) policy_ranker = pufferlib.utils.PersistentObject( file, @@ -135,21 +141,34 @@ def create_policy_ranker(policy_store_dir, ranker_file="openskill.pickle"): ) return policy_ranker -def rank_policies(policy_store_dir, device): +class AllPolicySelector(pufferlib.policy_ranker.PolicySelector): + def select_policies(self, policies): + # Return all policy names in alphabetical order + # Loops circularly if more policies are needed than available + loop = cycle([ + policies[name] for name in sorted(policies.keys()) + ]) + return [next(loop) for _ in range(self._num)] + +def rank_policies(policy_store_dir, eval_curriculum_file, device): # CHECK ME: can be custom models with different architectures loaded here? policy_store = setup_policy_store(policy_store_dir) - num_policies = len(policy_store._all_policies()) policy_ranker = create_policy_ranker(policy_store_dir) + num_policies = len(policy_store._all_policies()) + policy_selector = AllPolicySelector(num_policies) - # TODO: task-condition agents when generating replays args = SimpleNamespace(**config.Config.asdict()) args.data_dir = policy_store_dir + args.eval_mode = True + args.num_envs = 5 # sample a bit longer in each env + args.num_buffers = 1 args.learner_weight = 0 # evaluate mode args.selfplay_num_policies = num_policies + 1 args.early_stop_agent_num = 0 # run the full episode args.resilient_population = 0 # no resilient agents + args.tasks_path = eval_curriculum_file # task-conditioning - # TODO: custom models will require different policy creation functions + # NOTE: This creates a dummy learner agent. Is it necessary? 
from reinforcement_learning import policy # import your policy def make_policy(envs): learner_policy = policy.Baseline( @@ -174,17 +193,25 @@ def make_policy(envs): num_buffers=args.num_buffers, selfplay_learner_weight=args.learner_weight, selfplay_num_policies=args.selfplay_num_policies, - batch_size=args.rollout_batch_size, + batch_size=args.eval_batch_size, policy_store=policy_store, policy_ranker=policy_ranker, # so that a new ranker is created + policy_selector=policy_selector, ) rank_file = os.path.join(policy_store_dir, "ranking.txt") with open(rank_file, "w") as f: pass - while evaluator.global_step < args.train_num_steps: - evaluator.evaluate() + results = defaultdict(list) + while evaluator.global_step < args.eval_num_steps: + _, stats, infos = evaluator.evaluate() + + for pol, vals in infos.items(): + results[pol].extend([ + e[1] for e in vals['team_results'] + ]) + ratings = evaluator.policy_ranker.ratings() dataframe = pd.DataFrame( { @@ -202,10 +229,27 @@ def make_policy(envs): + "\n\n" ) + # Reset the envs and start new episodes + # NOTE: The line below will probably end the episode in the middle, + # so we won't be able to sample scores from the successful agents. + # Thus, the scores will be biased towards the agents that die early. + # Still, the numbers we get this way are better than frequently + # updating the scores because the openskill ranking only takes the mean. + #evaluator.buffers[0]._async_reset() + # CHECK ME: delete the policy_ranker lock file Path(evaluator.policy_ranker.lock.lock_file).unlink(missing_ok=True) evaluator.close() + for pol, res in results.items(): + aggregated = {} + keys = asdict(res[0]).keys() + for k in keys: + if k == 'policy_id': + continue + aggregated[k] = np.mean([asdict(e)[k] for e in res]) + results[pol] = aggregated + print('Evaluation complete. 
Average stats:\n', results) if __name__ == "__main__": @@ -213,7 +257,7 @@ def make_policy(envs): -p, --policy-store-dir: Directory to load policy checkpoints from for evaluation/ranking -s, --replay-save-dir: Directory to save replays (Default: replays/) - -e, --eval-mode: Evaluate mode (Default: False) + -r, --replay-mode: Replay save mode (Default: False) -d, --device: Device to use for evaluation/ranking (Default: cuda if available, otherwise cpu) To generate replay from your checkpoints, put them together in policy_store_dir, run the following command, @@ -247,12 +291,11 @@ def make_policy(envs): help="Directory to save replays (Default: replays/)", ) parser.add_argument( - "-e", - "--eval-mode", - dest="eval_mode", - type=bool, - default=False, - help="Evaluate mode (Default: False)", + "-r", + "--replay-mode", + dest="replay_mode", + action="store_true", + help="Replay mode (Default: False)", ) parser.add_argument( "-d", @@ -262,15 +305,23 @@ def make_policy(envs): default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use for evaluation/ranking (Default: cuda if available, otherwise cpu)", ) + parser.add_argument( + "-t", + "--task-file", + dest="task_file", + type=str, + default="reinforcement_learning/eval_task_with_embedding.pkl", + help="Task file to use for evaluation", + ) # Parse and check the arguments eval_args = parser.parse_args() assert eval_args.policy_store_dir is not None, "Policy store directory must be specified" - if eval_args.eval_mode: - logging.info("Ranking checkpoints from %s", eval_args.policy_store_dir) - logging.info("Replays will NOT be generated") - rank_policies(eval_args.policy_store_dir, eval_args.device) - else: + if getattr(eval_args, "replay_mode", False): logging.info("Generating replays from the checkpoints in %s", eval_args.policy_store_dir) save_replays(eval_args.policy_store_dir, eval_args.replay_save_dir) + else: + logging.info("Ranking checkpoints from %s", eval_args.policy_store_dir) + logging.info("Replays will NOT be generated") + rank_policies(eval_args.policy_store_dir, eval_args.task_file, eval_args.device) diff --git a/leader_board.py b/leader_board.py index 1e6edbf..1c8e783 100644 --- a/leader_board.py +++ b/leader_board.py @@ -36,6 +36,7 @@ class TeamResult: time_alive: int = 0, earned_gold: int = 0, completed_task_count: int = 0, + max_task_progress: float = 0, damage_received: int = 0, damage_inflicted: int = 0, ration_consumed: int = 0, @@ -72,6 +73,7 @@ def names(cls) -> List[str]: "time_alive", "earned_gold", "completed_task_count", + "max_task_progress", "damage_received", "damage_inflicted", "ration_consumed", @@ -110,8 +112,9 @@ class StatPostprocessor(pufferlib.emulation.Postprocessor): """Postprocessing actions and metrics of Neural MMO. Process wandb/leader board stats, and save replays. 
""" - def __init__(self, env, agent_id): + def __init__(self, env, agent_id, eval_mode=False): super().__init__(env, is_multiagent=True, agent_id=agent_id) + self.eval_mode = eval_mode self._reset_episode_stats() def reset(self, observation): @@ -125,6 +128,7 @@ def _reset_episode_stats(self): self._cod_starved = 0 self._cod_dehydrated = 0 self._task_completed = 0 + self._max_task_progress = 0 self._task_with_2_reward_signal = 0 self._task_with_0p2_max_progress = 0 self._curriculum = defaultdict(list) @@ -156,19 +160,20 @@ def _update_stats(self, agent): task = self.env.agent_task_map[agent.ent_id][0] # For each task spec, record whether its max progress and reward count self._curriculum[task.spec_name].append((task._max_progress, task.reward_signal_count)) + self._max_task_progress = task._max_progress if task.reward_signal_count >= 2: - self._task_with_2_reward_signal += 1.0 + self._task_with_2_reward_signal = 1.0 if task._max_progress >= 0.2: - self._task_with_0p2_max_progress += 1.0 + self._task_with_0p2_max_progress = 1.0 if task.completed: - self._task_completed += 1.0 + self._task_completed = 1.0 if agent.damage.val > 0: - self._cod_attacked += 1.0 + self._cod_attacked = 1.0 elif agent.food.val == 0: - self._cod_starved += 1.0 + self._cod_starved = 1.0 elif agent.water.val == 0: - self._cod_dehydrated += 1.0 + self._cod_dehydrated = 1.0 self._combat_level.append(agent.attack_level) self._harvest_level.append(max( @@ -249,6 +254,7 @@ def reward_done_info(self, reward, done, info): info["stats"][key] = float(val) # Fill in the "TeamResult" + result.max_task_progress = self._max_task_progress result.total_score = self._curr_unique_count result.time_alive = self._time_alive result.earned_gold = achieved["achieved/earned_gold"] @@ -268,6 +274,10 @@ def reward_done_info(self, reward, done, info): info["team_results"] = (self.agent_id, result) + if self.eval_mode: + # "return" is used for ranking in the eval mode, so put the task progress here + info["return"] = self._max_task_progress # this is 1 if done + return reward, done, info # Event processing utilities for Neural MMO. 
diff --git a/reinforcement_learning/config.py b/reinforcement_learning/config.py index 243c23e..f23e894 100644 --- a/reinforcement_learning/config.py +++ b/reinforcement_learning/config.py @@ -16,8 +16,10 @@ class Config: num_cores = None # Number of cores to use for training num_envs = 6 # Number of environments to use for training num_buffers = 2 # Number of buffers to use for training - rollout_batch_size = 32768 # Number of steps to rollout + rollout_batch_size = 2**15 # Number of steps to rollout + eval_batch_size = 2**15 # Number of steps to rollout for eval train_num_steps = 10_000_000 # Number of steps to train + eval_num_steps = 1_000_000 # Number of steps to evaluate checkpoint_interval = 30 # Interval to save models run_name = f"nmmo_{time.strftime('%Y%m%d_%H%M%S')}" # Run name runs_dir = "/tmp/runs" # Directory for runs @@ -47,6 +49,7 @@ class Config: map_size = 128 # Size of maps to use for training resilient_population = 0.2 # Percentage of agents to be resilient to starvation/dehydration tasks_path = None # Path to tasks to use for training + eval_mode = False # Run the postprocessor in the eval mode early_stop_agent_num = 8 # Stop the episode when the number of agents reaches this number sqrt_achievement_rewards=False # Use the log of achievement rewards heal_bonus_weight = 0.03 diff --git a/reinforcement_learning/eval_task_with_embedding.pkl b/reinforcement_learning/eval_task_with_embedding.pkl new file mode 100644 index 0000000..4ccd2ac Binary files /dev/null and b/reinforcement_learning/eval_task_with_embedding.pkl differ diff --git a/reinforcement_learning/policy.py b/reinforcement_learning/policy.py index ba82cd2..3bb4d58 100644 --- a/reinforcement_learning/policy.py +++ b/reinforcement_learning/policy.py @@ -35,9 +35,11 @@ def critic(self, hidden): class Baseline(pufferlib.models.Policy): - def __init__(self, envs, input_size, hidden_size, task_size): + def __init__(self, env, input_size=256, hidden_size=256, task_size=4096): super().__init__() - self.envs = envs + + self.flat_observation_space = env.flat_observation_space + self.flat_observation_structure = env.flat_observation_structure self.tile_encoder = TileEncoder(input_size) self.player_encoder = PlayerEncoder(input_size, hidden_size) @@ -50,7 +52,8 @@ def __init__(self, envs, input_size, hidden_size, task_size): self.value_head = torch.nn.Linear(hidden_size, 1) def encode_observations(self, flat_observations): - env_outputs = self.envs.unpack_batched_obs(flat_observations) + env_outputs = pufferlib.emulation.unpack_batched_obs(flat_observations, + self.flat_observation_space, self.flat_observation_structure) tile = self.tile_encoder(env_outputs["Tile"]) player_embeddings, my_agent = self.player_encoder( env_outputs["Entity"], env_outputs["AgentId"][:, 0] diff --git a/scripts/slurm_run.sh b/scripts/slurm_run.sh index cebf651..a8080bf 100755 --- a/scripts/slurm_run.sh +++ b/scripts/slurm_run.sh @@ -3,22 +3,21 @@ # Example ussage: # # sbatch ./scripts/slurm_run.sh scripts/train_baseline.sh \ -# --train.run_name=realikun_16x8_0001 +# --run-name=test --wandb-project=nmmo --wandb-entity=kywch #SBATCH --account=carperai -#SBATCH --partition=g40 +#SBATCH --partition=g40x #SBATCH --nodes=1 #SBATCH --gpus=1 -#SBATCH --cpus-per-gpu=12 +#SBATCH --cpus-per-gpu=6 #__SBATCH --mem=80G -#SBATCH --chdir=/fsx/home-daveey/nmmo-baselines/ +#SBATCH --chdir=/fsx/proj-nmmo/nmmo-baselines/ #SBATCH --output=sbatch/%j.log #SBATCH --error=sbatch/%j.log #SBATCH --requeue -#SBATCH 
--export=PYTHONUNBUFFERED=1,WANDB_BASE_URL="https://stability.wandb.io",WANDB_DIR=/fsx/home-daveey/tmp/wandb,WANDB_CONFIG_DIR=/fsx/home-daveey/tmp/wandb +#SBATCH --export=PYTHONUNBUFFERED=1,WANDB_BASE_URL="https://stability.wandb.io",WANDB_DIR=/fsx/proj-nmmo/tmp/wandb,WANDB_CONFIG_DIR=/fsx/proj-nmmo/tmp/wandb -source /fsx/home-daveey/miniconda/etc/profile.d/conda.sh && \ -conda activate nmmo && \ +source /fsx/proj-nmmo/venv/bin/activate && \ ulimit -c unlimited && \ ulimit -s unlimited && \ ulimit -a diff --git a/scripts/train_baseline.sh b/scripts/train_baseline.sh index 32464ac..a7d2a56 100755 --- a/scripts/train_baseline.sh +++ b/scripts/train_baseline.sh @@ -1,14 +1,5 @@ #!/bin/bash -python -u -O -m train \ ---wandb.entity=daveey \ ---wandb.project=nmmo \ ---train.runs_dir=/fsx/home-daveey/runs/ \ ---train.num_steps=10000000000 \ ---env.num_maps=100 \ ---rollout.num_buffers=2 \ ---rollout.num_envs=12 \ ---env.num_npcs=266 \ ---ppo.training_batch_size=128 \ ---rollout.batch_size=131072 \ +python -u -m train \ +--runs-dir=/fsx/proj-nmmo/runs/ \ "${@}" diff --git a/train.py b/train.py index 682df2d..08f08c0 100644 --- a/train.py +++ b/train.py @@ -31,7 +31,7 @@ def setup_env(args): def make_policy(envs): learner_policy = policy.Baseline( - envs, + envs._driver_env, input_size=args.input_size, hidden_size=args.hidden_size, task_size=args.task_size @@ -114,6 +114,7 @@ def curriculum_generation_track(trainer, args, use_elm=True): # Use the train_task_spec to train agents task_encoder.get_task_embedding(curriculum, save_to_file=CUSTOM_CURRICULUM_FILE) task_encoder.close() + trainer.data.sort_keys = [] reinforcement_learning_track(trainer, args) if __name__ == "__main__":