diff --git a/oai_agents/agents/base_agent.py b/oai_agents/agents/base_agent.py index 9f959316..7186a6e7 100644 --- a/oai_agents/agents/base_agent.py +++ b/oai_agents/agents/base_agent.py @@ -3,6 +3,7 @@ from oai_agents.common.state_encodings import ENCODING_SCHEMES from oai_agents.common.subtasks import calculate_completed_subtask, get_doable_subtasks, Subtasks from oai_agents.common.tags import AgentPerformance, TeamType, KeyCheckpoints +from oai_agents.common.checked_model_name_handler import CheckedModelNameHandler from oai_agents.gym_environments.base_overcooked_env import USEABLE_COUNTERS from overcooked_ai_py.mdp.overcooked_mdp import Action @@ -16,7 +17,7 @@ import numpy as np import torch as th import torch.nn as nn -from typing import List, Tuple, Union +from typing import List, Tuple, Union, Optional import stable_baselines3.common.distributions as sb3_distributions from stable_baselines3.common.evaluation import evaluate_policy from stable_baselines3.common.vec_env.stacked_observations import StackedObservations @@ -24,6 +25,7 @@ import os import random import pickle as pkl +import re class OAIAgent(nn.Module, ABC): """ @@ -464,11 +466,9 @@ def get_agents(self) -> List[OAIAgent]: def save_agents(self, path: Union[Path, None] = None, tag: Union[str, None] = None): ''' Saves each agent that the trainer is training ''' - if not path: - if self.args.exp_dir: - path = self.args.base_dir / 'agent_models' / self.args.exp_dir / self.name - else: - path = self.args.base_dir / 'agent_models'/ self.name + path = path or OAITrainer.get_model_path(base_dir=self.args.base_dir, + exp_folder=self.args.exp_dir, + model_name=self.name) tag = tag or self.args.exp_name save_path = path / tag / 'trainer_file' @@ -495,11 +495,9 @@ def save_agents(self, path: Union[Path, None] = None, tag: Union[str, None] = No @staticmethod def load_agents(args, tag, name: str=None, path: Union[Path, None] = None): ''' Loads each agent that the trainer is training ''' - if not path: - if args.exp_dir: - path = args.base_dir / 'agent_models' / args.exp_dir / name - else: - path = args.base_dir / 'agent_models'/ name + path = path or OAITrainer.get_model_path(base_dir=args.base_dir, + exp_folder=args.exp_dir, + model_name=name) tag = tag or args.exp_name load_path = path / tag / 'trainer_file' @@ -519,3 +517,68 @@ def load_agents(args, tag, name: str=None, path: Union[Path, None] = None): env_info = pkl.load(f) return agents, env_info, saved_variables + + @staticmethod + def list_agent_checked_tags(args, name: str=None, path: Union[Path, None] = None) -> List[str]: + ''' + Lists only tags that start with KeyCheckpoints.CHECKED_MODEL_PREFIX, followed by an integer. + If the integer is greater than 0, it must be followed by KeyCheckpoints.REWARD_SUBSTR and a floating-point number. + + Parameters: + - args: Experiment arguments containing base directory info and experiment directory info. + - name: The name of the agent, for which tags should be listed. + - path: Optional. If provided, it overrides the default path to the agents directory. + + Returns: + - A list of tags (directories) that match the specified pattern. 
+ ''' + path = path or OAITrainer.get_model_path(base_dir=args.base_dir, + exp_folder=args.exp_dir, + model_name=name) + + handler = CheckedModelNameHandler() + return handler.get_all_checked_tags(path=path) + + @staticmethod + def get_most_recent_checkpoint(args, name: str) -> str: + path = OAITrainer.get_model_path( + base_dir=args.base_dir, + exp_folder=args.exp_dir, + model_name=name + ) + if not path.exists(): + print(f"Warning: The directory {path} does not exist.") + return None + ckpts = [name for name in os.listdir(path) if name.startswith(KeyCheckpoints.CHECKED_MODEL_PREFIX)] + if not ckpts: + print(f"Warning: No checkpoints found in {path} with prefix '{KeyCheckpoints.CHECKED_MODEL_PREFIX}'.") + return None + ckpts_nums = [int(c.split('_')[1]) for c in ckpts] + last_ckpt_num = max(ckpts_nums) + return [c for c in ckpts if c.startswith(f"{KeyCheckpoints.CHECKED_MODEL_PREFIX}{last_ckpt_num}")][0] + + @staticmethod + def get_model_path(base_dir: Union[str, Path], exp_folder: Optional[str], model_name: str) -> Path: + """ + Constructs a path for saving or loading an agent model. + + Parameters: + base_dir (str or Path): The base directory where models are stored. + exp_folder (str or None): The experiment folder name, or None if not applicable. + model_name (str): The name of the model. + + Returns: + Path: A Path object representing the constructed path. + """ + # Ensure base_dir is a Path object + base_dir = Path(base_dir) if isinstance(base_dir, str) else base_dir + + experiment_name = OAITrainer.get_experiment_name(exp_folder=exp_folder, model_name=model_name) + + path = base_dir / 'agent_models' /experiment_name + + return path + + @staticmethod + def get_experiment_name(exp_folder: Optional[str], model_name: str): + return f"{exp_folder}/{model_name}" if exp_folder else model_name \ No newline at end of file diff --git a/oai_agents/agents/il.py b/oai_agents/agents/il.py index 6c0e87e2..7cb40381 100644 --- a/oai_agents/agents/il.py +++ b/oai_agents/agents/il.py @@ -184,11 +184,11 @@ def run_epoch(self, agent_idx): self.agents[agent_idx].eval() return np.mean(losses) - def train_agents(self, epochs=100, exp_name=None): + def train_agents(self, epochs=100): """ Training routine """ if self.datasets is None: self.setup_datasets() - exp_name = exp_name or self.args.exp_name + exp_name = self.args.exp_name run = wandb.init(project="overcooked_ai", entity=self.args.wandb_ent, dir=str(self.args.base_dir / 'wandb'), reinit=True, name=exp_name + '_' + self.name, mode=self.args.wandb_mode) diff --git a/oai_agents/agents/rl.py b/oai_agents/agents/rl.py index acb3aec3..a1afe6e0 100644 --- a/oai_agents/agents/rl.py +++ b/oai_agents/agents/rl.py @@ -4,6 +4,7 @@ from oai_agents.common.state_encodings import ENCODING_SCHEMES from oai_agents.common.tags import AgentPerformance, TeamType, TeammatesCollection, KeyCheckpoints from oai_agents.gym_environments.base_overcooked_env import OvercookedGymEnv +from oai_agents.common.checked_model_name_handler import CheckedModelNameHandler import numpy as np import random @@ -13,24 +14,28 @@ from sb3_contrib import RecurrentPPO, MaskablePPO import wandb import os +from typing import Optional VEC_ENV_CLS = DummyVecEnv # class RLAgentTrainer(OAITrainer): ''' Train an RL agent to play with a teammates_collection of agents.''' - def __init__(self, teammates_collection, args, - agent, epoch_timesteps, n_envs, - seed, learner_type, - train_types=[], eval_types=[], - curriculum=None, num_layers=2, hidden_dim=256, - checkpoint_rate=None, name=None, env=None, 
eval_envs=None, - use_cnn=False, use_lstm=False, use_frame_stack=False, - taper_layers=False, use_policy_clone=False, deterministic=False, start_step: int=0, start_timestep: int=0): + def __init__( + self, teammates_collection, args, + agent, epoch_timesteps, n_envs, + seed, learner_type, + train_types=[], eval_types=[], + curriculum=None, num_layers=2, hidden_dim=256, + checkpoint_rate=None, name=None, env=None, eval_envs=None, + use_cnn=False, use_lstm=False, use_frame_stack=False, + taper_layers=False, use_policy_clone=False, deterministic=False, start_step: int=0, start_timestep: int=0 + ): name = name or 'rl_agent' super(RLAgentTrainer, self).__init__(name, args, seed=seed) + self.args = args self.device = args.device self.teammates_len = self.args.teammates_len @@ -62,20 +67,23 @@ def __init__(self, teammates_collection, args, self.start_timestep = start_timestep self.learning_agent, self.agents = self.get_learning_agent(agent) - self.teammates_collection, self.eval_teammates_collection = self.get_teammates_collection(_tms_clctn = teammates_collection, - learning_agent = self.learning_agent, - train_types = train_types, - eval_types = eval_types) + self.teammates_collection, self.eval_teammates_collection = self.get_teammates_collection( + _tms_clctn = teammates_collection, + learning_agent = self.learning_agent, + train_types = train_types, + eval_types = eval_types + ) self.best_score, self.best_training_rew = -1, float('-inf') @classmethod - def generate_randomly_initialized_agent(cls, - args, - learner_type:str, - name:str, - seed:int, - hidden_dim:int, - ) -> OAIAgent: + def generate_randomly_initialized_agent( + cls, + args, + learner_type:str, + name:str, + seed:int, + hidden_dim:int, + ) -> OAIAgent: ''' Generate a randomly initialized learning agent using the RLAgentTrainer class This function does not perform any learning @@ -84,16 +92,17 @@ def generate_randomly_initialized_agent(cls, :param seed: Random seed :returns: An untrained, randomly inititalized RL agent ''' - trainer = cls(name=name, - args=args, - agent=None, - teammates_collection={}, - epoch_timesteps=args.epoch_timesteps, - n_envs=args.n_envs, - seed=seed, - hidden_dim=hidden_dim, - learner_type=learner_type, - ) + trainer = cls( + name=name, + args=args, + agent=None, + teammates_collection={}, + epoch_timesteps=args.epoch_timesteps, + n_envs=args.n_envs, + seed=seed, + hidden_dim=hidden_dim, + learner_type=learner_type, + ) learning_agent, _ = trainer.get_learning_agent(None) return learning_agent @@ -261,19 +270,16 @@ def wrap_agent(self, sb3_agent, name): return SB3LSTMWrapper(sb3_agent, name, self.args) return SB3Wrapper(sb3_agent, name, self.args) - def get_experiment_name(self, exp_name): - return exp_name or str(self.args.exp_dir) + '/' + self.name - def should_evaluate(self, steps): mean_training_rew = np.mean([ep_info["r"] for ep_info in self.learning_agent.agent.ep_info_buffer]) - self.best_training_rew *= 0.98 + self.best_training_rew *= 1.00 - steps_divisable_by_15 = (steps + 1) % 15 == 0 + steps_divisible_by_x = (steps + 1) % 15 == 0 mean_rew_greater_than_best = mean_training_rew > self.best_training_rew and self.learning_agent.num_timesteps >= 5e6 checkpoint_rate_reached = self.checkpoint_rate and self.learning_agent.num_timesteps // self.checkpoint_rate > (len(self.ck_list) - 1) - return steps_divisable_by_15 or mean_rew_greater_than_best or checkpoint_rate_reached + return steps_divisible_by_x or mean_rew_greater_than_best or checkpoint_rate_reached def log_details(self, experiment_name, 
total_train_timesteps): print("Training agent: " + self.name + ", for experiment: " + experiment_name) @@ -292,8 +298,8 @@ def log_details(self, experiment_name, total_train_timesteps): print("Final sparse reward ratio: ", self.args.final_sparse_r_ratio) - def train_agents(self, total_train_timesteps, tag_for_returning_agent, exp_name=None, resume_ck_list=None): - experiment_name = self.get_experiment_name(exp_name) + def train_agents(self, total_train_timesteps, tag_for_returning_agent, resume_ck_list=None): + experiment_name = RLAgentTrainer.get_experiment_name(exp_folder=self.args.exp_dir, model_name=self.name) run = wandb.init(project="overcooked_ai", entity=self.args.wandb_ent, dir=str(self.args.base_dir / 'wandb'), reinit=True, name=experiment_name, mode=self.args.wandb_mode, resume="allow") @@ -302,23 +308,35 @@ def train_agents(self, total_train_timesteps, tag_for_returning_agent, exp_name= if self.checkpoint_rate is not None: if self.args.resume: - path = self.args.base_dir / 'agent_models' / experiment_name - - ckpts = [name for name in os.listdir(path) if name.startswith("ck")] + path = RLAgentTrainer.get_model_path( + base_dir=self.args.base_dir, + exp_folder=self.args.exp_dir, + model_name=self.name + ) + if not path.exists(): + print(f"Warning: The directory {path} does not exist.") + return None + ckpts = [name for name in os.listdir(path) if name.startswith(KeyCheckpoints.CHECKED_MODEL_PREFIX)] + if not ckpts: + print(f"Warning: No checkpoints found in {path} with prefix '{KeyCheckpoints.CHECKED_MODEL_PREFIX}'.") + return None ckpts_nums = [int(c.split('_')[1]) for c in ckpts] sorted_idxs = np.argsort(ckpts_nums) ckpts = [ckpts[i] for i in sorted_idxs] - self.ck_list = [(c[0], path, c[2]) for c in resume_ck_list] if resume_ck_list else [({k: 0 for k in self.args.layout_names}, path, ck) for ck in ckpts] + self.ck_list = [(c[0], path, c[2]) for c in resume_ck_list] if resume_ck_list else [ + ({k: 0 for k in self.args.layout_names}, path, ck) for ck in ckpts] else: self.ck_list = [] - path, tag = self.save_agents(tag=f'ck_{len(self.ck_list)}') + path, tag = self.save_agents(tag=f'{KeyCheckpoints.CHECKED_MODEL_PREFIX}{len(self.ck_list)}') self.ck_list.append(({k: 0 for k in self.args.layout_names}, path, tag)) + best_path, best_tag = None, None self.steps = self.start_step curr_timesteps = self.start_timestep prev_timesteps = self.learning_agent.num_timesteps + ck_name_handler = CheckedModelNameHandler() while curr_timesteps < total_train_timesteps: self.curriculum.update(current_step=self.steps) @@ -347,7 +365,7 @@ def train_agents(self, total_train_timesteps, tag_for_returning_agent, exp_name= if self.checkpoint_rate: if self.learning_agent.num_timesteps // self.checkpoint_rate > (len(self.ck_list) - 1): - path, tag = self.save_agents(tag=f'ck_{len(self.ck_list)}_rew_{mean_reward}') + path, tag = self.save_agents(tag=ck_name_handler.generate_tag(id=len(self.ck_list), mean_reward=mean_reward)) self.ck_list.append((rew_per_layout, path, tag)) if mean_reward >= self.best_score: diff --git a/oai_agents/common/arguments.py b/oai_agents/common/arguments.py index 1185bfd6..388e3dac 100644 --- a/oai_agents/common/arguments.py +++ b/oai_agents/common/arguments.py @@ -77,6 +77,9 @@ def get_arguments(additional_args=[]): parser.add_argument("--num-of-ckpoints", type=int, default=10) parser.add_argument("--resume", action="store_true", default=False, help="Restart from last checkpoint for population training only") + parser.add_argument("--for-evaluation", action="store_true", 
default=False, help="The trained agents are used for evaluating other agents. Please note that seeds and h_dim are different when agents are trained for evaluating others.)") + parser.add_argument("--num-of-training-variants", type=int, default=4) + for parser_arg, parser_kwargs in additional_args: parser.add_argument(parser_arg, **parser_kwargs) diff --git a/oai_agents/common/checked_model_name_handler.py b/oai_agents/common/checked_model_name_handler.py new file mode 100644 index 00000000..65f93a4c --- /dev/null +++ b/oai_agents/common/checked_model_name_handler.py @@ -0,0 +1,72 @@ +from oai_agents.common.tags import KeyCheckpoints +import re +from pathlib import Path +from typing import Optional, List, Union + +class CheckedModelNameHandler: + def __init__(self): + """ + Initializes the CheckedModelNameHandler with default prefix and reward substring. + """ + self.prefix = KeyCheckpoints.CHECKED_MODEL_PREFIX + self.reward_substr = KeyCheckpoints.REWARD_SUBSTR + self.pattern = re.compile(f"^{re.escape(self.prefix)}(\\d+)(?:{re.escape(self.reward_substr)}[\\d.]+)?$") + + def generate_tag(self, id: int, mean_reward: Optional[float] = None) -> str: + """ + Generate a checked model name based on the given id and mean reward. + + :param id: The identifier for the model, used as a numeric suffix. + :param mean_reward: Optional mean reward to include in the model name, required for ids greater than 0. + :return: A string representing the generated checked model name. + :raises ValueError: If id is negative or if mean_reward is not provided for ids greater than 0. + """ + if id < 0: + raise ValueError("ID must be a non-negative integer.") + + if id == 0: + return f"{self.prefix}{id}" + + if mean_reward is None: + raise ValueError("Mean reward must be provided for IDs greater than 0.") + + return f"{self.prefix}{id}{self.reward_substr}{mean_reward}" + + def is_valid_checked_tag(self, tag: str) -> bool: + """ + Check if a tag name matches the required pattern for checked models. + + :param tag: The tag name to validate. + :return: True if the tag name matches the pattern; otherwise, False. + """ + return bool(self.pattern.match(tag)) + + def get_all_checked_tags(self, path: Union[Path, None] = None) -> List[str]: + """ + Retrieve all valid checked model tags (subdirectories) under the specified path that match the pattern. + + :param path: The directory path to search for valid checked model tags. Can be a Path object or None. + :return: A list of valid checked model tag names. + :raises ValueError: If the path is None. + :raises FileNotFoundError: If the specified path does not exist. + :raises NotADirectoryError: If the specified path is not a directory. 
+ """ + if path is None: + raise ValueError("The path cannot be None.") + + path = Path(path) if not isinstance(path, Path) else path + + if not path.exists(): + raise FileNotFoundError(f"The specified path '{path}' does not exist.") + if not path.is_dir(): + raise NotADirectoryError(f"The specified path '{path}' is not a directory.") + + tags = [] + for tag_path in path.iterdir(): + if tag_path.is_dir() and self.pattern.match(tag_path.name): + match = self.pattern.match(tag_path.name) + integer_part = int(match.group(1)) + # Only add tags that either have no reward substring for integer 0, or have it when integer > 0 + if integer_part == 0 or (integer_part > 0 and self.reward_substr in tag_path.name): + tags.append(tag_path.name) + return tags diff --git a/oai_agents/common/multi_setup_trainer.py b/oai_agents/common/multi_setup_trainer.py new file mode 100644 index 00000000..ebec6de7 --- /dev/null +++ b/oai_agents/common/multi_setup_trainer.py @@ -0,0 +1,189 @@ +import concurrent.futures +import random +from scripts.utils.common import generate_name +from oai_agents.common.learner import LearnerType +from oai_agents.common.tags import Prefix, KeyCheckpoints +from oai_agents.agents.rl import RLAgentTrainer +import dill + + +class MultiSetupTrainer: + def __init__( + self, + args, + train_types, + eval_types, + curriculum, + tag_for_returning_agent + ): + self.args = args + self.train_types = train_types + self.eval_types = eval_types + self.curriculum = curriculum + self.tag_for_returning_agent = tag_for_returning_agent + + self.parallel = args.parallel + self.num_of_training_variants = args.num_of_training_variants + self.for_evaluation = args.for_evaluation + + def generate_hdim_and_seed(self): + training_seeds = [1010, 2020, 2602, 13, 68, 2907, 105, 128] + training_hdims = [256] * len(training_seeds) + + evaluation_seeds = [3031, 4041, 5051, 3708, 3809, 3910, 4607, 5506] + evaluation_hdims = [256] * len(evaluation_seeds) + + if self.for_evaluation: + seeds = evaluation_seeds + hdims = evaluation_hdims + min_seed, max_seed = 3000, 5999 + else: + seeds = training_seeds + hdims = training_hdims + min_seed, max_seed = 0, 2999 + + selected_seeds = [] + selected_hdims = [] + + if self.num_of_training_variants <= len(seeds): + selected_seeds = seeds[:self.num_of_training_variants] + selected_hdims = hdims[:self.num_of_training_variants] + else: + selected_seeds = seeds[:] + selected_hdims = hdims[:] + + remaining = self.num_of_training_variants - len(seeds) + available_seeds = list(set(range(min_seed, max_seed + 1)) - set(selected_seeds)) + random_seeds = random.sample(available_seeds, remaining) + random_hdims = [256] * remaining + + selected_seeds += random_seeds + selected_hdims += random_hdims + + return selected_seeds, selected_hdims + + def get_trained_agent(self, seed, h_dim): + raise NotImplementedError("This method should be implemented by subclasses.") + + def get_multiple_trained_agents(self): + agents = [] + + seeds, hdims = self.generate_hdim_and_seed() + inputs = [ + (seeds[i], hdims[i]) + for i in range(self.num_of_training_variants) + ] + + if self.args.parallel: + with concurrent.futures.ProcessPoolExecutor( + max_workers=self.args.max_concurrent_jobs) as executor: + arg_lists = list(zip(*inputs)) + # executor.map(self.get_trained_agent, *arg_lists) + dilled_results = list(executor.map(self.get_trained_agent, *arg_lists)) + for dilled_res in dilled_results: + checkpoints_list = dill.loads(dilled_res) + # for dilled_res in dilled_results: + # agent = dill.loads(dilled_res) + # 
agents.append(agent) + else: + for inp in inputs: + checkpoints_list = self.get_trained_agent(seed=seeds[i], h_dim=hdims[i]) + + # for i in range(self.num_of_training_variants): + # agents.append(self.get_trained_agent(seed=seeds[i], h_dim=hdims[i])) + + # return agents + + def get_reinforcement_agent( + self, + name, + teammates_collection, + curriculum, + h_dim, + seed, + learner_type, + checkpoint_rate, + total_train_timesteps, + ): + agent_ckpt = None + start_step = 0 + start_timestep = 0 + ck_list = None + if self.args.resume: + last_ckpt = RLAgentTrainer.get_most_recent_checkpoint(args=self.args, name=name) + if last_ckpt: + agent_ckpt_info, env_info, training_info = RLAgentTrainer.load_agents(args=self.args, name=name, tag=last_ckpt) + agent_ckpt = agent_ckpt_info[0] + start_step = env_info["step_count"] + start_timestep = env_info["timestep_count"] + ck_list = training_info["ck_list"] + print(f"Restarting training from step: {start_step} (timestep: {start_timestep})") + + rlat = RLAgentTrainer( + args=self.args, + name=name, + teammates_collection=teammates_collection, + curriculum=curriculum, + hidden_dim=h_dim, + seed=seed, + checkpoint_rate=checkpoint_rate, + learner_type=learner_type, + agent=agent_ckpt, + epoch_timesteps=self.args.epoch_timesteps, + n_envs=self.args.n_envs, + start_step=start_step, + start_timestep=start_timestep + ) + + rlat.train_agents( + total_train_timesteps=total_train_timesteps, + tag_for_returning_agent=self.tag_for_returning_agent, + resume_ck_list=ck_list + ) + + agent = rlat.get_agents()[0] + checkpoint_list = rlat.ck_list + + if self.parallel: + return dill.dumps(checkpoint_list) + + return checkpoint_list + + +class MultiSetupSPTrainer(MultiSetupTrainer): + def get_trained_agent(self, seed, h_dim): + name = generate_name( + args=self.args, + prefix=Prefix.SELF_PLAY, + seed=seed, + h_dim=h_dim, + train_types=self.train_types, + has_curriculum=not self.curriculum.is_random + ) + + return self.get_reinforcement_agent( + name=name, + teammates_collection={}, + curriculum=self.curriculum, + h_dim=h_dim, + seed=seed, + learner_type=self.args.primary_learner_type, + checkpoint_rate=self.args.pop_total_training_timesteps // self.args.num_of_ckpoints, + total_train_timesteps=self.args.pop_total_training_timesteps, + ) + +def get_SP_agents(args, train_types, eval_types, curriculum, tag_for_returning_agent): + sp_trainer = MultiSetupSPTrainer( + args=args, + train_types=train_types, + eval_types=eval_types, + curriculum=curriculum, + tag_for_returning_agent=tag_for_returning_agent, + ) + return sp_trainer.get_multiple_trained_agents() + +# Example usage: +# sp_trainer = MultiSetupSPTrainer(args=args, num_of_training_variants=4, train_types=train_types, eval_types=eval_types, curriculum=curriculum, tag=tag) +# trained_agents = sp_trainer.get_multiple_trained_agents() +# Alternatively: +# trained_agents = get_SP_agents(args=args, num_of_training_variants=4, train_types=train_types, eval_types=eval_types, curriculum=curriculum, parallel=True, tag=tag) diff --git a/oai_agents/common/population.py b/oai_agents/common/population.py index 278c6bc4..9ea72d55 100644 --- a/oai_agents/common/population.py +++ b/oai_agents/common/population.py @@ -6,22 +6,12 @@ from oai_agents.agents.rl import RLAgentTrainer from oai_agents.common.tags import AgentPerformance, KeyCheckpoints, TeamType -from .curriculum import Curriculum - - -def _get_most_recent_checkpoint(args, name: str) -> str: - if args.exp_dir: - path = args.base_dir / 'agent_models' / args.exp_dir / name - else: - 
path = args.base_dir / 'agent_models' / name +from .curriculum import Curriculum +import random - ckpts = [name for name in os.listdir(path) if name.startswith("ck")] - ckpts_nums = [int(c.split('_')[1]) for c in ckpts] - last_ckpt_num = max(ckpts_nums) - return [c for c in ckpts if c.startswith(f"ck_{last_ckpt_num}")][0] -def train_agent_with_checkpoints(args, total_training_timesteps, ck_rate, seed, h_dim, serialize): +def train_SP_with_checkpoints(args, total_training_timesteps, ck_rate, seed, h_dim, serialize): ''' Returns ckeckpoints_list either serialized or not based on serialize flag @@ -33,13 +23,14 @@ def train_agent_with_checkpoints(args, total_training_timesteps, ck_rate, seed, start_timestep = 0 ck_rewards = None if args.resume: - last_ckpt = _get_most_recent_checkpoint(args, name) - agent_ckpt_info, env_info, training_info = RLAgentTrainer.load_agents(args, name=name, tag=last_ckpt) - agent_ckpt = agent_ckpt_info[0] - start_step = env_info["step_count"] - start_timestep = env_info["timestep_count"] - ck_rewards = training_info["ck_list"] - print(f"Restarting training from step: {start_step} (timestep: {start_timestep})") + last_ckpt = RLAgentTrainer.get_most_recent_checkpoint(args, name=name) + if last_ckpt: + agent_ckpt_info, env_info, training_info = RLAgentTrainer.load_agents(args, name=name, tag=last_ckpt) + agent_ckpt = agent_ckpt_info[0] + start_step = env_info["step_count"] + start_timestep = env_info["timestep_count"] + ck_rewards = training_info["ck_list"] + print(f"Restarting training from step: {start_step} (timestep: {start_timestep})") rlat = RLAgentTrainer( @@ -62,7 +53,11 @@ def train_agent_with_checkpoints(args, total_training_timesteps, ck_rate, seed, For SP agents, they only are trained with themselves so the order doesn't matter. ''' - rlat.train_agents(total_train_timesteps=total_training_timesteps, tag_for_returning_agent=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL, resume_ck_list=ck_rewards) + rlat.train_agents( + total_train_timesteps=total_training_timesteps, + tag_for_returning_agent=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL, + resume_ck_list=ck_rewards + ) checkpoints_list = rlat.ck_list if serialize: @@ -70,12 +65,12 @@ def train_agent_with_checkpoints(args, total_training_timesteps, ck_rate, seed, return checkpoints_list -def ensure_we_will_have_enough_agents_in_population(teammates_len, - train_types, - eval_types, - num_SPs_to_train, - unseen_teammates_len=0, # only used for SPX teamtypes - ): +def ensure_enough_SP_agents(teammates_len, + train_types, + eval_types, + num_SPs_to_train, + unseen_teammates_len=0, # only used for SPX teamtypes + ): total_population_len = len(AgentPerformance.ALL) * num_SPs_to_train @@ -92,8 +87,8 @@ def ensure_we_will_have_enough_agents_in_population(teammates_len, for eval_type in eval_types: if eval_type in TeamType.ALL_TYPES_BESIDES_SP: eval_agents_len += teammates_len - elif train_type == TeamType.SELF_PLAY or train_type == TeamType.SELF_PLAY_ADVERSARY: - train_agents_len += 0 + elif eval_type == TeamType.SELF_PLAY or eval_type == TeamType.SELF_PLAY_ADVERSARY: + eval_agents_len += 0 else: eval_agents_len += unseen_teammates_len @@ -105,30 +100,71 @@ def ensure_we_will_have_enough_agents_in_population(teammates_len, f" num_SPs_to_train: {num_SPs_to_train}." 
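For reference, the per-team-type bookkeeping that ensure_enough_SP_agents applies to the evaluation types (and presumably, symmetrically, to the training types whose loop sits outside this hunk) reduces to the helper below. This is a standalone sketch: the TeamType members are passed in as plain strings purely for illustration, and the real function compares the resulting totals against len(AgentPerformance.ALL) * num_SPs_to_train.

def count_required_teammates(team_types, teammates_len, unseen_teammates_len,
                             non_sp_types, sp_types):
    # Mirrors the counting in ensure_enough_SP_agents: classic FCP-style team
    # types need a full team of `teammates_len` agents, pure self-play types
    # need no extra agents, and the SP-X style types need
    # `unseen_teammates_len` extra unseen teammates.
    needed = 0
    for team_type in team_types:
        if team_type in non_sp_types:
            needed += teammates_len
        elif team_type in sp_types:
            needed += 0
        else:
            needed += unseen_teammates_len
    return needed

# Illustrative stand-ins for TeamType.ALL_TYPES_BESIDES_SP and the SP types:
non_sp = {'HIGH_FIRST', 'MEDIUM_FIRST', 'LOW_FIRST', 'ALL_MIX'}
sp = {'SELF_PLAY', 'SELF_PLAY_ADVERSARY'}
print(count_required_teammates(['SELF_PLAY_HIGH', 'SELF_PLAY_LOW'], 2, 1, non_sp, sp))  # 2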
-def generate_hdim_and_seed(num_SPs_to_train): +def generate_hdim_and_seed(for_training: bool, num_of_required_agents: int): ''' - (hidden_dim, seed) = reward of selfplay - (256, 68)=362, (64, 14)=318 - (256, 13)=248, (64, 0)=230 - (256, 48)=20, (64, 30)=0 + Generates lists of seeds and hidden dimensions for a given number of agents for training or evaluation. + + Each setting is a pair (hidden_dim, seed). If the number of required agents + is less than or equal to the number of predefined settings, it selects from + the predefined seeds and hidden dimensions. Otherwise, it generates random + seeds and hidden dimensions to fill the remaining number of agents. + + Arguments: + for_training -- a boolean indicating whether to generate settings for training (True) or evaluation (False). + num_of_required_agents -- the number of (hidden_dim, seed) pairs to generate. + + Returns: + selected_seeds -- list of selected seeds + selected_hdims -- list of selected hidden dimensions ''' - # Tested in 3-chefs-small-kitchen: - good_seeds = [68, 14, 13, 0] - good_hdims = [256, 64, 256, 64] - # Not tested: - other_seeds_copied_from_HAHA = [2907, 2907, 105, 105, 8, 32, 128, 512] - other_hdims_copied_from_HAHA = [64, 256, 64, 256, 16, 64, 256, 1024] + # Predefined seeds and hidden dimensions for training + training_seeds = [1010, 2020, 2602, 13, 68, 2907, 105, 128] + training_hdims = [256] * len(training_seeds) - all_seeds = good_seeds + other_seeds_copied_from_HAHA - all_hdims = good_hdims + other_hdims_copied_from_HAHA + # Predefined seeds and hidden dimensions for evaluation + evaluation_seeds = [3031, 4041, 5051, 3708, 3809, 3910, 4607, 5506] + evaluation_hdims = [256] * len(evaluation_seeds) - selected_seeds = all_seeds[:num_SPs_to_train] - selected_hdims = all_hdims[:num_SPs_to_train] - return selected_seeds, selected_hdims + # Select appropriate predefined settings based on the input setting + if for_training: + seeds = training_seeds + hdims = training_hdims + min_seed = 0 + max_seed = 2999 + else: + seeds = evaluation_seeds + hdims = evaluation_hdims + min_seed, max_seed = 3000, 5999 + + + # Initialize selected lists + selected_seeds = [] + selected_hdims = [] + + # Check if we have enough predefined pairs + if num_of_required_agents <= len(seeds): + # Select predefined seeds and hdims + selected_seeds = seeds[:num_of_required_agents] + selected_hdims = hdims[:num_of_required_agents] + else: + # Use all predefined settings + selected_seeds = seeds[:] + selected_hdims = hdims[:] + # Generate additional random settings if more agents are needed + remaining = num_of_required_agents - len(seeds) + available_seeds = list(set(range(min_seed, max_seed)) - set(selected_seeds)) + random_seeds = random.sample(available_seeds, remaining) # Generate random seeds + random_hdims = [256] * remaining # Generate random hidden dimensions -def save_population(args, population): + # Append randomly generated settings to selected lists + selected_seeds += random_seeds + selected_hdims += random_hdims + + return selected_seeds, selected_hdims + +def save_categorized_SP_population(args, population): name_prefix = 'pop' for layout_name in args.layout_names: rt = RLAgentTrainer( @@ -147,16 +183,17 @@ def save_population(args, population): rt.save_agents(tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL) -def get_population(args, - ck_rate, - total_training_timesteps, - train_types, - eval_types, - num_SPs_to_train, - unseen_teammates_len=0, - force_training=False, - tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL, - ): +def 
get_categorized_SP_population( + args, + ck_rate, + total_training_timesteps, + train_types, + eval_types, + num_SPs_to_train, + unseen_teammates_len=0, + force_training=False, + tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL, + ): population = {layout_name: [] for layout_name in args.layout_names} @@ -170,38 +207,47 @@ def get_population(args, except FileNotFoundError as e: print(f'Could not find saved population, creating them from scratch...\nFull Error: {e}') - ensure_we_will_have_enough_agents_in_population(teammates_len=args.teammates_len, - unseen_teammates_len=unseen_teammates_len, - train_types=train_types, - eval_types=eval_types, - num_SPs_to_train=num_SPs_to_train) + ensure_enough_SP_agents( + teammates_len=args.teammates_len, + unseen_teammates_len=unseen_teammates_len, + train_types=train_types, + eval_types=eval_types, + num_SPs_to_train=num_SPs_to_train + ) - seed, h_dim = generate_hdim_and_seed(num_SPs_to_train) + seed, h_dim = generate_hdim_and_seed( + for_training=True, num_of_required_agents=num_SPs_to_train) inputs = [ - (args, total_training_timesteps, ck_rate, seed[i], h_dim[i], True) for i in range(num_SPs_to_train) + (args, total_training_timesteps, ck_rate, seed[i], h_dim[i], True) + for i in range(num_SPs_to_train) ] + if args.parallel: with concurrent.futures.ProcessPoolExecutor(max_workers=args.max_concurrent_jobs) as executor: arg_lists = list(zip(*inputs)) - dilled_results = list(executor.map(train_agent_with_checkpoints, *arg_lists)) + dilled_results = list(executor.map(train_SP_with_checkpoints, *arg_lists)) for dilled_res in dilled_results: checkpoints_list = dill.loads(dilled_res) for layout_name in args.layout_names: - layout_pop = RLAgentTrainer.get_checkedpoints_agents(args, checkpoints_list, layout_name) + layout_pop = RLAgentTrainer.get_checkedpoints_agents( + args, checkpoints_list, layout_name) population[layout_name].extend(layout_pop) else: for inp in inputs: - checkpoints_list = train_agent_with_checkpoints(args=inp[0], - total_training_timesteps = inp[1], - ck_rate=inp[2], - seed=inp[3], - h_dim=inp[4], - serialize=False) + checkpoints_list = train_SP_with_checkpoints( + args=inp[0], + total_training_timesteps = inp[1], + ck_rate=inp[2], + seed=inp[3], + h_dim=inp[4], + serialize=False + ) for layout_name in args.layout_names: - layout_pop = RLAgentTrainer.get_checkedpoints_agents(args, checkpoints_list, layout_name) + layout_pop = RLAgentTrainer.get_checkedpoints_agents( + args, checkpoints_list, layout_name) population[layout_name].extend(layout_pop) - save_population(args=args, population=population) + save_categorized_SP_population(args=args, population=population) - return population + return population \ No newline at end of file diff --git a/oai_agents/common/tags.py b/oai_agents/common/tags.py index 9f8a0b75..308f05b7 100644 --- a/oai_agents/common/tags.py +++ b/oai_agents/common/tags.py @@ -83,6 +83,8 @@ class TeammatesCollection: class KeyCheckpoints: # Tags to identify the type of model checkpoint to save/load BEST_EVAL_REWARD = 'best' # Use only for evaluation MOST_RECENT_TRAINED_MODEL = 'last' # Use only for training + CHECKED_MODEL_PREFIX = 'ck_' + REWARD_SUBSTR = '_rew_' class Prefix: SELF_PLAY = 'SP' diff --git a/oai_agents/common/teammates_collection.py b/oai_agents/common/teammates_collection.py index d54134bb..6f69fb39 100644 --- a/oai_agents/common/teammates_collection.py +++ b/oai_agents/common/teammates_collection.py @@ -267,24 +267,25 @@ def generate_TC(args, def get_best_SP_agent(args, population): + # all_agents = [agent 
for agent in population[args.layout_names[0]]] + all_agents = get_all_agents( args.layout_names[0], population) agents_scores_averaged_over_layouts = [] - for layout_name in args.layout_names: - all_agents = [agent for agent in population[layout_name]] - for agent in all_agents: scores = [agent.layout_scores[layout_name] for layout_name in args.layout_names] agents_scores_averaged_over_layouts.append((agent, sum(scores)/len(scores))) best_agent = max(agents_scores_averaged_over_layouts, key=lambda x: x[1]) return best_agent[0] - +def get_all_agents(layout_name, population): + all_agents = [agent for agent in population[layout_name]] + return all_agents def update_eval_collection_with_eval_types_from_file(args, agent, unseen_teammates_len, eval_types, eval_collection): for teammates in eval_types: if teammates.team_type not in eval_collection[teammates.layout_name]: eval_collection[teammates.layout_name][teammates.team_type] = [] - tms_path = Path.cwd() / 'agent_models' / teammates.names[0] + tms_path = RLAgentTrainer.get_model_path(base_dir=Path.cwd(), model_name=teammates.names[0]) if teammates.load_from_pop_structure: layout_population, _, _ = RLAgentTrainer.load_agents(args, path=tms_path, tag=teammates.tags[0]) agents_perftag_score_all = [(agent, diff --git a/sandbox/fix_pop_ck_list_after_continued_run.py b/sandbox/fix_pop_ck_list_after_continued_run.py index 04093777..7adfa929 100644 --- a/sandbox/fix_pop_ck_list_after_continued_run.py +++ b/sandbox/fix_pop_ck_list_after_continued_run.py @@ -4,7 +4,7 @@ from oai_agents.agents.rl import RLAgentTrainer from oai_agents.common.arguments import get_arguments -from oai_agents.common.tags import TeamType +from oai_agents.common.tags import TeamType, KeyCheckpoints from oai_agents.common.learner import LearnerType from oai_agents.common.tags import KeyCheckpoints diff --git a/scripts/evaluate_agents.py b/scripts/evaluate_agents.py index 71a03c1a..004f4a99 100644 --- a/scripts/evaluate_agents.py +++ b/scripts/evaluate_agents.py @@ -1,7 +1,7 @@ import multiprocessing as mp import os from pathlib import Path -mp.set_start_method('spawn', force=True) +mp.set_start_method('spawn', force=True) import hashlib import sys @@ -27,6 +27,9 @@ THREE_PLAYERS_LOW_EVAL, THREE_PLAYERS_MEDIUM_EVAL, THREE_PLAYERS_HIGH_EVAL, + FOUR_PLAYERS_LOW_EVAL, + FOUR_PLAYERS_MEDIUM_EVAL, + FOUR_PLAYERS_HIGH_EVAL, FIVE_PLAYERS_LOW_EVAL, FIVE_PLAYERS_MEDIUM_FOR_ALL_BESIDES_STORAGE_ROOM_EVAL, FIVE_PLAYERS_HIGH_FOR_ALL_BESIDES_STORAGE_ROOM_EVAL, @@ -46,17 +49,27 @@ class Eval: } LAYOUT_NAMES_PATHs = { - 'selected_2_chefs_coordination_ring': { + 'selected_2_chefs_double_counter_circuit': { Eval.LOW: TWO_PLAYERS_LOW_EVAL, Eval.MEDIUM: TWO_PLAYERS_MEDIUM_EVAL, Eval.HIGH:TWO_PLAYERS_HIGH_EVAL }, - 'selected_2_chefs_counter_circuit': { + 'selected_2_chefs_secret_coordination_ring': { Eval.LOW: TWO_PLAYERS_LOW_EVAL, Eval.MEDIUM: TWO_PLAYERS_MEDIUM_EVAL, Eval.HIGH:TWO_PLAYERS_HIGH_EVAL }, - 'selected_2_chefs_cramped_room': { + 'selected_2_chefs_spacious_room_few_resources': { + Eval.LOW: TWO_PLAYERS_LOW_EVAL, + Eval.MEDIUM: TWO_PLAYERS_MEDIUM_EVAL, + Eval.HIGH:TWO_PLAYERS_HIGH_EVAL + }, + 'selected_2_chefs_spacious_room_no_counter_space': { + Eval.LOW: TWO_PLAYERS_LOW_EVAL, + Eval.MEDIUM: TWO_PLAYERS_MEDIUM_EVAL, + Eval.HIGH:TWO_PLAYERS_HIGH_EVAL + }, + 'selected_2_chefs_storage_room': { Eval.LOW: TWO_PLAYERS_LOW_EVAL, Eval.MEDIUM: TWO_PLAYERS_MEDIUM_EVAL, Eval.HIGH:TWO_PLAYERS_HIGH_EVAL @@ -78,6 +91,33 @@ class Eval: Eval.HIGH: THREE_PLAYERS_HIGH_EVAL, }, + 
'selected_4_chefs_double_counter_circuit': { + Eval.LOW: FOUR_PLAYERS_LOW_EVAL, + Eval.MEDIUM: FOUR_PLAYERS_MEDIUM_EVAL, + Eval.HIGH: FOUR_PLAYERS_HIGH_EVAL + }, + 'selected_4_chefs_secret_coordination_ring': { + Eval.LOW: FOUR_PLAYERS_LOW_EVAL, + Eval.MEDIUM: FOUR_PLAYERS_MEDIUM_EVAL, + Eval.HIGH: FOUR_PLAYERS_HIGH_EVAL + }, + 'selected_4_chefs_spacious_room_few_resources': { + Eval.LOW: FOUR_PLAYERS_LOW_EVAL, + Eval.MEDIUM: FOUR_PLAYERS_MEDIUM_EVAL, + Eval.HIGH: FOUR_PLAYERS_HIGH_EVAL + }, + 'selected_4_chefs_spacious_room_no_counter_space': { + Eval.LOW: FOUR_PLAYERS_LOW_EVAL, + Eval.MEDIUM: FOUR_PLAYERS_MEDIUM_EVAL, + Eval.HIGH: FOUR_PLAYERS_HIGH_EVAL + }, + 'selected_4_chefs_storage_room': { + Eval.LOW: FOUR_PLAYERS_LOW_EVAL, + Eval.MEDIUM: FOUR_PLAYERS_MEDIUM_EVAL, + Eval.HIGH: FOUR_PLAYERS_HIGH_EVAL + }, + + 'selected_5_chefs_counter_circuit': { Eval.LOW: FIVE_PLAYERS_LOW_EVAL, Eval.MEDIUM: FIVE_PLAYERS_MEDIUM_FOR_ALL_BESIDES_STORAGE_ROOM_EVAL, @@ -218,7 +258,7 @@ def process_reward(reward): for idx, agent_name in enumerate(all_mean_rewards): mean_values = [v / num_teamsets for v in cross_exp_mean[agent_name]] std_values = [v / num_teamsets for v in cross_exp_std[agent_name]] - + x = x_values + idx * width - width * (num_agents - 1) / 2 ax.bar(x, mean_values, width, yerr=std_values, label=f"Agent: {agent_name}", capsize=5) @@ -345,7 +385,7 @@ def evaluate_agent_for_layout(agent_name, path, layout_names, p_idxes, args, det m.update(str(s).encode()) arg_hash = m.hexdigest() cached_eval = Path(f"eval_cache/eval_{arg_hash}.pkl") - + if cached_eval.is_file(): print(f"Loading cached evaluation for agent {agent_name}") with open(cached_eval, "rb") as f: @@ -405,37 +445,124 @@ def run_parallel_evaluation(args, all_agents_paths, layout_names, p_idxes, deter def get_2_player_input(args): args.num_players = 2 - layout_names = ['selected_2_chefs_coordination_ring', - 'selected_2_chefs_counter_circuit', - 'selected_2_chefs_cramped_room'] + layout_names = [ + # 'selected_2_chefs_coordination_ring', + # 'selected_2_chefs_counter_circuit', + # 'selected_2_chefs_cramped_room', + 'selected_2_chefs_double_counter_circuit', + 'selected_2_chefs_secret_coordination_ring', + 'selected_2_chefs_spacious_room_few_resources', + 'selected_2_chefs_spacious_room_no_counter_space', + 'selected_2_chefs_storage_room' + ] + p_idxes = [0, 1] - all_agents_paths = { - 'SP': 'agent_models/Result/2/SP_hd64_seed14/best', - 'FCP': 'agent_models/FCP_correct/2/FCP_s2020_h256_tr(AMX)_ran/best', + all_agents_paths = { + 'SP': 'agent_models/Final/2/SP_s1010_h256_tr[SP]_ran/best', + 'LMH CUR': 'agent_models/Final/2/N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL]_cur_originaler/best', + 'LAST ALMH RAN REUSED 3A 60M': 'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_ran_originaler_attack2/best', + 'LAST ALMH CUR REUSED 3A 60M': 'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_cur_originaler_attack2/best', + # 'LAST ALMH-SP RAN REUSED 3A': 'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV_SP]_cur_originaler_attack2/best', + # 'LAST A-SP RAN REUSED 3A': 'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SP_SPADV]_ran_originaler_attack2/best', + 'LAST AMH CUR 3A':'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack2/best', + # 'LAST AMH CUR 
2A':'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack1/best', + # 'LAST AMH CUR 1A':'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack0/best', + 'BEST AMH CUR 3A':'agent_models/Final/2-ego-play-with-best-adv/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack2/best', + # 'BEST AMH CUR 2A':'agent_models/Final/2-ego-play-with-best-adv/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack1/best', + # 'BEST AMH CUR 1A':'agent_models/Final/2-ego-play-with-best-adv/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack0/best', + 'H': 'agent_models/Final/2/N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH]_ran_originaler/best', + + } + + teammate_lvl_sets = [ + [Eval.LOW], + [Eval.MEDIUM], + [Eval.HIGH] + ] + + return layout_names, p_idxes, all_agents_paths, teammate_lvl_sets, args + + +def get_2_player_input(args): + args.num_players = 2 + layout_names = [ + # 'selected_2_chefs_coordination_ring', + # 'selected_2_chefs_counter_circuit', + # 'selected_2_chefs_cramped_room', + 'selected_2_chefs_double_counter_circuit', + 'selected_2_chefs_secret_coordination_ring', + 'selected_2_chefs_spacious_room_few_resources', + 'selected_2_chefs_spacious_room_no_counter_space', + 'selected_2_chefs_storage_room' + ] - 'ALMH CUR 3A': 'agent_models/ALMH_CUR/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_cur_originaler_attack2/best', - 'ALMH RAN 3A': 'agent_models/ALMH_RAN/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_ran_originaler_attack2/best', - 'AMH CUR 3A': 'agent_models/AMH_CUR/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack2/best', - 'AMH RAN 3A': 'agent_models/AMH_RAN/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_ran_originaler_attack2/best', + p_idxes = [0, 1] - 'ALMH CUR 2A': 'agent_models/ALMH_CUR/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_cur_originaler_attack1/best', - 'ALMH RAN 2A': 'agent_models/ALMH_RAN/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_ran_originaler_attack1/best', - 'AMH CUR 2A': 'agent_models/AMH_CUR/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack1/best', - 'AMH RAN 2A': 'agent_models/AMH_RAN/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_ran_originaler_attack1/best', + all_agents_paths = { + 'SP': 'agent_models/Final/2/SP_s1010_h256_tr[SP]_ran/best', + 'LMH CUR': 'agent_models/Final/2/N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL]_cur_originaler/best', + 'LAST ALMH RAN REUSED 3A 60M': 'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_ran_originaler_attack2/best', + 'LAST ALMH CUR REUSED 3A 60M': 'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_cur_originaler_attack2/best', + # 'LAST ALMH-SP RAN REUSED 3A': 'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV_SP]_cur_originaler_attack2/best', + # 'LAST A-SP RAN REUSED 3A': 'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SP_SPADV]_ran_originaler_attack2/best', + 'LAST AMH CUR 
3A':'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack2/best', + # 'LAST AMH CUR 2A':'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack1/best', + # 'LAST AMH CUR 1A':'agent_models/Final/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack0/best', + 'BEST AMH CUR 3A':'agent_models/Final/2-ego-play-with-best-adv/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack2/best', + # 'BEST AMH CUR 2A':'agent_models/Final/2-ego-play-with-best-adv/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack1/best', + # 'BEST AMH CUR 1A':'agent_models/Final/2-ego-play-with-best-adv/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack0/best', + 'H': 'agent_models/Final/2/N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH]_ran_originaler/best', - 'ALMH CUR 1A': 'agent_models/ALMH_CUR/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_cur_originaler_attack0/best', - 'ALMH RAN 1A': 'agent_models/ALMH_RAN/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_ran_originaler_attack0/best', - 'AMH CUR 1A': 'agent_models/AMH_CUR/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack0/best', - 'AMH RAN 1A': 'agent_models/AMH_RAN/2/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_ran_originaler_attack0/best' } + teammate_lvl_sets = [ [Eval.LOW], [Eval.MEDIUM], [Eval.HIGH] ] + return layout_names, p_idxes, all_agents_paths, teammate_lvl_sets, args +def get_4_player_input(args): + args.num_players = 4 + layout_names = [ + # 'selected_4_chefs_coordination_ring', + # 'selected_4_chefs_counter_circuit', + # 'selected_4_chefs_cramped_room', + 'selected_4_chefs_double_counter_circuit', + 'selected_4_chefs_secret_coordination_ring', + 'selected_4_chefs_spacious_room_few_resources', + 'selected_4_chefs_spacious_room_no_counter_space', + 'selected_4_chefs_storage_room' + ] + + p_idxes = [0, 1, 2, 3] + + all_agents_paths = { + 'SP': 'agent_models/Final/4/SP_s1010_h256_tr[SP]_ran/best', + 'LMH CUR': 'agent_models/Final/4/N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL]_cur_originaler/best', + # 'LAST ALMH RAN REUSED 3A 60M': 'agent_models/Final/4/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_ran_originaler_attack2/best', + # 'LAST ALMH CUR REUSED 3A 60M': 'agent_models/Final/4/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV]_cur_originaler_attack2/best', + # 'LAST ALMH-SP RAN REUSED 3A': 'agent_models/Final/4/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPL_SPL_SPL_SPL_SPADV_SP]_cur_originaler_attack2/best', + # 'LAST A-SP RAN REUSED 3A': 'agent_models/Final/4/PWADV-N-1-SP_s1010_h256_tr[SP_SPADV]_ran_originaler_attack2/best', + # 'LAST AMH CUR 3A':'agent_models/Final/4/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack2/best', + 'LAST AMH CUR 2A':'agent_models/Final/4/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack1/best', + 'LAST AMH CUR 1A':'agent_models/Final/4/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack0/best', + 'BEST AMH CUR 
3A':'agent_models/Final/4-ego-play-with-best-adv/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack2/best', + # 'BEST AMH CUR 2A':'agent_models/Final/4-ego-play-with-best-adv/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack1/best', + # 'BEST AMH CUR 1A':'agent_models/Final/4-ego-play-with-best-adv/PWADV-N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH_SPM_SPM_SPM_SPM_SPADV]_cur_originaler_attack0/best', + # 'H': 'agent_models/Final/4/N-1-SP_s1010_h256_tr[SPH_SPH_SPH_SPH]_ran_originaler/best', + + } + + teammate_lvl_sets = [ + [Eval.LOW], + [Eval.MEDIUM], + [Eval.HIGH] + ] + + return layout_names, p_idxes, all_agents_paths, teammate_lvl_sets, args def get_3_player_input(args): args.num_players = 3 @@ -489,8 +616,8 @@ def get_5_player_input(args): if __name__ == "__main__": args = get_arguments() - # layout_names, p_idxes, all_agents_paths, teammate_lvl_sets, args = get_2_player_input(args) - layout_names, p_idxes, all_agents_paths, teammate_lvl_sets, args = get_3_player_input(args) + layout_names, p_idxes, all_agents_paths, teammate_lvl_sets, args = get_4_player_input(args) + # layout_names, p_idxes, all_agents_paths, teammate_lvl_sets, args = get_3_player_input(args) # layout_names, p_idxes, all_agents_paths, teammate_lvl_sets, args = get_5_player_input(args) deterministic = False # deterministic = True does not actually work :sweat_smile: @@ -529,13 +656,13 @@ def get_5_player_input(args): unseen_counts=unseen_counts, display_delivery=show_delivery_num, plot_name=plot_name) - - - plot_evaluation_results_line(all_mean_rewards=all_mean_rewards, - all_std_rewards=all_std_rewards, - layout_names=layout_names, - teammate_lvl_sets=teammate_lvl_sets, - num_players=args.num_players, - plot_name=plot_name) - + + + # plot_evaluation_results_line(all_mean_rewards=all_mean_rewards, + # all_std_rewards=all_std_rewards, + # layout_names=layout_names, + # teammate_lvl_sets=teammate_lvl_sets, + # num_players=args.num_players, + # plot_name=plot_name) + diff --git a/scripts/generate_agents_for_eval.py b/scripts/generate_agents_for_eval.py index 848f8157..3f69a708 100644 --- a/scripts/generate_agents_for_eval.py +++ b/scripts/generate_agents_for_eval.py @@ -3,7 +3,7 @@ from oai_agents.agents.rl import RLAgentTrainer from oai_agents.common.arguments import get_arguments -from oai_agents.common.tags import TeamType, TeammatesCollection +from oai_agents.common.tags import TeamType, TeammatesCollection, KeyCheckpoints from scripts.utils import get_fcp_population @@ -18,7 +18,7 @@ def train_FCP(args, name, teammates_collection, train_types, total_training_time train_types=train_types, seed=2602, ) - fcp_trainer.train_agents(total_train_timesteps=total_training_timesteps) + fcp_trainer.train_agents(total_train_timesteps=total_training_timesteps, tag_for_returning_agent=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL) def set_input(args, quick_test=False): @@ -26,8 +26,8 @@ def set_input(args, quick_test=False): args.teammates_len = 2 args.num_players = args.teammates_len + 1 # 3 players = 1 agent + 2 teammates args.exp_dir = f'eval/{args.teammates_len+1}_chefs' - - if not quick_test: + + if not quick_test: args.n_envs = 50 args.epoch_timesteps = 1e5 args.pop_total_training_timesteps = 5e6 @@ -76,7 +76,7 @@ def set_input(args, quick_test=False): ) teammates_collection[TeammatesCollection.EVAL] = teammates_collection[TeammatesCollection.TRAIN] - + # TODO: run this in parallel for fcp_train_types in all_FCP_train_types: vb = '_'.join(fcp_train_types) diff 
--git a/scripts/train_agents.py b/scripts/train_agents.py index 21b3a319..90add793 100644 --- a/scripts/train_agents.py +++ b/scripts/train_agents.py @@ -6,11 +6,13 @@ from oai_agents.common.learner import LearnerType from oai_agents.common.curriculum import Curriculum -from scripts.utils import (get_SP_agent, - get_FCP_agent_w_pop, - get_N_X_FCP_agents, - get_N_X_SP_agents, - ) +from scripts.utils import ( + get_SP_agents, + get_SP_agent, + get_FCP_agent_w_pop, + get_N_X_FCP_agents, + get_N_X_SP_agents, +) def SP(args): primary_train_types = [TeamType.SELF_PLAY] @@ -20,11 +22,20 @@ def SP(args): } curriculum = Curriculum(train_types=primary_train_types, is_random=True) - get_SP_agent(args=args, - train_types=curriculum.train_types, - eval_types=primary_eval_types, - curriculum=curriculum - ) + get_SP_agents( + args=args, + train_types=curriculum.train_types, + eval_types=primary_eval_types, + curriculum=curriculum, + tag_for_returning_agent=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL + ) + + # get_SP_agent( + # args=args, + # train_types=curriculum.train_types, + # eval_types=primary_eval_types, + # curriculum=curriculum + # ) def SPN_1ADV(args) -> None: @@ -41,11 +52,17 @@ def SPN_1ADV(args) -> None: adversary_play_config = AdversaryPlayConfig.MAP primary_train_types = [TeamType.SELF_PLAY, TeamType.SELF_PLAY_ADVERSARY] - primary_eval_types = {'generate': [TeamType.SELF_PLAY_HIGH, TeamType.SELF_PLAY_LOW, TeamType.SELF_PLAY_ADVERSARY], - 'load': []} + primary_eval_types = { + 'generate': [ + TeamType.SELF_PLAY_HIGH, + TeamType.SELF_PLAY_LOW, + TeamType.SELF_PLAY_ADVERSARY + ], + 'load': [] + } - curriculum = Curriculum(train_types = primary_train_types, - is_random = True) + curriculum = Curriculum( + train_types = primary_train_types, is_random = True) get_N_X_SP_agents( args, n_x_sp_train_types=curriculum.train_types, @@ -61,8 +78,8 @@ def SPN_1ADV_XSPCKP(args) -> None: ''' In N-agents games, a randomly initialized agent will be trained with N-X copies of itself and X unseen teammates. X unseen teammates can be composed by either one of the two conditions: - (a) 1 adversary and X-1 self-play checkedpoints. - (b) X self-play checkedpoints. + (a) 1 adversary and X-1 self-play KeyCheckpoints. + (b) X self-play KeyCheckpoints. e.g. when N is 4 and X is 1, the team can be composed by [SP, SP, SP, ADV] or [SP, SP, SP, H] or [SP, SP, SP, M] or [SP, SP, SP, L] in a 4-chef layout. 
when N is 4 and X is 2, the team can be composed @@ -75,23 +92,35 @@ def SPN_1ADV_XSPCKP(args) -> None: attack_rounds = 3 unseen_teammates_len = 1 adversary_play_config = AdversaryPlayConfig.MAP - primary_train_types = [TeamType.SELF_PLAY_HIGH, TeamType.SELF_PLAY_MEDIUM, TeamType.SELF_PLAY_ADVERSARY] + primary_train_types = [ + TeamType.SELF_PLAY_HIGH, + TeamType.SELF_PLAY_MEDIUM, + TeamType.SELF_PLAY_ADVERSARY + ] - primary_eval_types = {'generate': [TeamType.SELF_PLAY_HIGH, TeamType.SELF_PLAY_LOW, TeamType.SELF_PLAY_ADVERSARY], - 'load': []} + primary_eval_types = { + 'generate': [ + TeamType.SELF_PLAY_HIGH, + TeamType.SELF_PLAY_LOW, + TeamType.SELF_PLAY_ADVERSARY + ], + 'load': [] + } - curriculum = Curriculum(train_types = primary_train_types, - is_random = False, - total_steps = args.n_x_sp_total_training_timesteps//args.epoch_timesteps, - training_phases_durations_in_order={ - (TeamType.SELF_PLAY_ADVERSARY): 0.5, - }, - rest_of_the_training_probabilities={ - TeamType.SELF_PLAY_MEDIUM: 0.3, - TeamType.SELF_PLAY_HIGH: 0.3, - TeamType.SELF_PLAY_ADVERSARY: 0.4, - }, - probabilities_decay_over_time=0) + curriculum = Curriculum( + train_types = primary_train_types, + is_random = False, + total_steps = args.n_x_sp_total_training_timesteps//args.epoch_timesteps, + training_phases_durations_in_order={ + (TeamType.SELF_PLAY_ADVERSARY): 0.5, + }, + rest_of_the_training_probabilities={ + TeamType.SELF_PLAY_MEDIUM: 0.3, + TeamType.SELF_PLAY_HIGH: 0.3, + TeamType.SELF_PLAY_ADVERSARY: 0.4, + }, + probabilities_decay_over_time=0 + ) get_N_X_SP_agents( args, n_x_sp_train_types=curriculum.train_types, @@ -126,9 +155,9 @@ def SPN_XSPCKP(args) -> None: unseen_teammates_len = 1 primary_train_types = [TeamType.SELF_PLAY_HIGH, TeamType.SELF_PLAY_MEDIUM, TeamType.SELF_PLAY_LOW] primary_eval_types = { - 'generate': [TeamType.SELF_PLAY_HIGH, TeamType.SELF_PLAY_LOW], - 'load': [] - } + 'generate': [TeamType.SELF_PLAY_HIGH, TeamType.SELF_PLAY_LOW], + 'load': [] + } curriculum = Curriculum(train_types = primary_train_types, is_random=False, @@ -166,26 +195,29 @@ def FCP_mhri(args): primary_eval_types = {'generate' : [TeamType.HIGH_FIRST], 'load': []} - fcp_curriculum = Curriculum(train_types = primary_train_types, - is_random=False, - total_steps = args.fcp_total_training_timesteps//args.epoch_timesteps, - training_phases_durations_in_order={ - (TeamType.LOW_FIRST): 0.5, - (TeamType.MEDIUM_FIRST): 0.125, - (TeamType.HIGH_FIRST): 0.125, - }, - rest_of_the_training_probabilities={ - TeamType.LOW_FIRST: 0.4, - TeamType.MEDIUM_FIRST: 0.3, - TeamType.HIGH_FIRST: 0.3, - }, - probabilities_decay_over_time=0 - ) + fcp_curriculum = Curriculum( + train_types = primary_train_types, + is_random=False, + total_steps = args.fcp_total_training_timesteps//args.epoch_timesteps, + training_phases_durations_in_order={ + (TeamType.LOW_FIRST): 0.5, + (TeamType.MEDIUM_FIRST): 0.125, + (TeamType.HIGH_FIRST): 0.125, + }, + rest_of_the_training_probabilities={ + TeamType.LOW_FIRST: 0.4, + TeamType.MEDIUM_FIRST: 0.3, + TeamType.HIGH_FIRST: 0.3, + }, + probabilities_decay_over_time=0 + ) - _, _ = get_FCP_agent_w_pop(args, - fcp_train_types = fcp_curriculum.train_types, - fcp_eval_types=primary_eval_types, - fcp_curriculum=fcp_curriculum) + _, _ = get_FCP_agent_w_pop( + args, + fcp_train_types = fcp_curriculum.train_types, + fcp_eval_types=primary_eval_types, + fcp_curriculum=fcp_curriculum + ) @@ -196,15 +228,18 @@ def FCP_traditional(args): ''' primary_train_types = [TeamType.ALL_MIX] - primary_eval_types = {'generate' : 
[TeamType.HIGH_FIRST, TeamType.LOW_FIRST], - 'load': []} + primary_eval_types = { + 'generate' : [TeamType.HIGH_FIRST, TeamType.LOW_FIRST], + 'load': [] + } fcp_curriculum = Curriculum(train_types=primary_train_types, is_random=True) - _, _ = get_FCP_agent_w_pop(args, - fcp_train_types=fcp_curriculum.train_types, - fcp_eval_types=primary_eval_types, - fcp_curriculum=fcp_curriculum, - ) + _, _ = get_FCP_agent_w_pop( + args, + fcp_train_types=fcp_curriculum.train_types, + fcp_eval_types=primary_eval_types, + fcp_curriculum=fcp_curriculum, + ) def N_1_FCP(args): @@ -214,19 +249,31 @@ def N_1_FCP(args): fcp_eval_types = {'generate' : [], 'load': []} fcp_curriculum = Curriculum(train_types=fcp_train_types, is_random=True) - primary_train_types = [TeamType.SELF_PLAY_LOW, TeamType.SELF_PLAY_MEDIUM, TeamType.SELF_PLAY_HIGH] - primary_eval_types = {'generate': [TeamType.SELF_PLAY_LOW, TeamType.SELF_PLAY_MEDIUM, TeamType.SELF_PLAY_HIGH], - 'load': []} + primary_train_types = [ + TeamType.SELF_PLAY_LOW, + TeamType.SELF_PLAY_MEDIUM, + TeamType.SELF_PLAY_HIGH + ] + primary_eval_types = { + 'generate': [ + TeamType.SELF_PLAY_LOW, + TeamType.SELF_PLAY_MEDIUM, + TeamType.SELF_PLAY_HIGH + ], + 'load': [] + } n_1_fcp_curriculum = Curriculum(train_types=primary_train_types, is_random=True) - get_N_X_FCP_agents(args=args, - fcp_train_types=fcp_curriculum.train_types, - fcp_eval_types=fcp_eval_types, - n_1_fcp_train_types=n_1_fcp_curriculum.train_types, - n_1_fcp_eval_types=primary_eval_types, - fcp_curriculum=fcp_curriculum, - n_1_fcp_curriculum=n_1_fcp_curriculum, - unseen_teammates_len=unseen_teammates_len) + get_N_X_FCP_agents( + args=args, + fcp_train_types=fcp_curriculum.train_types, + fcp_eval_types=fcp_eval_types, + n_1_fcp_train_types=n_1_fcp_curriculum.train_types, + n_1_fcp_eval_types=primary_eval_types, + fcp_curriculum=fcp_curriculum, + n_1_fcp_curriculum=n_1_fcp_curriculum, + unseen_teammates_len=unseen_teammates_len + ) def set_input(args): @@ -305,7 +352,7 @@ def set_input(args): args.fcp_total_training_timesteps = int(5e6 * args.how_long) args.n_x_fcp_total_training_timesteps = int(2 * args.fcp_total_training_timesteps * args.how_long) - args.SP_seed, args.SP_h_dim = 68, 256 + args.SP_seed, args.SP_h_dim = 1010, 256 args.N_X_SP_seed, args.N_X_SP_h_dim = 1010, 256 args.FCP_seed, args.FCP_h_dim = 2020, 256 args.N_X_FCP_seed, args.N_X_FCP_h_dim = 2602, 256 @@ -317,7 +364,8 @@ def set_input(args): else: # Used for doing quick tests args.num_of_ckpoints = 10 args.sb_verbose = 1 - args.wandb_mode = 'disabled' + # args.wandb_mode = 'disabled' + args.wandb_mode = 'online' args.n_envs = 2 args.epoch_timesteps = 2 @@ -336,19 +384,21 @@ def set_input(args): args = get_arguments() args.quick_test = False args.parallel = True + args.num_of_training_variants = 2 + # args.device = 'cpu' args.pop_force_training = False args.adversary_force_training = False args.primary_force_training = False - args.teammates_len = 2 - args.how_long = 6 # Not effective in quick_test mode + args.teammates_len = 1 + args.how_long = 8 # Not effective in quick_test mode set_input(args=args) - SPN_1ADV_XSPCKP(args=args) + # SPN_1ADV_XSPCKP(args=args) - #SP(args) + SP(args) # FCP_traditional(args=args) diff --git a/scripts/utils/__init__.py b/scripts/utils/__init__.py index f03281e6..1d691992 100644 --- a/scripts/utils/__init__.py +++ b/scripts/utils/__init__.py @@ -1,4 +1,4 @@ -from .train_helper import get_SP_agent, get_FCP_agent_w_pop, get_N_X_FCP_agents, get_N_X_SP_agents +from .train_helper import get_SP_agents, get_SP_agent, 
get_FCP_agent_w_pop, get_N_X_FCP_agents, get_N_X_SP_agents from .eval_helper import get_eval_types_to_load from .eval_constants import * diff --git a/scripts/utils/eval_constants.py b/scripts/utils/eval_constants.py index a7e6dcfa..edda23b7 100644 --- a/scripts/utils/eval_constants.py +++ b/scripts/utils/eval_constants.py @@ -1,24 +1,98 @@ +# TWO_PLAYERS_LOW_EVAL = [ +# 'agent_models/Result/Eval/2/SP_hd64_seed11/ck_0', +# 'agent_models/Result/Eval/2/SP_hd256_seed42/ck_1_rew_18.666666666666668', +# 'agent_models/Result/Eval/2/SP_hd64_seed11/ck_1_rew_22.0', +# 'agent_models/Result/Eval/2/SP_hd256_seed7/ck_1_rew_38.0', +# 'agent_models/Result/Eval/2/SP_hd64_seed1995/ck_0', +# ] +# TWO_PLAYERS_MEDIUM_EVAL = [ +# 'agent_models/Result/Eval/2/SP_hd64_seed11/ck_2_rew_108.66666666666667', +# 'agent_models/Result/Eval/2/SP_hd64_seed11/ck_3_rew_170.66666666666666', +# 'agent_models/Result/Eval/2/SP_hd256_seed7/ck_2_rew_106.66666666666667', +# 'agent_models/Result/Eval/2/SP_hd256_seed7/ck_3_rew_192.66666666666666', +# 'agent_models/Result/Eval/2/SP_hd256_seed42/ck_2_rew_110.66666666666667', +# ] +# TWO_PLAYERS_HIGH_EVAL = [ +# 'agent_models/Result/Eval/2/SP_hd64_seed11/ck_24_rew_298.0', +# 'agent_models/Result/Eval/2/SP_hd64_seed1995/ck_20_rew_286.6666666666667', +# 'agent_models/Result/Eval/2/SP_hd256_seed7/ck_24_rew_258.0', +# 'agent_models/Result/Eval/2/SP_hd256_seed42/ck_22_rew_262.0', +# 'agent_models/Result/Eval/2/SP_hd256_seed42/ck_13_rew_274.0', +# ] + TWO_PLAYERS_LOW_EVAL = [ - 'agent_models/Result/Eval/2/SP_hd64_seed11/ck_0', - 'agent_models/Result/Eval/2/SP_hd256_seed42/ck_1_rew_18.666666666666668', - 'agent_models/Result/Eval/2/SP_hd64_seed11/ck_1_rew_22.0', - 'agent_models/Result/Eval/2/SP_hd256_seed7/ck_1_rew_38.0', - 'agent_models/Result/Eval/2/SP_hd64_seed1995/ck_0', + "agent_models/Final/2/SP_hd64_seed0/ck_0", + "agent_models/Final/2/SP_hd64_seed14/ck_0", + "agent_models/Final/2/SP_hd256_seed13/ck_0", + "agent_models/Final/2/SP_hd256_seed68/ck_0", + "agent_models/Final/2/SP_s1010_h256_tr[SP]_ran/ck_0", + "agent_models/Final/2/SP_hd64_seed14/ck_1_rew_18.4", + "agent_models/Final/2/SP_hd64_seed0/ck_1_rew_28.8", + "agent_models/Final/2/SP_hd256_seed13/ck_1_rew_30.8", + "agent_models/Final/2/SP_hd256_seed68/ck_1_rew_56.8" ] + TWO_PLAYERS_MEDIUM_EVAL = [ - 'agent_models/Result/Eval/2/SP_hd64_seed11/ck_2_rew_108.66666666666667', - 'agent_models/Result/Eval/2/SP_hd64_seed11/ck_3_rew_170.66666666666666', - 'agent_models/Result/Eval/2/SP_hd256_seed7/ck_2_rew_106.66666666666667', - 'agent_models/Result/Eval/2/SP_hd256_seed7/ck_3_rew_192.66666666666666', - 'agent_models/Result/Eval/2/SP_hd256_seed42/ck_2_rew_110.66666666666667', + "agent_models/Final/2/SP_hd64_seed14/ck_2_rew_88.4", + "agent_models/Final/2/SP_hd64_seed0/ck_2_rew_122.8", + "agent_models/Final/2/SP_hd256_seed13/ck_2_rew_128.8", + "agent_models/Final/2/SP_hd256_seed68/ck_2_rew_156.0", + "agent_models/Final/2/SP_hd64_seed14/ck_3_rew_152.8", + "agent_models/Final/2/SP_hd64_seed0/ck_3_rew_171.6" ] + TWO_PLAYERS_HIGH_EVAL = [ - 'agent_models/Result/Eval/2/SP_hd64_seed11/ck_24_rew_298.0', - 'agent_models/Result/Eval/2/SP_hd64_seed1995/ck_20_rew_286.6666666666667', - 'agent_models/Result/Eval/2/SP_hd256_seed7/ck_24_rew_258.0', - 'agent_models/Result/Eval/2/SP_hd256_seed42/ck_22_rew_262.0', - 'agent_models/Result/Eval/2/SP_hd256_seed42/ck_13_rew_274.0', + "agent_models/Final/2/SP_s1010_h256_tr[SP]_ran/ck_10_rew_238.0", + "agent_models/Final/2/SP_s1010_h256_tr[SP]_ran/ck_9_rew_232.8", + 
"agent_models/Final/2/SP_s1010_h256_tr[SP]_ran/ck_8_rew_234.0", + "agent_models/Final/2/SP_s1010_h256_tr[SP]_ran/ck_7_rew_230.8", + "agent_models/Final/2/SP_s1010_h256_tr[SP]_ran/ck_6_rew_230.8", + "agent_models/Final/2/SP_s1010_h256_tr[SP]_ran/ck_5_rew_230.8", + "agent_models/Final/2/SP_s1010_h256_tr[SP]_ran/ck_4_rew_224.8", + "agent_models/Final/2/SP_hd64_seed14/ck_10_rew_226.8", + "agent_models/Final/2/SP_hd64_seed14/ck_9_rew_224.0", + "agent_models/Final/2/SP_hd256_seed13/ck_5_rew_217.2", + "agent_models/Final/2/SP_hd64_seed0/ck_10_rew_221.6", + "agent_models/Final/2/SP_hd256_seed68/ck_10_rew_209.2", + "agent_models/Final/2/SP_hd64_seed14/ck_5_rew_212.0", + "agent_models/Final/2/SP_hd256_seed68/ck_9_rew_213.6" +] + +# Define the paths for four-player evaluation in three different lists +FOUR_PLAYERS_LOW_EVAL = [ + "agent_models/Final/4/SP_s1010_h256_tr[SP]_ran/ck_0", + "agent_models/Final/4/SP_hd64_seed0/ck_0", + "agent_models/Final/4/SP_hd64_seed14/ck_0", + "agent_models/Final/4/SP_hd256_seed13/ck_0", + "agent_models/Final/4/SP_hd256_seed68/ck_0", + "agent_models/Final/4/SP_hd64_seed0/ck_1_rew_54.2", + "agent_models/Final/4/SP_hd256_seed68/ck_1_rew_66.0", + "agent_models/Final/4/SP_hd256_seed13/ck_1_rew_79.0", + "agent_models/Final/4/SP_hd64_seed14/ck_1_rew_44.0", + "agent_models/Final/4/SP_hd256_seed68/ck_2_rew_142.0" +] + +FOUR_PLAYERS_MEDIUM_EVAL = [ + "agent_models/Final/4/SP_hd64_seed14/ck_2_rew_122.2", + "agent_models/Final/4/SP_hd256_seed13/ck_2_rew_197.2", + "agent_models/Final/4/SP_hd64_seed0/ck_3_rew_168.0", + "agent_models/Final/4/SP_hd256_seed68/ck_3_rew_214.0", + "agent_models/Final/4/SP_hd64_seed0/ck_4_rew_204.6", + "agent_models/Final/4/SP_hd64_seed14/ck_4_rew_243.6" +] + +FOUR_PLAYERS_HIGH_EVAL = [ + "agent_models/Final/4/SP_s1010_h256_tr[SP]_ran/ck_8_rew_308.0", + "agent_models/Final/4/SP_s1010_h256_tr[SP]_ran/ck_6_rew_309.6", + "agent_models/Final/4/SP_s1010_h256_tr[SP]_ran/ck_5_rew_299.7", + "agent_models/Final/4/SP_hd64_seed0/ck_10_rew_302.4", + "agent_models/Final/4/SP_hd64_seed14/ck_10_rew_295.6", + "agent_models/Final/4/SP_hd256_seed68/ck_9_rew_296.8", + "agent_models/Final/4/SP_hd256_seed68/ck_8_rew_296.2", + "agent_models/Final/4/SP_hd64_seed14/ck_9_rew_289.0", + "agent_models/Final/4/SP_hd256_seed13/ck_9_rew_299.2", + "agent_models/Final/4/SP_hd256_seed13/ck_10_rew_290.8" ] THREE_PLAYERS_LOW_EVAL = [ diff --git a/scripts/utils/train_helper.py b/scripts/utils/train_helper.py index 441514cd..e20ccd46 100644 --- a/scripts/utils/train_helper.py +++ b/scripts/utils/train_helper.py @@ -1,20 +1,38 @@ from oai_agents.agents.rl import RLAgentTrainer from oai_agents.common.tags import TeamType -from oai_agents.common.population import get_population +from oai_agents.common.population import get_categorized_SP_population, generate_hdim_and_seed from oai_agents.common.teammates_collection import generate_TC, get_best_SP_agent, generate_TC_for_ADV_agent, update_TC_w_ADV_teammates from oai_agents.common.curriculum import Curriculum from .common import load_agents, generate_name -from oai_agents.common.tags import Prefix -from oai_agents.common.tags import KeyCheckpoints +from oai_agents.common.tags import Prefix, KeyCheckpoints +from oai_agents.common.multi_setup_trainer import MultiSetupSPTrainer -def get_SP_agent(args, train_types, eval_types, curriculum, tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL): - name = generate_name(args, - prefix=Prefix.SELF_PLAY, - seed=args.SP_seed, - h_dim=args.SP_h_dim, - train_types=train_types, - has_curriculum= not curriculum.is_random) 
+def get_SP_agents(args, train_types, eval_types, curriculum, tag_for_returning_agent): + sp_trainer = MultiSetupSPTrainer( + args=args, + train_types=train_types, + eval_types=eval_types, + curriculum=curriculum, + tag_for_returning_agent=tag_for_returning_agent, + ) + return sp_trainer.get_multiple_trained_agents() + +def get_SP_agent( + args, + train_types, + eval_types, + curriculum, + tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL + ): + name = generate_name( + args, + prefix=Prefix.SELF_PLAY, + seed=args.SP_seed, + h_dim=args.SP_h_dim, + train_types=train_types, + has_curriculum=not curriculum.is_random + ) agents = load_agents(args, name=name, tag=tag, force_training=args.pop_force_training) if agents: @@ -34,25 +52,34 @@ def get_SP_agent(args, train_types, eval_types, curriculum, tag=KeyCheckpoints.M checkpoint_rate=args.pop_total_training_timesteps // args.num_of_ckpoints, ) - selfplay_trainer.train_agents(total_train_timesteps=args.pop_total_training_timesteps, tag_for_returning_agent=tag) + selfplay_trainer.train_agents( + total_train_timesteps=args.pop_total_training_timesteps, + tag_for_returning_agent=tag + ) return selfplay_trainer.get_agents()[0] -def get_N_X_SP_agents(args, - unseen_teammates_len:int, - n_x_sp_train_types:list, - n_x_sp_eval_types:dict, - curriculum:Curriculum, - tag:str=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL, - attack_rounds:int=-1, - adversary_play_config:str=None) -> tuple: - - curriculum.validate_curriculum_types(expected_types = [TeamType.SELF_PLAY_HIGH, - TeamType.SELF_PLAY_MEDIUM, - TeamType.SELF_PLAY_LOW, - TeamType.SELF_PLAY, - TeamType.SELF_PLAY_ADVERSARY], - unallowed_types = TeamType.ALL_TYPES_BESIDES_SP) +def get_N_X_SP_agents( + args, + unseen_teammates_len:int, + n_x_sp_train_types:list, + n_x_sp_eval_types:dict, + curriculum:Curriculum, + tag:str=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL, + attack_rounds:int=-1, + adversary_play_config:str=None + ): + + curriculum.validate_curriculum_types( + expected_types = [ + TeamType.SELF_PLAY_HIGH, + TeamType.SELF_PLAY_MEDIUM, + TeamType.SELF_PLAY_LOW, + TeamType.SELF_PLAY, + TeamType.SELF_PLAY_ADVERSARY + ], + unallowed_types = TeamType.ALL_TYPES_BESIDES_SP + ) if TeamType.SELF_PLAY_ADVERSARY in n_x_sp_train_types: @@ -62,19 +89,20 @@ def get_N_X_SP_agents(args, prefix = 'N-' + str(unseen_teammates_len) + '-SP' suffix = args.primary_learner_type - name = generate_name(args, - prefix = prefix, - seed = args.N_X_SP_seed, - h_dim = args.N_X_SP_h_dim, - train_types = n_x_sp_train_types, - has_curriculum = not curriculum.is_random, - suffix=suffix, - ) + name = generate_name( + args, + prefix = prefix, + seed = args.N_X_SP_seed, + h_dim = args.N_X_SP_h_dim, + train_types = n_x_sp_train_types, + has_curriculum = not curriculum.is_random, + suffix=suffix, + ) agents = load_agents(args, name=name, tag=tag, force_training=args.primary_force_training) if agents: return agents[0] - population = get_population( + population = get_categorized_SP_population( args=args, ck_rate=args.pop_total_training_timesteps // args.num_of_ckpoints, total_training_timesteps=args.pop_total_training_timesteps, @@ -87,189 +115,254 @@ def get_N_X_SP_agents(args, ) if TeamType.SELF_PLAY_ADVERSARY in n_x_sp_train_types: - joint_ADV_N_X_SP(args=args, - population=population, - curriculum=curriculum, - unseen_teammates_len=unseen_teammates_len, - adversary_play_config=adversary_play_config, - attack_rounds=attack_rounds, - n_x_sp_eval_types=n_x_sp_eval_types - ) + joint_ADV_N_X_SP( + args=args, + population=population, + 
curriculum=curriculum, + unseen_teammates_len=unseen_teammates_len, + adversary_play_config=adversary_play_config, + attack_rounds=attack_rounds, + n_x_sp_eval_types=n_x_sp_eval_types + ) else: - no_ADV_N_X_SP(args=args, - population=population, - curriculum=curriculum, - unseen_teammates_len=unseen_teammates_len, - n_x_sp_eval_types=n_x_sp_eval_types - ) - - -def joint_ADV_N_X_SP(args, population, curriculum, unseen_teammates_len, adversary_play_config, attack_rounds, n_x_sp_eval_types, tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL): + no_ADV_N_X_SP( + args=args, + population=population, + curriculum=curriculum, + unseen_teammates_len=unseen_teammates_len, + n_x_sp_eval_types=n_x_sp_eval_types + ) + + +def joint_ADV_N_X_SP( + args, + population, + curriculum, + unseen_teammates_len, + adversary_play_config, + attack_rounds, + n_x_sp_eval_types, + tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL + ): assert TeamType.SELF_PLAY_ADVERSARY in curriculum.train_types agent_to_be_attacked = get_best_SP_agent(args=args, population=population) adversary_agents = [] for attack_round in range(attack_rounds): - adversary_agent = get_adversary_agent(args=args, - agent_to_be_attacked=agent_to_be_attacked, - attack_round=attack_round) + adversary_agent = get_adversary_agent( + args=args, + agent_to_be_attacked=agent_to_be_attacked, + attack_round=attack_round + ) adversary_agents.append(adversary_agent) - name = generate_name(args, - prefix = f'PWADV-N-{unseen_teammates_len}-SP', - seed = args.N_X_SP_seed, - h_dim = args.N_X_SP_h_dim, - train_types = curriculum.train_types, - has_curriculum = not curriculum.is_random, - suffix=args.primary_learner_type + '_attack' + str(attack_round), - ) - - agents = load_agents(args, name=name, tag=tag, force_training=args.primary_force_training) + name = generate_name( + args, + prefix = f'PWADV-N-{unseen_teammates_len}-SP', + seed = args.N_X_SP_seed, + h_dim = args.N_X_SP_h_dim, + train_types = curriculum.train_types, + has_curriculum = not curriculum.is_random, + suffix=args.primary_learner_type + '_attack' + str(attack_round), + ) + + agents = load_agents( + args, + name=name, + tag=tag, + force_training=args.primary_force_training + ) if agents: agent_to_be_attacked = agents[0] continue - random_init_agent = RLAgentTrainer.generate_randomly_initialized_agent(args=args, - name=name, - learner_type=args.primary_learner_type, - hidden_dim=args.N_X_SP_h_dim, - seed=args.N_X_SP_seed) - - teammates_collection = generate_TC(args=args, - population=population, - agent=random_init_agent, - train_types=curriculum.train_types, - eval_types_to_generate=n_x_sp_eval_types['generate'], - eval_types_to_read_from_file=n_x_sp_eval_types['load'], - unseen_teammates_len=unseen_teammates_len, - use_entire_population_for_train_types_teammates=True) - - teammates_collection = update_TC_w_ADV_teammates(args=args, - teammates_collection=teammates_collection, - primary_agent=random_init_agent, - adversaries=adversary_agents, - adversary_play_config=adversary_play_config) + random_init_agent = RLAgentTrainer.generate_randomly_initialized_agent( + args=args, + name=name, + learner_type=args.primary_learner_type, + hidden_dim=args.N_X_SP_h_dim, + seed=args.N_X_SP_seed + ) + + teammates_collection = generate_TC( + args=args, + population=population, + agent=random_init_agent, + train_types=curriculum.train_types, + eval_types_to_generate=n_x_sp_eval_types['generate'], + eval_types_to_read_from_file=n_x_sp_eval_types['load'], + unseen_teammates_len=unseen_teammates_len, + 
use_entire_population_for_train_types_teammates=True + ) + + teammates_collection = update_TC_w_ADV_teammates( + args=args, + teammates_collection=teammates_collection, + primary_agent=random_init_agent, + adversaries=adversary_agents, + adversary_play_config=adversary_play_config + ) if attack_round == attack_rounds-1: total_train_timesteps = 4*args.n_x_sp_total_training_timesteps else: total_train_timesteps = args.n_x_sp_total_training_timesteps - n_x_sp_types_trainer = RLAgentTrainer(name=name, - args=args, - agent=random_init_agent, - teammates_collection=teammates_collection, - epoch_timesteps=args.epoch_timesteps, - n_envs=args.n_envs, - curriculum=curriculum, - seed=args.N_X_SP_seed, - hidden_dim=args.N_X_SP_h_dim, - learner_type=args.primary_learner_type, - checkpoint_rate=total_train_timesteps // args.num_of_ckpoints, - ) + n_x_sp_types_trainer = RLAgentTrainer( + name=name, + args=args, + agent=random_init_agent, + teammates_collection=teammates_collection, + epoch_timesteps=args.epoch_timesteps, + n_envs=args.n_envs, + curriculum=curriculum, + seed=args.N_X_SP_seed, + hidden_dim=args.N_X_SP_h_dim, + learner_type=args.primary_learner_type, + checkpoint_rate=total_train_timesteps // args.num_of_ckpoints, + ) n_x_sp_types_trainer.train_agents(total_train_timesteps=total_train_timesteps, tag_for_returning_agent=tag) agent_to_be_attacked = n_x_sp_types_trainer.get_agents()[0] -def no_ADV_N_X_SP(args, population, curriculum, unseen_teammates_len, n_x_sp_eval_types, tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL): +def no_ADV_N_X_SP( + args, + population, + curriculum, + unseen_teammates_len, + n_x_sp_eval_types, + tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL + ): assert TeamType.SELF_PLAY_ADVERSARY not in curriculum.train_types - name = generate_name(args, - prefix = f'N-{unseen_teammates_len}-SP', - seed = args.N_X_SP_seed, - h_dim = args.N_X_SP_h_dim, - train_types = curriculum.train_types, - has_curriculum = not curriculum.is_random, - suffix=args.primary_learner_type, - ) + name = generate_name( + args, + prefix = f'N-{unseen_teammates_len}-SP', + seed = args.N_X_SP_seed, + h_dim = args.N_X_SP_h_dim, + train_types = curriculum.train_types, + has_curriculum = not curriculum.is_random, + suffix=args.primary_learner_type, + ) agents = load_agents(args, name=name, tag=tag, force_training=args.primary_force_training) if agents: return agents[0] - random_init_agent = RLAgentTrainer.generate_randomly_initialized_agent(args=args, - name=name, - learner_type=args.primary_learner_type, - hidden_dim=args.N_X_SP_h_dim, - seed=args.N_X_SP_seed) - - teammates_collection = generate_TC(args=args, - population=population, - agent=random_init_agent, - train_types=curriculum.train_types, - eval_types_to_generate=n_x_sp_eval_types['generate'], - eval_types_to_read_from_file=n_x_sp_eval_types['load'], - unseen_teammates_len=unseen_teammates_len, - use_entire_population_for_train_types_teammates=True) - - n_x_sp_types_trainer = RLAgentTrainer(name=name, - args=args, - agent=random_init_agent, - teammates_collection=teammates_collection, - epoch_timesteps=args.epoch_timesteps, - n_envs=args.n_envs, - curriculum=curriculum, - seed=args.N_X_SP_seed, - hidden_dim=args.N_X_SP_h_dim, - learner_type=args.primary_learner_type, - checkpoint_rate=args.n_x_sp_total_training_timesteps // args.num_of_ckpoints, - ) - n_x_sp_types_trainer.train_agents(total_train_timesteps=args.n_x_sp_total_training_timesteps, tag_for_returning_agent=tag) - - - -def get_adversary_agent(args, agent_to_be_attacked, attack_round, 
tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL): + random_init_agent = RLAgentTrainer.generate_randomly_initialized_agent( + args=args, + name=name, + learner_type=args.primary_learner_type, + hidden_dim=args.N_X_SP_h_dim, + seed=args.N_X_SP_seed + ) + + teammates_collection = generate_TC( + args=args, + population=population, + agent=random_init_agent, + train_types=curriculum.train_types, + eval_types_to_generate=n_x_sp_eval_types['generate'], + eval_types_to_read_from_file=n_x_sp_eval_types['load'], + unseen_teammates_len=unseen_teammates_len, + use_entire_population_for_train_types_teammates=True + ) + + n_x_sp_types_trainer = RLAgentTrainer( + name=name, + args=args, + agent=random_init_agent, + teammates_collection=teammates_collection, + epoch_timesteps=args.epoch_timesteps, + n_envs=args.n_envs, + curriculum=curriculum, + seed=args.N_X_SP_seed, + hidden_dim=args.N_X_SP_h_dim, + learner_type=args.primary_learner_type, + checkpoint_rate=args.n_x_sp_total_training_timesteps // args.num_of_ckpoints, + ) + n_x_sp_types_trainer.train_agents( + total_train_timesteps=args.n_x_sp_total_training_timesteps, + tag_for_returning_agent=tag + ) + + + +def get_adversary_agent( + args, + agent_to_be_attacked, + attack_round, + tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL + ): # It doesn't matter what we set the variable, adversary_teammates_teamtype, # the purpose of it is to maintain consistent naming and correct TC/curriculum creation adversary_teammates_teamtype = TeamType.HIGH_FIRST - teammates_collection = generate_TC_for_ADV_agent(args=args, - agent_to_be_attacked=agent_to_be_attacked, - teamtype=adversary_teammates_teamtype) + teammates_collection = generate_TC_for_ADV_agent( + args=args, + agent_to_be_attacked=agent_to_be_attacked, + teamtype=adversary_teammates_teamtype + ) - name = generate_name(args, - prefix='ADV', - seed=args.ADV_seed, - h_dim=args.ADV_h_dim, - train_types=[adversary_teammates_teamtype], - has_curriculum=False, - suffix=args.adversary_learner_type +'_attack'+ str(attack_round)) + name = generate_name( + args, + prefix='ADV', + seed=args.ADV_seed, + h_dim=args.ADV_h_dim, + train_types=[adversary_teammates_teamtype], + has_curriculum=False, + suffix=args.adversary_learner_type +'_attack'+ str(attack_round) + ) - agents = load_agents(args, name=name, tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL, force_training=args.adversary_force_training) + agents = load_agents( + args, + name=name, + tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL, + force_training=args.adversary_force_training + ) if agents: return agents[0] - adversary_trainer = RLAgentTrainer(name=name, - args=args, - agent=None, - teammates_collection=teammates_collection, - epoch_timesteps=args.epoch_timesteps, - n_envs=args.n_envs, - curriculum=Curriculum(train_types=[adversary_teammates_teamtype], is_random=True), - seed=args.ADV_seed, - hidden_dim=args.ADV_h_dim, - learner_type=args.adversary_learner_type, - checkpoint_rate=args.adversary_total_training_timesteps // args.num_of_ckpoints) - adversary_trainer.train_agents(total_train_timesteps=args.adversary_total_training_timesteps, tag_for_returning_agent=tag) + adversary_trainer = RLAgentTrainer( + name=name, + args=args, + agent=None, + teammates_collection=teammates_collection, + epoch_timesteps=args.epoch_timesteps, + n_envs=args.n_envs, + curriculum=Curriculum(train_types=[adversary_teammates_teamtype], is_random=True), + seed=args.ADV_seed, + hidden_dim=args.ADV_h_dim, + learner_type=args.adversary_learner_type, + 
checkpoint_rate=args.adversary_total_training_timesteps // args.num_of_ckpoints + ) + adversary_trainer.train_agents( + total_train_timesteps=args.adversary_total_training_timesteps, + tag_for_returning_agent=tag + ) return adversary_trainer.get_agents()[0] -def get_FCP_agent_w_pop(args, - fcp_train_types, - fcp_eval_types, - fcp_curriculum, - tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL): +def get_FCP_agent_w_pop( + args, + fcp_train_types, + fcp_eval_types, + fcp_curriculum, + tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL + ): - name = generate_name(args, - prefix=Prefix.FICTITIOUS_CO_PLAY, - seed=args.FCP_seed, - h_dim=args.FCP_h_dim, - train_types=fcp_train_types, - has_curriculum = not fcp_curriculum.is_random) + name = generate_name( + args, + prefix=Prefix.FICTITIOUS_CO_PLAY, + seed=args.FCP_seed, + h_dim=args.FCP_h_dim, + train_types=fcp_train_types, + has_curriculum = not fcp_curriculum.is_random + ) - population = get_population( + population = get_categorized_SP_population( args=args, ck_rate=args.pop_total_training_timesteps // args.num_of_ckpoints, total_training_timesteps=args.pop_total_training_timesteps, @@ -280,14 +373,21 @@ def get_FCP_agent_w_pop(args, tag=tag ) - teammates_collection = generate_TC(args=args, - population=population, - train_types=fcp_train_types, - eval_types_to_generate=fcp_eval_types['generate'], - eval_types_to_read_from_file=fcp_eval_types['load'], - use_entire_population_for_train_types_teammates=False) + teammates_collection = generate_TC( + args=args, + population=population, + train_types=fcp_train_types, + eval_types_to_generate=fcp_eval_types['generate'], + eval_types_to_read_from_file=fcp_eval_types['load'], + use_entire_population_for_train_types_teammates=False + ) - agents = load_agents(args, name=name, tag=tag, force_training=args.primary_force_training) + agents = load_agents( + args, + name=name, + tag=tag, + force_training=args.primary_force_training + ) if agents: return agents[0], population @@ -305,48 +405,70 @@ def get_FCP_agent_w_pop(args, checkpoint_rate=args.fcp_total_training_timesteps // args.num_of_ckpoints, ) - fcp_trainer.train_agents(total_train_timesteps=args.fcp_total_training_timesteps, tag_for_returning_agent=tag) + fcp_trainer.train_agents( + total_train_timesteps=args.fcp_total_training_timesteps, + tag_for_returning_agent=tag + ) return fcp_trainer.get_agents()[0], population -def get_N_X_FCP_agents(args, - fcp_train_types, - fcp_eval_types, - n_1_fcp_train_types, - n_1_fcp_eval_types, - fcp_curriculum, - n_1_fcp_curriculum, - unseen_teammates_len, - tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL): - - n_1_fcp_curriculum.validate_curriculum_types(expected_types = [TeamType.SELF_PLAY_HIGH, TeamType.SELF_PLAY_MEDIUM, TeamType.SELF_PLAY_LOW], - unallowed_types= TeamType.ALL_TYPES_BESIDES_SP) +def get_N_X_FCP_agents( + args, + fcp_train_types, + fcp_eval_types, + n_1_fcp_train_types, + n_1_fcp_eval_types, + fcp_curriculum, + n_1_fcp_curriculum, + unseen_teammates_len, + tag=KeyCheckpoints.MOST_RECENT_TRAINED_MODEL + ): + + n_1_fcp_curriculum.validate_curriculum_types( + expected_types = [ + TeamType.SELF_PLAY_HIGH, + TeamType.SELF_PLAY_MEDIUM, + TeamType.SELF_PLAY_LOW + ], + unallowed_types= TeamType.ALL_TYPES_BESIDES_SP + ) - name = generate_name(args, - prefix=f'N-{unseen_teammates_len}-FCP', - seed=args.N_X_FCP_seed, - h_dim=args.N_X_FCP_h_dim, - train_types=n_1_fcp_curriculum.train_types, - has_curriculum = not fcp_curriculum.is_random) + name = generate_name( + args, + prefix=f'N-{unseen_teammates_len}-FCP', + 
seed=args.N_X_FCP_seed, + h_dim=args.N_X_FCP_h_dim, + train_types=n_1_fcp_curriculum.train_types, + has_curriculum = not fcp_curriculum.is_random + ) - agents = load_agents(args, name=name, tag=tag, force_training=args.primary_force_training) + agents = load_agents( + args, + name=name, + tag=tag, + force_training=args.primary_force_training + ) if agents: return agents[0] - fcp_agent, population = get_FCP_agent_w_pop(args, - fcp_train_types=fcp_train_types, - fcp_eval_types=fcp_eval_types, - fcp_curriculum=fcp_curriculum) + fcp_agent, population = get_FCP_agent_w_pop( + args, + fcp_train_types=fcp_train_types, + fcp_eval_types=fcp_eval_types, + fcp_curriculum=fcp_curriculum + ) - teammates_collection = generate_TC(args=args, - population=population, - agent=fcp_agent, - train_types=n_1_fcp_train_types, - eval_types_to_generate=n_1_fcp_eval_types['generate'], - eval_types_to_read_from_file=n_1_fcp_eval_types['load'], - unseen_teammates_len=unseen_teammates_len, - use_entire_population_for_train_types_teammates=False) + teammates_collection = generate_TC( + args=args, + population=population, + agent=fcp_agent, + train_types=n_1_fcp_train_types, + eval_types_to_generate=n_1_fcp_eval_types['generate'], + eval_types_to_read_from_file=n_1_fcp_eval_types['load'], + unseen_teammates_len=unseen_teammates_len, + use_entire_population_for_train_types_teammates=False + ) fcp_trainer = RLAgentTrainer( name=name, @@ -362,5 +484,8 @@ def get_N_X_FCP_agents(args, checkpoint_rate=args.n_x_fcp_total_training_timesteps // args.num_of_ckpoints, ) - fcp_trainer.train_agents(total_train_timesteps=args.n_x_fcp_total_training_timesteps, tag_for_returning_agent=tag) + fcp_trainer.train_agents( + total_train_timesteps=args.n_x_fcp_total_training_timesteps, + tag_for_returning_agent=tag + ) return fcp_trainer.get_agents()[0], teammates_collection diff --git a/tests/test_oai_agents/test_base_agent.py b/tests/test_oai_agents/test_base_agent.py new file mode 100644 index 00000000..1ea13e6a --- /dev/null +++ b/tests/test_oai_agents/test_base_agent.py @@ -0,0 +1,85 @@ +from oai_agents.agents.base_agent import OAITrainer +from pathlib import Path +from oai_agents.common.tags import KeyCheckpoints +import shutil + +def test_list_agent_checked_tags(): + # Define base directory based on the current working directory + base_dir = Path.cwd() + + # Set up the directory structure for testing + # This will create the following structure within the current working directory: + # + # / + # └── agent_models/ + # └── test_agents_folder/ + # └── test_agent/ + # ├── ck_0/ + # ├── ck_1_rew_59.5/ + # ├── ck_2_rew_140.0/ + # ├── ck_10_rew_336.8888888888889/ + # ├── ck_3_invalid/ # Should not match + # ├── ck_4_rew_invalid/ # Should not match + # ├── unrelated_tag/ # Should not match + # ├── best/ # Should not match + # └── last/ # Should not match + # + # Only `ck_0`, `ck_1_rew_59.5`, `ck_2_rew_140.0`, and `ck_10_rew_336.8888888888889` + # should be returned by the function. 
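# Editor's note (not part of the diff): the matching rule sketched in the comment above
# can be read as a single pattern, assuming the literal prefix "ck_" and separator "_rew_"
# stand in for KeyCheckpoints.CHECKED_MODEL_PREFIX and KeyCheckpoints.REWARD_SUBSTR; the
# actual CheckedModelNameHandler implementation may differ.
#
#   import re
#   checked_tag = re.compile(r"^ck_(?:0|[1-9]\d*_rew_\d+(?:\.\d+)?)$")
#   assert checked_tag.match("ck_0") and checked_tag.match("ck_1_rew_59.5")
#   assert not checked_tag.match("ck_3_invalid") and not checked_tag.match("best")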
+ + test_dir = base_dir / "agent_models" / "test_agents_folder" / "test_agent" + test_dir.mkdir(parents=True, exist_ok=True) # Ensure all parent directories are created + + # Simulate directory structure with various tags + tag_names = [ + "ck_0", + "ck_1_rew_59.5", + "ck_2_rew_140.0", + "ck_10_rew_336.8888888888889", + "ck_3_invalid", # Invalid because it doesn't have a valid float after the integer + "ck_4_rew_invalid", # Invalid because reward value is not a float + "unrelated_tag", # Invalid because it doesn't start with `KeyCheckpoints.CHECKED_MODEL_PREFIX` + "best", + "last" + ] + + # Create these tag directories within the test directory + for tag_name in tag_names: + (test_dir / tag_name).mkdir(parents=True, exist_ok=True) + + # Mock args object with base_dir and exp_dir pointing to the test directory + class MockArgs: + def __init__(self, base_dir, exp_dir, layout_names=[]): + self.base_dir = base_dir + self.exp_dir = "test_agents_folder" + self.layout_names = layout_names + + args = MockArgs(base_dir=base_dir, exp_dir="test_agents_folder") + + # Call the function to test + checked_tags = OAITrainer.list_agent_checked_tags(args, name="test_agent") + + # Expected tags should only include those that match the pattern + expected_tags = [ + "ck_0", + "ck_1_rew_59.5", + "ck_2_rew_140.0", + "ck_10_rew_336.8888888888889" + ] + + # Print results for verification + if sorted(checked_tags) == sorted(expected_tags): + print("Test passed: Tags returned as expected.") + else: + print(f"Test failed: Expected {expected_tags}, but got {checked_tags}") + + # Clean up the test directories after the test + # This will remove the entire "agent_models/test_agents_folder" structure created for testing + shutil.rmtree(base_dir / "agent_models" / "test_agents_folder") + +# Run the test function +test_list_agent_checked_tags() + + + + diff --git a/tests/test_oai_agents/test_population.py b/tests/test_oai_agents/test_population.py new file mode 100644 index 00000000..df3bc219 --- /dev/null +++ b/tests/test_oai_agents/test_population.py @@ -0,0 +1,43 @@ +# test_population.py + +from oai_agents.common.population import generate_hdim_and_seed + +def test_generate_hdim_and_seed(): + ''' + Test function for generate_hdim_and_seed to ensure: + 1. The number of (hidden_dim, seed) pairs matches the number of required agents. + 2. All generated seeds are unique. + 3. Hidden dimensions are as expected (256). + ''' + + # Test cases + test_cases = [3, 5, 8, 10] # Testing for fewer than, equal to, and more than predefined settings + + for for_training in [True, False]: + setting_type = "training" if for_training else "evaluation" + print(f"\nTesting for {setting_type} settings:") + + for num_agents in test_cases: + print(f"\nTesting with {num_agents} agents:") + + # Generate (hidden_dim, seed) pairs + selected_seeds, selected_hdims = generate_hdim_and_seed(for_training=for_training, num_of_required_agents=num_agents) + + # Check that the correct number of agents is generated + assert len(selected_seeds) == num_agents, f"Expected {num_agents} seeds, got {len(selected_seeds)}" + assert len(selected_hdims) == num_agents, f"Expected {num_agents} hidden dims, got {len(selected_hdims)}" + + # Check that all seeds are unique + assert len(set(selected_seeds)) == num_agents, "Duplicate seeds found in the generated seeds." + + # Check that hidden dims are from the valid set (256) + assert all(hdim == 256 for hdim in selected_hdims), "Invalid hidden dimension found. Only 256 is allowed." 
+ + print(f"Test passed for {num_agents} agents.") + print("Selected seeds:", selected_seeds) + print("Selected hidden dimensions:", selected_hdims) + +# Ensure that this test script only runs when executed directly +if __name__ == "__main__": + print("Running tests for generate_hdim_and_seed...") + test_generate_hdim_and_seed()