From c27ae921c1d9d4c8a51dddf8573d27c7f580b1d7 Mon Sep 17 00:00:00 2001 From: nighood Date: Thu, 8 Jun 2023 21:48:12 +0800 Subject: [PATCH 01/16] env(rjy): add crowdsim env --- lzero/worker/__init__.py | 2 +- lzero/worker/muzero_collector.py | 88 +++++++++---- lzero/worker/muzero_evaluator.py | 5 +- zoo/CrowdSim/__init__.py | 0 .../config/CrowdSim_efficientzero_config.py | 99 ++++++++++++++ zoo/CrowdSim/config/__init__.py | 0 zoo/CrowdSim/envs/CrowdSim_env.py | 122 ++++++++++++++++++ zoo/CrowdSim/envs/__init__.py | 0 zoo/CrowdSim/envs/test_CrowdSim_env.py | 28 ++++ 9 files changed, 319 insertions(+), 25 deletions(-) create mode 100644 zoo/CrowdSim/__init__.py create mode 100644 zoo/CrowdSim/config/CrowdSim_efficientzero_config.py create mode 100644 zoo/CrowdSim/config/__init__.py create mode 100644 zoo/CrowdSim/envs/CrowdSim_env.py create mode 100644 zoo/CrowdSim/envs/__init__.py create mode 100644 zoo/CrowdSim/envs/test_CrowdSim_env.py diff --git a/lzero/worker/__init__.py b/lzero/worker/__init__.py index ef4ab2c65..aafaf5410 100644 --- a/lzero/worker/__init__.py +++ b/lzero/worker/__init__.py @@ -2,4 +2,4 @@ from .alphazero_evaluator import AlphaZeroEvaluator from .muzero_collector import MuZeroCollector from .muzero_evaluator import MuZeroEvaluator -from .gumbel_muzero_collector import GumbelMuZeroCollector +# from .gumbel_muzero_collector import GumbelMuZeroCollector diff --git a/lzero/worker/muzero_collector.py b/lzero/worker/muzero_collector.py index fffe6e84e..ec8ad7ebd 100644 --- a/lzero/worker/muzero_collector.py +++ b/lzero/worker/muzero_collector.py @@ -520,12 +520,28 @@ def collect(self, if timestep.done: self._total_episode_count += 1 reward = timestep.info['eval_episode_return'] - info = { - 'reward': reward, - 'time': self._env_info[env_id]['time'], - 'step': self._env_info[env_id]['step'], - 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], - } + if timestep.info.get('performance_info') is not None: + mean_aoi = timestep.info['performance_info']['mean_aoi'] + mean_energy_consumption = timestep.info['performance_info']['mean_energy_consumption'] + collected_data_amount = timestep.info['performance_info']['collected_data_amount'] + human_coverage = timestep.info['performance_info']['human_coverage'] + info = { + 'reward': reward, + 'time': self._env_info[env_id]['time'], + 'step': self._env_info[env_id]['step'], + 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], + 'mean_aoi': mean_aoi, + 'mean_energy_consumption': mean_energy_consumption, + 'collected_data_amount': collected_data_amount, + 'human_coverage': human_coverage, + } + else: + info = { + 'reward': reward, + 'time': self._env_info[env_id]['time'], + 'step': self._env_info[env_id]['step'], + 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], + } if self.policy_config.gumbel_algo: info['completed_value'] = completed_value_lst[env_id] / eps_steps_lst[env_id] collected_episode += 1 @@ -650,23 +666,49 @@ def _output_log(self, train_iter: int) -> None: if self.policy_config.gumbel_algo: completed_value = [d['completed_value'] for d in self._episode_info] self._total_duration += duration - info = { - 'episode_count': episode_count, - 'envstep_count': envstep_count, - 'avg_envstep_per_episode': envstep_count / episode_count, - 'avg_envstep_per_sec': envstep_count / duration, - 'avg_episode_per_sec': episode_count / duration, - 'collect_time': duration, - 'reward_mean': np.mean(episode_reward), - 'reward_std': np.std(episode_reward), - 'reward_max': 
np.max(episode_reward), - 'reward_min': np.min(episode_reward), - 'total_envstep_count': self._total_envstep_count, - 'total_episode_count': self._total_episode_count, - 'total_duration': self._total_duration, - 'visit_entropy': np.mean(visit_entropy), - # 'each_reward': episode_reward, - } + if self._episode_info[0].get('mean_aoi') is not None: + episode_aoi = [d['mean_aoi'] for d in self._episode_info] + episode_energy_consumption = [d['mean_energy_consumption'] for d in self._episode_info] + episode_collected_data_amount = [d['collected_data_amount'] for d in self._episode_info] + episode_human_coverage = [d['human_coverage'] for d in self._episode_info] + info = { + 'episode_count': episode_count, + 'envstep_count': envstep_count, + 'avg_envstep_per_episode': envstep_count / episode_count, + 'avg_envstep_per_sec': envstep_count / duration, + 'avg_episode_per_sec': episode_count / duration, + 'collect_time': duration, + 'reward_mean': np.mean(episode_reward), + 'reward_std': np.std(episode_reward), + 'reward_max': np.max(episode_reward), + 'reward_min': np.min(episode_reward), + 'total_envstep_count': self._total_envstep_count, + 'total_episode_count': self._total_episode_count, + 'total_duration': self._total_duration, + 'visit_entropy': np.mean(visit_entropy), + 'episode_mean_aoi': np.mean(episode_aoi), + 'episode_mean_energy_consumption': np.mean(episode_energy_consumption), + 'episode_mean_collected_data_amount': np.mean(episode_collected_data_amount), + 'episode_mean_human_coverage': np.mean(episode_human_coverage), + } + else: + info = { + 'episode_count': episode_count, + 'envstep_count': envstep_count, + 'avg_envstep_per_episode': envstep_count / episode_count, + 'avg_envstep_per_sec': envstep_count / duration, + 'avg_episode_per_sec': episode_count / duration, + 'collect_time': duration, + 'reward_mean': np.mean(episode_reward), + 'reward_std': np.std(episode_reward), + 'reward_max': np.max(episode_reward), + 'reward_min': np.min(episode_reward), + 'total_envstep_count': self._total_envstep_count, + 'total_episode_count': self._total_episode_count, + 'total_duration': self._total_duration, + 'visit_entropy': np.mean(visit_entropy), + # 'each_reward': episode_reward, + } if self.policy_config.gumbel_algo: info['completed_value'] = np.mean(completed_value) self._episode_info.clear() diff --git a/lzero/worker/muzero_evaluator.py b/lzero/worker/muzero_evaluator.py index 04d6fece9..eb2832a97 100644 --- a/lzero/worker/muzero_evaluator.py +++ b/lzero/worker/muzero_evaluator.py @@ -331,7 +331,10 @@ def eval( # Env reset is done by env_manager automatically. 
self._policy.reset([env_id]) reward = t.info['eval_episode_return'] - if 'episode_info' in t.info: + # 'performance_info' and 'episode_info' only choose one + if 'performance_info' in t.info: + eval_monitor.update_info(env_id, t.info['performance_info']) + elif 'episode_info' in t.info: eval_monitor.update_info(env_id, t.info['episode_info']) eval_monitor.update_reward(env_id, reward) self._logger.info( diff --git a/zoo/CrowdSim/__init__.py b/zoo/CrowdSim/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py new file mode 100644 index 000000000..acfbff5dc --- /dev/null +++ b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py @@ -0,0 +1,99 @@ +from easydict import EasyDict + +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(1e5) +reanalyze_ratio = 0. +robot_num = 2 +human_num = 59 # purdue +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_efficientzero_config = dict( + exp_name= + f'/root/result/crowd_result/CrowdSim_efficientzero_ns{num_simulations}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', + env=dict( + env_name='CrowdSim-v0', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + observation_shape=(robot_num+human_num)*4, + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='mlp', + lstm_hidden_size=256, + latent_state_dim=256, + discrete_action_encoding_type='one_hot', + # res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. 
+ collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_efficientzero_config = EasyDict(CrowdSim_efficientzero_config) +main_config = CrowdSim_efficientzero_config + +CrowdSim_efficientzero_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.CrowdSim_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='efficientzero', + import_names=['lzero.policy.efficientzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_efficientzero_create_config = EasyDict(CrowdSim_efficientzero_create_config) +create_config = CrowdSim_efficientzero_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. + """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/__init__.py b/zoo/CrowdSim/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/zoo/CrowdSim/envs/CrowdSim_env.py b/zoo/CrowdSim/envs/CrowdSim_env.py new file mode 100644 index 000000000..c88ae21b5 --- /dev/null +++ b/zoo/CrowdSim/envs/CrowdSim_env.py @@ -0,0 +1,122 @@ +from typing import Union, Optional + +import gym +import numpy as np +from itertools import product +import logging + +from ding.envs import BaseEnv, BaseEnvTimestep +from ding.envs import ObsPlusPrevActRewWrapper +from ding.torch_utils import to_ndarray +from ding.utils import ENV_REGISTRY + +import CrowdSim.envs as envs + + +@ENV_REGISTRY.register('crowdsim_lightzero') +class CrowdSimEnv(BaseEnv): + + def __init__(self, cfg: dict = {}) -> None: + self._cfg = cfg + self._init_flag = False + self._replay_path = None + self._robot_num = self._cfg.robot_num + self._human_num = self._cfg.human_num + self._observation_space = gym.spaces.Box( + low=float("-inf"), + high=float("inf"), + shape=((self._robot_num+self._human_num)*4,), + dtype=np.float32) + # action space + # one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] + self.real_action_space = list(product(self._cfg.one_uav_action_space, repeat=self._robot_num)) + one_uav_action_n = len(self._cfg.one_uav_action_space) + self._action_space = gym.spaces.Discrete(one_uav_action_n**self._robot_num) + self._action_space.seed(0) # default seed + self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, ), dtype=np.float32) + self._continuous = False + + def reset(self) -> np.ndarray: + if not self._init_flag: + self._env = gym.make('CrowdSim-v0') + # if self._replay_path is not None: + # self._env = gym.wrappers.RecordVideo( + # self._env, + # video_folder=self._replay_path, + # episode_trigger=lambda episode_id: True, + # name_prefix='rl-video-{}'.format(id(self)) + # ) + # if hasattr(self._cfg, 'obs_plus_prev_action_reward') and self._cfg.obs_plus_prev_action_reward: + # self._env = ObsPlusPrevActRewWrapper(self._env) + self._init_flag = True + if hasattr(self, '_seed') and hasattr(self, 
'_dynamic_seed') and self._dynamic_seed: + np_seed = 100 * np.random.randint(1, 1000) + self._env.seed(self._seed + np_seed) + self._action_space.seed(self._seed + np_seed) + elif hasattr(self, '_seed'): + self._env.seed(self._seed) + self._action_space.seed(self._seed) + self._eval_episode_return = 0 + # process obs + raw_obs = self._env.reset() + obs_list = to_ndarray(raw_obs.to_tensor()) + # human_obs, robot_obs = obs_list + obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) + action_mask = np.ones(self.action_space.n, 'int8') + obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} + + return obs + + def close(self) -> None: + if self._init_flag: + self._env.close() + self._init_flag = False + + def seed(self, seed: int, dynamic_seed: bool = True) -> None: + self._seed = seed + self._dynamic_seed = dynamic_seed + np.random.seed(self._seed) + + def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: + if isinstance(action, np.ndarray) and action.shape == (1, ): + action = action.squeeze() # 0-dim array + real_action = self.real_action_space[action] + assert isinstance(real_action, tuple) and len(real_action) == self._robot_num, "illegal action!" + raw_obs, rew, done, info = self._env.step(real_action) + obs_list = to_ndarray(raw_obs.to_tensor()) + obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) + + self._eval_episode_return += rew + if done: + info['eval_episode_return'] = self._eval_episode_return + # logging.INFO('one game finish!') + + action_mask = np.ones(self.action_space.n, 'int8') + obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} + rew = to_ndarray([rew]).astype(np.float32) + return BaseEnvTimestep(obs, rew, done, info) + + def enable_save_replay(self, replay_path: Optional[str] = None) -> None: + if replay_path is None: + replay_path = './video' + self._replay_path = replay_path + + def random_action(self) -> np.ndarray: + random_action = self.action_space.sample() + random_action = to_ndarray([random_action], dtype=np.int64) + return random_action + + @property + def observation_space(self) -> gym.spaces.Space: + return self._observation_space + + @property + def action_space(self) -> gym.spaces.Space: + return self._action_space + + @property + def reward_space(self) -> gym.spaces.Space: + return self._reward_space + + def __repr__(self) -> str: + return "LightZero CrowdSim Env" diff --git a/zoo/CrowdSim/envs/__init__.py b/zoo/CrowdSim/envs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/zoo/CrowdSim/envs/test_CrowdSim_env.py b/zoo/CrowdSim/envs/test_CrowdSim_env.py new file mode 100644 index 000000000..4f95511dc --- /dev/null +++ b/zoo/CrowdSim/envs/test_CrowdSim_env.py @@ -0,0 +1,28 @@ +import numpy as np +from easydict import EasyDict +from zoo.CrowdSim.envs.CrowdSim_env import CrowdSimEnv + +mcfg=EasyDict( + env_name='CrowdSim-v0', + robot_num = 2, + human_num = 59, # purdue + one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]]) + +def test_naive(cfg): + env = CrowdSimEnv(cfg) + env.seed(314) + assert env._seed == 314 + obs = env.reset() + assert obs['observation'].shape == (244,) + for i in range(10): + random_action = env.random_action() + timestep = env.step(random_action) + print(timestep) + assert isinstance(timestep.obs['observation'], np.ndarray) + assert isinstance(timestep.done, bool) + assert timestep.obs['observation'].shape == (244,) + assert timestep.reward.shape == (1, ) + print(env.observation_space, env.action_space, 
env.reward_space) + env.close() + +test_naive(mcfg) \ No newline at end of file From c6acd7dc519e1af9d770954c72a816e9ca57d3c9 Mon Sep 17 00:00:00 2001 From: nighood Date: Mon, 13 Nov 2023 11:49:35 +0800 Subject: [PATCH 02/16] config(rjy): add mz/ez config for crowdsim --- .../config/CrowdSim_efficientzero_config.py | 10 +- zoo/CrowdSim/config/CrowdSim_muzero_config.py | 105 ++++++++++++++++++ zoo/CrowdSim/envs/CrowdSim_env.py | 2 + 3 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 zoo/CrowdSim/config/CrowdSim_muzero_config.py diff --git a/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py index acfbff5dc..0252c64ee 100644 --- a/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py +++ b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py @@ -1,4 +1,6 @@ from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '2' # ============================================================== # begin of the most frequently changed config specified by the user @@ -9,10 +11,12 @@ num_simulations = 25 update_per_collect = 100 batch_size = 256 -max_env_step = int(1e5) +max_env_step = int(3e5) reanalyze_ratio = 0. robot_num = 2 -human_num = 59 # purdue +human_num = 10 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] # ============================================================== # end of the most frequently changed config specified by the user @@ -20,7 +24,7 @@ CrowdSim_efficientzero_config = dict( exp_name= - f'/root/result/crowd_result/CrowdSim_efficientzero_ns{num_simulations}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', + f'result/crowd_num_human/CrowdSim_efficientzero_step{max_env_step}_uav{robot_num}_human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', env=dict( env_name='CrowdSim-v0', robot_num = robot_num, diff --git a/zoo/CrowdSim/config/CrowdSim_muzero_config.py b/zoo/CrowdSim/config/CrowdSim_muzero_config.py new file mode 100644 index 000000000..8def49e3e --- /dev/null +++ b/zoo/CrowdSim/config/CrowdSim_muzero_config.py @@ -0,0 +1,105 @@ +from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '2' +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(3e5) +reanalyze_ratio = 0. 
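+# NOTE: the joint action space of the UAV team is the Cartesian product of ``one_uav_action_space``
+# over all UAVs, so the policy is configured with action_space_size = len(one_uav_action_space) ** robot_num
+# (5 ** 2 = 25 for the default setting below).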
+robot_num = 2 +human_num = 10 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_muzero_config = dict( + exp_name= + f'result/crowd_num_human/CrowdSim_muzero_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', + env=dict( + env_name='CrowdSim-v0', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + observation_shape=(robot_num+human_num)*4, + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='mlp', + lstm_hidden_size=256, + latent_state_dim=256, + self_supervised_learning_loss=True, # NOTE: default is False. + discrete_action_encoding_type='one_hot', + res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + ssl_loss_weight=2, # NOTE: default is 0. + grad_clip_value=0.5, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_muzero_config = EasyDict(CrowdSim_muzero_config) +main_config = CrowdSim_muzero_config + +CrowdSim_muzero_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.CrowdSim_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='muzero', + import_names=['lzero.policy.muzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_muzero_create_config = EasyDict(CrowdSim_muzero_create_config) +create_config = CrowdSim_muzero_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. 
+ """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/envs/CrowdSim_env.py b/zoo/CrowdSim/envs/CrowdSim_env.py index c88ae21b5..8af0729aa 100644 --- a/zoo/CrowdSim/envs/CrowdSim_env.py +++ b/zoo/CrowdSim/envs/CrowdSim_env.py @@ -62,6 +62,7 @@ def reset(self) -> np.ndarray: obs_list = to_ndarray(raw_obs.to_tensor()) # human_obs, robot_obs = obs_list obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) + assert len(obs)==(self._robot_num+self._human_num)*4 action_mask = np.ones(self.action_space.n, 'int8') obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} @@ -85,6 +86,7 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: raw_obs, rew, done, info = self._env.step(real_action) obs_list = to_ndarray(raw_obs.to_tensor()) obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) + assert len(obs)==(self._robot_num+self._human_num)*4 self._eval_episode_return += rew if done: From ad0cd02da2202a6ee1654ba40fc7310cb91cf0de Mon Sep 17 00:00:00 2001 From: nighood Date: Sun, 7 Apr 2024 23:17:25 +0800 Subject: [PATCH 03/16] env(rjy): add crowdsim env --- zoo/CrowdSim/envs/CrowdSim_env.py | 15 +- zoo/CrowdSim/envs/Crowdsim/__init__.py | 7 + zoo/CrowdSim/envs/Crowdsim/env/__init__.py | 1 + .../envs/Crowdsim/env/base_env_config.py | 137 ++++++++++ zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 252 ++++++++++++++++++ .../envs/Crowdsim/env/model/__init__.py | 0 zoo/CrowdSim/envs/Crowdsim/env/model/agent.py | 72 +++++ zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py | 89 +++++++ zoo/CrowdSim/envs/Crowdsim/env/model/utils.py | 248 +++++++++++++++++ zoo/CrowdSim/envs/test_CrowdSim_env.py | 1 + 10 files changed, 810 insertions(+), 12 deletions(-) create mode 100644 zoo/CrowdSim/envs/Crowdsim/__init__.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/__init__.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/model/__init__.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/model/agent.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/model/utils.py diff --git a/zoo/CrowdSim/envs/CrowdSim_env.py b/zoo/CrowdSim/envs/CrowdSim_env.py index 8af0729aa..21d31863e 100644 --- a/zoo/CrowdSim/envs/CrowdSim_env.py +++ b/zoo/CrowdSim/envs/CrowdSim_env.py @@ -10,7 +10,7 @@ from ding.torch_utils import to_ndarray from ding.utils import ENV_REGISTRY -import CrowdSim.envs as envs +import zoo.CrowdSim.envs.Crowdsim.env @ENV_REGISTRY.register('crowdsim_lightzero') @@ -38,16 +38,7 @@ def __init__(self, cfg: dict = {}) -> None: def reset(self) -> np.ndarray: if not self._init_flag: - self._env = gym.make('CrowdSim-v0') - # if self._replay_path is not None: - # self._env = gym.wrappers.RecordVideo( - # self._env, - # video_folder=self._replay_path, - # episode_trigger=lambda episode_id: True, - # name_prefix='rl-video-{}'.format(id(self)) - # ) - # if hasattr(self._cfg, 'obs_plus_prev_action_reward') and self._cfg.obs_plus_prev_action_reward: - # self._env = ObsPlusPrevActRewWrapper(self._env) + self._env = gym.make('CrowdSim-v0', dataset = self._cfg.dataset, custom_config = self._cfg) self._init_flag = True if hasattr(self, '_seed') and hasattr(self, '_dynamic_seed') and self._dynamic_seed: np_seed = 100 * np.random.randint(1, 1000) @@ -59,7 
+50,7 @@ def reset(self) -> np.ndarray: self._eval_episode_return = 0 # process obs raw_obs = self._env.reset() - obs_list = to_ndarray(raw_obs.to_tensor()) + obs_list = raw_obs.to_array() # human_obs, robot_obs = obs_list obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) assert len(obs)==(self._robot_num+self._human_num)*4 diff --git a/zoo/CrowdSim/envs/Crowdsim/__init__.py b/zoo/CrowdSim/envs/Crowdsim/__init__.py new file mode 100644 index 000000000..5fb297682 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/__init__.py @@ -0,0 +1,7 @@ +import logging +from gym.envs.registration import register +logger = logging.getLogger(__name__) +register( + id='CrowdSim-v0', + entry_point='zoo.CrowdSim.envs.Crowdsim.env.crowd_sim:CrowdSim', +) diff --git a/zoo/CrowdSim/envs/Crowdsim/env/__init__.py b/zoo/CrowdSim/envs/Crowdsim/env/__init__.py new file mode 100644 index 000000000..eddfe3037 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/__init__.py @@ -0,0 +1 @@ +from .crowd_sim import CrowdSim diff --git a/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py b/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py new file mode 100644 index 000000000..865ab56fb --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py @@ -0,0 +1,137 @@ +from easydict import EasyDict + +# define base config +base_config = EasyDict({ + "num_timestep": 120, # 120x15=1800s=30min + "step_time": 15, # second per step + "max_uav_energy": 359640, # 359640 J <-- 359.64 kJ (4500mah, 22.2v) 大疆经纬 + "rotation_limit": 360, + "diameter_of_human_blockers": 0.5, # m + "h_rx": 1.3, # m, height of RX + "h_b": 1.7, # m, height of a human blocker + "velocity": 18, + "frequence_band": 28, # GHz + "h_d": 120, # m, height of drone-BS + "alpha_nlos": 113.63, + "beta_nlos": 1.16, + "zeta_nlos": 2.58, # Frequency 28GHz, sub-urban. 
channel model + "alpha_los": 84.64, + "beta_los": 1.55, + "zeta_los": 0.12, + "g_tx": 0, # dB + "g_rx": 5, # dB + "tallest_locs": None, # obstacle + "no_fly_zone": None, # obstacle + "start_timestamp": 1519894800, + "end_timestamp": 1519896600, + "energy_factor": 3, # TODO: energy factor in reward function + "robot_num": 2, # TODO: 多了要用多进程 + "rollout_num": 1, # 1 2 6 12 15, calculated based on robot_num +}) + +# define all dataset configs +dataset_configs = { + 'purdue': EasyDict({ + "lower_left": [-86.93, 40.4203], # 经纬度 + "upper_right": [-86.9103, 40.4313], + "nlon": 200, + "nlat": 120, + "human_num": 59, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/purdue/59 users.csv', + "sensing_range": 23.2, # unit 23.2 + "one_uav_action_space": [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30], [21, 21], [21, -21], [-21, 21], [-21, -21]], + "max_x_distance": 1667, # m + "max_y_distance": 1222, # m + "density_of_human_blockers": 30000 / 1667 / 1222, # block/m2 + }), + 'ncsu': EasyDict({ + "lower_left": [-78.6988, 35.7651], # 经纬度 + "upper_right": [-78.6628, 35.7896], + "nlon": 3600, + "nlat": 2450, + "human_num": 33, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/NCSU/33 users.csv', + "sensing_range": 220, # unit 220 + "one_uav_action_space": [[0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210]], + "max_x_distance": 3255.4913305859623, + "max_y_distance": 2718.3945272795013, + "density_of_human_blockers": 30000 / 3255.4913305859623 / 2718.3945272795013, # block/m2 + }), + 'kaist': EasyDict({ + "lower_left": [127.3475, 36.3597], + "upper_right": [127.3709, 36.3793], + "nlon": 2340, + "nlat": 1960, + "human_num": 92, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/KAIST/92 users.csv', + "sensing_range": 220, # unit 220 + "one_uav_action_space": [[0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210]], + "max_x_distance": 2100.207579392558, + "max_y_distance": 2174.930950809533, + "density_of_human_blockers": 30000 / 2100.207579392558 / 2174.930950809533, # block/m2 + }), + # ... 
could add more dataset configs here +} + +# get config according to data set name +def get_selected_config(data_set_name): + if data_set_name in dataset_configs: + dataset_config = dataset_configs[data_set_name] + return EasyDict({**base_config, **dataset_config}) + else: + raise ValueError(f"Data set '{data_set_name}' not found.") + +# r:meters, 2d distance +# threshold: dB +def try_sensing_range(r, data_set_name): + import math + config = get_selected_config(data_set_name) + p_los = math.exp( + -config.density_of_human_blockers * config.diameter_of_human_blockers * r * (config.h_b - config.h_rx) / ( + config.h_d - config.h_rx)) + p_nlos = 1 - p_los + PL_los = config.alpha_los + config.beta_los * 10 * math.log10( + math.sqrt(r * r + config.h_d * config.h_d)) + config.zeta_los + PL_nlos = config.alpha_nlos + config.beta_nlos * 10 * math.log10( + math.sqrt(r * r + config.h_d * config.h_d)) + config.zeta_nlos + PL = p_los * PL_los + p_nlos * PL_nlos + CL = PL - config.g_tx - config.g_rx + print(p_los, p_nlos) + print(CL) + + +# Maximum Coupling Loss (110dB is recommended) +# purdue: + +# 123dB -> 560m -> 60.5 range +# 121dB -> 420m -> 45.4 range +# 119dB -> 300m -> 32.4 range +# 117dB -> 215m -> 23.2 range √ +# 115dB -> 140m -> 15 range + +# ncsu: +# 123dB -> 600m -> 600 range +# 121dB -> 435m -> 435 range +# 119dB -> 315m -> 315 range +# 117dB -> 220m -> 220 range √ +# 115dB -> 145m -> 145 range + +# kaist: +# 123dB -> 600m -> 600 range +# 121dB -> 435m -> 435 range +# 119dB -> 315m -> 315 range +# 117dB -> 220m -> 220 range √ +# 115dB -> 145m -> 145 range + +# san: +# 123dB -> 600m -> 600 range +# 121dB -> 450m -> 450 range +# 119dB -> 330m -> 330 range +# 117dB -> 240m -> 240 range √ +# 115dB -> 165m -> 165 range + +if __name__ == '__main__': + # example usage + data_set_name = 'purdue' + selected_config = get_selected_config(data_set_name) + print(selected_config) \ No newline at end of file diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py new file mode 100644 index 000000000..c27a888d5 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -0,0 +1,252 @@ +import pandas as pd + +import logging +import random +import gym +from shapely.geometry import Point +import numpy as np +import folium +from folium.plugins import TimestampedGeoJson, AntPath + +from zoo.CrowdSim.envs.Crowdsim.env.model.utils import * +from zoo.CrowdSim.envs.Crowdsim.env.model.mdp import HumanState, RobotState, JointState +from zoo.CrowdSim.envs.Crowdsim.env.base_env_config import get_selected_config + + + +class CrowdSim(gym.Env): + metadata = {'render.modes': ['human']} + + def __init__(self, dataset, custom_config=None): + # mcfg should include: + self.time_limit = None + self.robots = None + self.humans = None + self.agent = None + self.current_timestep = None + self.phase = None + + self.config = get_selected_config(dataset) + self.config.update(custom_config) + + self.human_num = self.config.human_num + self.robot_num = self.config.robot_num + self.num_timestep = self.config.num_timestep # max timestep + self.step_time = self.config.step_time # second per step + self.start_timestamp = self.config.start_timestamp # fit timpestamp to datetime + self.max_uav_energy = self.config.max_uav_energy + # self.action_space = gym.spaces.Discrete(4**self.robot_num) # for each robot have 4 actions(up, down, left, right), then product + self.action_space = gym.spaces.Discrete(len(self.config.one_uav_action_space)) + # human obs: [px, py, theta, aoi] + # robot 
obs: [px, py, theta, energy] + # self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(4), dtype=np.float32) + self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(self.robot_num+self.human_num, 4), dtype=np.float32) + + # load_dataset + self.nlon = self.config.nlon + self.nlat = self.config.nlat + self.lower_left = self.config.lower_left + self.upper_right = self.config.upper_right + self.human_df = pd.read_csv(self.config.dataset_dir) + logging.info("Finished reading {} rows".format(len(self.human_df))) + # # for temporarily processing data + # sample_list=np.random.choice(self.human_num, size=[50,], replace=False) + # sample_list=sample_list[np.argsort(sample_list)] + # print(sample_list) + # self.human_df= self.human_df[self.human_df["id"].isin(sample_list)] + # for i,human_id in enumerate(sample_list): + # mask=(self.human_df["id"]==human_id) + # self.human_df.loc[mask,"id"]=i + # self.human_df=self.human_df.sort_values(by=["id","timestamp"],ascending=[True,True]) + # print(self.human_df.head()) + # self.human_df.to_csv("50 users-5.csv",index=False) + # exit(0) + + self.human_df['t'] = pd.to_datetime(self.human_df['timestamp'], unit='s') # 's' stands for second + self.human_df['aoi'] = -1 # 加入aoi记录aoi + self.human_df['energy'] = -1 # 加入energy记录energy + logging.info('human number: {}'.format(self.human_num)) + logging.info('Robot number: {}'.format(self.robot_num)) + + # for debug + self.current_human_aoi_list = np.ones([self.human_num, ]) + self.mean_aoi_timelist = np.ones([self.config.num_timestep + 1, ]) + self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.robot_x_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.update_human_timelist = np.zeros([self.config.num_timestep, ]) + self.data_collection = 0 + + def set_agent(self, agent): + self.agent = agent + + def generate_human(self, human_id, selected_data, selected_next_data): + human = Human(human_id, self.config) + px, py, theta = get_human_position_from_list(self.current_timestep, human_id, selected_data, selected_next_data, self.config) + # human obs: [px, py, theta, aoi] + human.set(px, py, theta, 1) # initial aoi of human is 1 + return human + + def generate_robot(self, robot_id): + robot = Robot(robot_id, self.config) + # robot obs: [px, py, theta, energy] + robot.set(self.nlon / 2, self.nlat / 2, 0, self.max_uav_energy) # robot有energy + return robot + + def sync_human_df(self, human_id, current_timestep, aoi): + """ + Overview: + Sync the human_df with the current timestep and aoi. + Args: + - human_id (:obj:`int`): The id of the human. + - current_timestep (:obj:`int`): The current timestep. + - aoi (:obj:`int`): The aoi of the human. 
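+        Note:
+            The single-cell update is written with ``DataFrame.iat`` on a pre-computed row index,
+            which is faster than the equivalent ``DataFrame.loc`` assignment (the slower variant is
+            kept as a comment in the body).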
+ """ + current_timestamp = self.start_timestamp + current_timestep * self.step_time + current_index = self.human_df[ + (self.human_df.id == human_id) & (self.human_df.timestamp == current_timestamp)].index + # self.human_df.loc[current_index, "aoi"] = aoi # slower + self.human_df.iat[current_index.values[0], 9] = aoi # faster + + def reset(self, phase='test', test_case=None): + self.current_timestep = 0 + + # generate human + self.humans = [] + selected_data, selected_next_data = get_human_position_list(self.current_timestep, self.human_df, self.config) + for human_id in range(self.human_num): + self.humans.append(self.generate_human(human_id, selected_data, selected_next_data)) + self.sync_human_df(human_id, self.current_timestep, 1) + + # generate robot + self.robots = [] + for robot_id in range(self.robot_num): + self.robots.append(self.generate_robot(robot_id)) + + self.current_human_aoi_list = np.ones([self.human_num, ]) + self.mean_aoi_timelist = np.ones([self.config.num_timestep + 1, ]) + self.mean_aoi_timelist[self.current_timestep] = np.mean(self.current_human_aoi_list) + self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.robot_energy_timelist[self.current_timestep, :] = self.max_uav_energy + self.robot_x_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.robot_x_timelist[self.current_timestep, :] = self.nlon / 2 + self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.robot_y_timelist[self.current_timestep, :] = self.nlat / 2 + self.update_human_timelist = np.zeros([self.config.num_timestep, ]) + self.data_collection = 0 + + # for visualization + self.plot_states = [] + self.robot_actions = [] + self.rewards = [] + self.action_values = [] + self.plot_states.append([[robot.get_obs() for robot in self.robots], + [human.get_obs() for human in self.humans]]) + + state = JointState([robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]) + return state + + def step(self, action): + new_robot_position = np.zeros([self.robot_num, 2]) + current_enenrgy_consume = np.zeros([self.robot_num, ]) + + num_updated_human = 0 # number of humans whose AoI is updated + + for robot_id, robot in enumerate(self.robots): + new_robot_px = robot.px + action[robot_id][0] + new_robot_py = robot.py + action[robot_id][1] + robot_theta = get_theta(0, 0, action[robot_id][0], action[robot_id][1]) + # print(action[robot_id], robot_theta) + is_stopping = True if (action[robot_id][0] == 0 and action[robot_id][1] == 0) else False + is_collide = True if judge_collision(new_robot_px, new_robot_py, robot.px, robot.py, self.config) else False + + if is_stopping is True: + consume_energy = consume_uav_energy(0, self.step_time, self.config) + else: + consume_energy = consume_uav_energy(self.step_time, 0, self.config) + current_enenrgy_consume[robot_id] = consume_energy / self.config.max_uav_energy + new_energy = robot.energy - consume_energy + self.robot_energy_timelist[self.current_timestep + 1][robot_id] = new_energy + + if is_collide is True: + new_robot_position[robot_id][0] = robot.px + new_robot_position[robot_id][1] = robot.py + self.robot_x_timelist[self.current_timestep + 1][robot_id] = robot.px + self.robot_y_timelist[self.current_timestep + 1][robot_id] = robot.py + robot.set(robot.px, robot.py, robot_theta, energy=new_energy) + else: + new_robot_position[robot_id][0] = new_robot_px + new_robot_position[robot_id][1] = new_robot_py + self.robot_x_timelist[self.current_timestep + 
1][robot_id] = new_robot_px + self.robot_y_timelist[self.current_timestep + 1][robot_id] = new_robot_py + robot.set(new_robot_px, new_robot_py, robot_theta, energy=new_energy) + + selected_data, selected_next_data = get_human_position_list(self.current_timestep + 1, self.human_df, self.config) + delta_human_aoi_list = np.zeros_like(self.current_human_aoi_list) # 0 means no update + for human_id, human in enumerate(self.humans): + next_px, next_py, next_theta = get_human_position_from_list(self.current_timestep + 1, human_id, + selected_data, selected_next_data, self.config) + should_reset = judge_aoi_update([next_px, next_py], new_robot_position, self.config) + if should_reset: + # if the human is in the range of the robot, then reset the aoi of the human + if human.aoi > 1: + delta_human_aoi_list[human_id] = human.aoi + else: + delta_human_aoi_list[human_id] = 1 + + human.set(next_px, next_py, next_theta, aoi=1) + num_updated_human += 1 + else: + # if the human is not in the range of the robot, then update the aoi of the human + delta_human_aoi_list[human_id] = 0 + new_aoi = human.aoi + 1 + human.set(next_px, next_py, next_theta, aoi=new_aoi) + + self.current_human_aoi_list[human_id] = human.aoi + self.sync_human_df(human_id, self.current_timestep + 1, human.aoi) + + self.mean_aoi_timelist[self.current_timestep + 1] = np.mean(self.current_human_aoi_list) + self.update_human_timelist[self.current_timestep] = num_updated_human + delta_sum_aoi = np.sum(delta_human_aoi_list) + self.data_collection += (delta_sum_aoi * 0.3) # Mb, 0.02M/s per person + + # TODO: need to be well-defined + reward = self.mean_aoi_timelist[self.current_timestep] - self.mean_aoi_timelist[self.current_timestep + 1] \ + - self.config.energy_factor * np.sum(current_enenrgy_consume) + + # if hasattr(self.agent.policy, 'action_values'): + # self.action_values.append(self.agent.policy.action_values) + self.robot_actions.append(action) + self.rewards.append(reward) + self.plot_states.append([[robot.get_obs() for robot in self.robots], + [human.get_obs() for human in self.humans]]) + + next_state = JointState([robot.get_obs() for robot in self.robots], + [human.get_obs() for human in self.humans]) + + self.current_timestep += 1 + # print('This game is on',self.current_timestep,' step\n') + if self.current_timestep >= self.num_timestep: + done = True + else: + done = False + info = { + "performance_info": { + "mean_aoi": self.mean_aoi_timelist[self.current_timestep], + "mean_energy_consumption": 1.0 - ( + np.mean(self.robot_energy_timelist[self.current_timestep]) / self.max_uav_energy), + "collected_data_amount": self.data_collection/(self.num_timestep*self.human_num*0.3), + "human_coverage": np.mean(self.update_human_timelist) / self.human_num + }, + } + + return next_state, reward, done, info + + def render(self, mode='traj', output_file=None, plot_loop=False, moving_line=False): + # ------------------------------------------------------------------- + if mode == 'html': + pass + elif mode == 'traj': + pass + else: + raise NotImplementedError \ No newline at end of file diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/__init__.py b/zoo/CrowdSim/envs/Crowdsim/env/model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py new file mode 100644 index 000000000..5201c5b61 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py @@ -0,0 +1,72 @@ +import abc +import logging +from 
zoo.CrowdSim.envs.Crowdsim.env.model.mdp import * + + +class Agent(): + def __init__(self): + """ + Base class for robot and human. Have the physical attributes of an agent. + + """ + self.policy = None + + def print_info(self): + logging.info('Agent is visible and has "holonomic" kinematic constraint') + + def set_policy(self, policy): + self.policy = policy + + def act(self, state, current_timestep): + if self.policy is None: + raise AttributeError('Policy attribute has to be set!') + action = self.policy.predict(state, current_timestep) + return action + + +class Human(): + def __init__(self, id, config): + self.id = id + self.config = config + self.px = None + self.py = None + self.theta = None + self.aoi = None + + def set(self, px, py, theta, aoi): + self.px = px # position + self.py = py + self.theta = theta + self.aoi = aoi + + # TODO: change state,可能需要归一化 + def get_obs(self): + return HumanState(self.px / self.config.nlon, + self.py / self.config.nlat, + self.theta / self.config.rotation_limit, + self.aoi / self.config.num_timestep) + + +class Robot(): + def __init__(self, id, config): + self.id = id + self.config = config + self.px = None # position + self.py = None + self.theta = None + self.energy = None + + def set(self, px, py, theta, energy): + self.px = px # position + self.py = py + self.theta = theta + self.energy = energy + + # TODO: change state,可能需要归一化 + def get_obs(self): + return RobotState(self.px / self.config.nlon, + self.py / self.config.nlat, + self.theta / self.config.rotation_limit, + self.energy / self.config.max_uav_energy) + + diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py b/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py new file mode 100644 index 000000000..72f1b9c4a --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py @@ -0,0 +1,89 @@ +from collections import namedtuple +from itertools import product +import torch +import numpy as np + + +# State +class HumanState(object): + def __init__(self, px, py, theta, aoi): + self.px = px + self.py = py + self.theta = theta + self.aoi = aoi + self.position = (self.px, self.py) + + def __add__(self, other): + return other + (self.px, self.py, self.theta, self.aoi) + + def __str__(self): + return ' '.join([str(x) for x in [self.px, self.py, self.theta, self.aoi]]) + + def to_tuple(self): + return self.px, self.py, self.theta, self.aoi + + +class RobotState(object): + def __init__(self, px, py, theta, energy): + self.px = px + self.py = py + self.theta = theta + self.energy = energy + + self.position = (self.px, self.py) + + def __add__(self, other): + return other + (self.px, self.py, self.theta, self.energy) + + def __str__(self): + return ' '.join([str(x) for x in [self.px, self.py, self.theta, self.energy]]) + + def to_tuple(self): + return self.px, self.py, self.theta, self.energy + + +class JointState(object): + def __init__(self, robot_states, human_states): + for robot_state in robot_states: + assert isinstance(robot_state, RobotState) + for human_state in human_states: + assert isinstance(human_state, HumanState) + + self.robot_states = robot_states + self.human_states = human_states + + def to_tensor(self, add_batch_size=False, device=None): + robot_states_tensor = torch.tensor([robot_state.to_tuple() for robot_state in self.robot_states], + dtype=torch.float32) + human_states_tensor = torch.tensor([human_state.to_tuple() for human_state in self.human_states], + dtype=torch.float32) + + if add_batch_size: # True + robot_states_tensor = robot_states_tensor.unsqueeze(0) + human_states_tensor 
= human_states_tensor.unsqueeze(0) + + if device is not None: + robot_states_tensor = robot_states_tensor.to(device) + human_states_tensor = human_states_tensor.to(device) + + return robot_states_tensor, human_states_tensor + + def to_array(self): + robot_states_array = np.array([robot_state.to_tuple() for robot_state in self.robot_states]) + human_states_array = np.array([human_state.to_tuple() for human_state in self.human_states]) + + return robot_states_array, human_states_array + + +def build_action_space(config): + robot_num = config.robot_num + + # dx, dy + one_uav_action_space = config.one_uav_action_space + action_space = list(product(one_uav_action_space, repeat=robot_num)) + + return np.array(action_space) + + +if __name__ == "__main__": + print(build_action_space()) diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py new file mode 100644 index 000000000..ce903d9f8 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py @@ -0,0 +1,248 @@ +import numpy as np + +np.seterr(invalid='ignore') + +from zoo.CrowdSim.envs.Crowdsim.env.model.agent import * +from zoo.CrowdSim.envs.Crowdsim.env.model.mdp import JointState +from shapely.geometry import * + + +def tensor_to_joint_state(state, config): # 恢复原先尺度 + robot_states, human_states = state + + robot_states = robot_states.cpu().squeeze(0).data.numpy() + robot_states = [RobotState(robot_state[0] * config.nlon, + robot_state[1] * config.nlat, + robot_state[2] * config.rotation_limit, + robot_state[3] * config.max_uav_energy) for robot_state in robot_states] + + human_states = human_states.cpu().squeeze(0).data.numpy() + human_states = [HumanState(human_state[0] * config.nlon, + human_state[1] * config.nlat, + human_state[2] * config.rotation_limit, + human_state[3] * config.num_timestep) for human_state in human_states] + + return JointState(robot_states, human_states) + + +def tensor_to_robot_states(robot_state_tensor, config): + robot_states = robot_state_tensor.cpu().squeeze(0).data.numpy() + robot_states = [RobotState(robot_state[0] * config.nlon, + robot_state[1] * config.nlat, + robot_state[2] * config.rotation_limit, + robot_state[3] * config.max_uav_energy) for robot_state in robot_states] + return robot_states + + +def get_human_position_list(selected_timestep, human_df, config): + # config.step_time means the time interval between two timesteps + selected_timestamp = config.start_timestamp + selected_timestep * config.step_time + selected_data = human_df[human_df.timestamp == selected_timestamp] + selected_data = selected_data.set_index("id") + + if selected_timestep < config.num_timestep: + selected_next_data = human_df[human_df.timestamp == (selected_timestamp + config.step_time)] + selected_next_data = selected_next_data.set_index("id") + else: + selected_next_data = None + + return selected_data, selected_next_data + + +def get_human_position_from_list(selected_timestep, human_id, selected_data, selected_next_data, config): + px, py = selected_data.loc[human_id, ["x", "y"]] + + if selected_timestep < config.num_timestep: + npx, npy = selected_next_data.loc[human_id, ["x", "y"]] + theta = get_theta(0, 0, npx - px, npy - py) + # print(px, py, npx, npy, theta) + else: + theta = 0 + + return px, py, theta + + +def judge_aoi_update(human_position, robot_position, config): + """ + Overview: + Judge whether the AoI should be updated + Args: + - human_position (:obj:`list`): The position of the human. + - robot_position (:obj:`list`): The position of the robot. 
+ - config (:obj:`dict`): The configuration of the environment. + """ + should_reset = False + for robot_id in range(config.robot_num): + unit_distance = np.sqrt(np.power(robot_position[robot_id][0] - human_position[0], 2) + + np.power(robot_position[robot_id][1] - human_position[1], 2)) + if unit_distance <= config.sensing_range: + should_reset = True + break + + return should_reset + + +def inPoly(polygon, x, y): + pt = (x, y) + line = LineString(polygon) + point = Point(pt) + polygon = Polygon(line) + return polygon.contains(point) + + +def iscrosses(line1, line2): + if LineString(line1).crosses(LineString(line2)): + return True + return False + + +def crossPoly(square, x1, y1, x2, y2): + our_line = LineString([[x1, y1], [x2, y2]]) + line1 = LineString([square[0], square[2]]) + line2 = LineString([square[1], square[3]]) + if our_line.crosses(line1) or our_line.crosses(line2): + return True + else: + return False + + +def judge_collision(new_robot_px, new_robot_py, old_robot_px, old_robot_py, config): + if config.no_fly_zone is None: + return False + + for square in config.no_fly_zone: + if inPoly(square, new_robot_px, new_robot_py): + return True + if crossPoly(square, new_robot_px, new_robot_py, old_robot_px, old_robot_py): + return True + return False + + +def get_theta(x1, y1, x2, y2): + ang1 = np.arctan2(y1, x1) + ang2 = np.arctan2(y2, x2) + theta = np.rad2deg((ang1 - ang2) % (2 * np.pi)) + return theta + + +def consume_uav_energy(fly_time, hover_time, config): + # configs + Pu = 0.5 # the average transmitted power of each user, W, e.g. mobile phone + P0 = 79.8563 # blade profile power, W + P1 = 88.6279 # derived power, W + U_tips = 120 # tip speed of the rotor blade of the UAV,m/s + v0 = 4.03 # the mean rotor induced velocity in the hovering state,m/s + d0 = 0.6 # fuselage drag ratio + rho = 1.225 # density of air,kg/m^3 + s0 = 0.05 # the rotor solidity + A = 0.503 # the area of the rotor disk, m^2 + Vt = config.velocity # velocity of the UAV,m/s + + Power_flying = P0 * (1 + 3 * Vt ** 2 / U_tips ** 2) + \ + P1 * np.sqrt((np.sqrt(1 + Vt ** 4 / (4 * v0 ** 4)) - Vt ** 2 / (2 * v0 ** 2))) + \ + 0.5 * d0 * rho * s0 * A * Vt ** 3 + + Power_hovering = P0 + P1 + + return fly_time * Power_flying + hover_time * Power_hovering + + +def get_border(ur, lf): + upper_left = [lf[0], ur[1]] + upper_right = [ur[0], ur[1]] + lower_right = [ur[0], lf[1]] + lower_left = [lf[0], lf[1]] + + coordinates = [ + upper_left, + upper_right, + lower_right, + lower_left, + upper_left + ] + + geo_json = {"type": "FeatureCollection", + "properties": { + "lower_left": lower_left, + "upper_right": upper_right + }, + "features": []} + + grid_feature = { + "type": "Feature", + "geometry": { + "type": "Polygon", + "coordinates": [coordinates], + } + } + + geo_json["features"].append(grid_feature) + + return geo_json + + +def traj_to_timestamped_geojson(index, trajectory, robot_num, color): + point_gdf = trajectory.df.copy() + point_gdf["previous_geometry"] = point_gdf["geometry"].shift() + point_gdf["time"] = point_gdf.index + point_gdf["previous_time"] = point_gdf["time"].shift() + + features = [] + + # for Point in GeoJSON type + for _, row in point_gdf.iterrows(): + corrent_point_coordinates = [ + row["geometry"].xy[0][0], + row["geometry"].xy[1][0] + ] + current_time = [row["time"].isoformat()] + + if index < robot_num: + radius = 8 # 125(5 units) + opacity = 0.05 + popup_html = f'

UAV {int(row["id"])}' + f'raw coord: {corrent_point_coordinates}' \
+                + f'grid coord: ({row["x"]},{row["y"]})' \
+                + f'dist coord: ({row["x_distance"]}m, {row["y_distance"]}m)' \
+                + f'energy: {row["energy"]}J'
+        else:
+            radius = 2
+            opacity = 1
+            popup_html = f'Human {int(row["id"])}' + f'raw coord: {corrent_point_coordinates}' \
+                + f'grid coord: ({row["x"]},{row["y"]})' \
+                + f'dist coord: ({row["x_distance"]}m, {row["y_distance"]}m)' \
+                + f'aoi: {int(row["aoi"])}
' + + # for Point in GeoJSON type (Temporally Deprecated) + features.append( + { + "type": "Feature", + "geometry": { + "type": "Point", + "coordinates": corrent_point_coordinates, + }, + "properties": { + "times": current_time, + 'popup': popup_html, + "icon": 'circle', # point + "iconstyle": { + 'fillColor': color, + 'fillOpacity': opacity, # 透明度 + 'stroke': 'true', + 'radius': radius, + 'weight': 1, + }, + + "style": { # line + "color": color, + }, + "code": 11, + + }, + } + ) + return features + + +if __name__ == "__main__": + print(judge_collision(new_robot_px=6505, new_robot_py=5130, + old_robot_px=6925, old_robot_py=5130)) diff --git a/zoo/CrowdSim/envs/test_CrowdSim_env.py b/zoo/CrowdSim/envs/test_CrowdSim_env.py index 4f95511dc..25a8bc411 100644 --- a/zoo/CrowdSim/envs/test_CrowdSim_env.py +++ b/zoo/CrowdSim/envs/test_CrowdSim_env.py @@ -4,6 +4,7 @@ mcfg=EasyDict( env_name='CrowdSim-v0', + dataset = 'purdue', robot_num = 2, human_num = 59, # purdue one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]]) From 14542a116ac5e4ca409cff6543348feb0d729469 Mon Sep 17 00:00:00 2001 From: nighood Date: Mon, 8 Apr 2024 17:58:13 +0800 Subject: [PATCH 04/16] feature(rjy): add RGCN for represent net --- lzero/model/common.py | 214 ++++++++++++++++++ .../config/crowdsim_muzero_rgcn_config.py | 106 +++++++++ zoo/CrowdSim/envs/crowdsim_lightzero_env.py | 121 ++++++++++ .../envs/test_crowdsim_lightzero_env.py | 35 +++ 4 files changed, 476 insertions(+) create mode 100644 zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py create mode 100644 zoo/CrowdSim/envs/crowdsim_lightzero_env.py create mode 100644 zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py diff --git a/lzero/model/common.py b/lzero/model/common.py index 363f7f779..0b97d2b7a 100644 --- a/lzero/model/common.py +++ b/lzero/model/common.py @@ -8,6 +8,8 @@ import math from typing import Optional, Tuple from dataclasses import dataclass +import logging +import itertools import numpy as np import torch import torch.nn as nn @@ -271,6 +273,218 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: """ return self.fc_representation(x) +class RGCNLayer(nn.Module): + """ + Overview: + Relational graph convolutional network layer. 
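+        The layer first embeds the robot states and the human states with two separate MLPs
+        (``w_r`` and ``w_h``), concatenates the embeddings into a node feature matrix ``X``,
+        builds a pairwise similarity (adjacency) matrix ``A`` according to ``similarity_function``,
+        and then propagates features for ``num_layer`` rounds as ``H <- ReLU(A @ H @ W_i)``,
+        optionally with skip connections between rounds.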
+ """ + def __init__( + self, + robot_state_dim, + human_state_dim, + similarity_function, + num_layer = 2, + X_dim = 32, + layerwise_graph = False, + skip_connection = True, + wr_dims = [64, 32], # the last dim should equal to X_dim + wh_dims = [64, 32], # the last dim should equal to X_dim + final_state_dim = 32, # should equal to X_dim + norm_type= 'BN', + last_linear_layer_init_zero=True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + ): + super().__init__() + + # design choice + # 'gaussian', 'embedded_gaussian', 'cosine', 'cosine_softmax', 'concatenation' + self.similarity_function = similarity_function + self.robot_state_dim = robot_state_dim + self.human_state_dim = human_state_dim + self.num_layer = num_layer + self.X_dim = X_dim + self.layerwise_graph = layerwise_graph + self.skip_connection = skip_connection + + logging.info('Similarity_func: {}'.format(self.similarity_function)) + logging.info('Layerwise_graph: {}'.format(self.layerwise_graph)) + logging.info('Skip_connection: {}'.format(self.skip_connection)) + logging.info('Number of layers: {}'.format(self.num_layer)) + + self.w_r = MLP( + in_channels=robot_state_dim, + hidden_channels=wr_dims[0], + out_channels=wr_dims[1], + layer_num=num_layer, + activation=activation, + norm_type=norm_type, + last_linear_layer_init_zero=last_linear_layer_init_zero, + ) # inputs,64,32 + self.w_h = MLP( + in_channels=human_state_dim, + hidden_channels=wh_dims[0], + out_channels=wh_dims[1], + layer_num=num_layer, + activation=activation, + norm_type=norm_type, + last_linear_layer_init_zero=last_linear_layer_init_zero, + ) # inputs,64,32 + + if self.similarity_function == 'embedded_gaussian': + self.w_a = nn.Parameter(torch.randn(self.X_dim, self.X_dim)) + elif self.similarity_function == 'concatenation': + # TODO: fix the dim size + self.w_a = MLP( + in_channels=2 * X_dim, + hidden_channels=2 * X_dim, + out_channels=1, + layer_num=1, + ) + + embedding_dim = self.X_dim + self.Ws = torch.nn.ParameterList() + for i in range(self.num_layer): + if i == 0: + self.Ws.append(nn.Parameter(torch.randn(self.X_dim, embedding_dim))) + elif i == self.num_layer - 1: + self.Ws.append(nn.Parameter(torch.randn(embedding_dim, final_state_dim))) + else: + self.Ws.append(nn.Parameter(torch.randn(embedding_dim, embedding_dim))) + + # TODO: for visualization + self.A = None + + def compute_similarity_matrix(self, X): + if self.similarity_function == 'embedded_gaussian': + A = torch.matmul(torch.matmul(X, self.w_a), X.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(A, dim=2) + elif self.similarity_function == 'gaussian': + A = torch.matmul(X, X.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(A, dim=2) + elif self.similarity_function == 'cosine': + A = torch.matmul(X, X.permute(0, 2, 1)) + magnitudes = torch.norm(A, dim=2, keepdim=True) + norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) + normalized_A = torch.div(A, norm_matrix) + elif self.similarity_function == 'cosine_softmax': + A = torch.matmul(X, X.permute(0, 2, 1)) + magnitudes = torch.norm(A, dim=2, keepdim=True) + norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(torch.div(A, norm_matrix), dim=2) + elif self.similarity_function == 'concatenation': + indices = [pair for pair in itertools.product(list(range(X.size(1))), repeat=2)] + selected_features = torch.index_select(X, dim=1, index=torch.LongTensor(indices).reshape(-1)) + pairwise_features = selected_features.reshape((-1, X.size(1) * X.size(1), 
X.size(2) * 2)) + A = self.w_a(pairwise_features).reshape(-1, X.size(1), X.size(1)) + normalized_A = A + elif self.similarity_function == 'squared': + A = torch.matmul(X, X.permute(0, 2, 1)) + squared_A = A * A + normalized_A = squared_A / torch.sum(squared_A, dim=2, keepdim=True) + elif self.similarity_function == 'equal_attention': + normalized_A = (torch.ones(X.size(1), X.size(1)) / X.size(1)).expand(X.size(0), X.size(1), X.size(1)) + elif self.similarity_function == 'diagonal': + normalized_A = (torch.eye(X.size(1), X.size(1))).expand(X.size(0), X.size(1), X.size(1)) + else: + raise NotImplementedError + + return normalized_A + + def forward(self, state): + robot_states = state['robot_state'] + human_states = state['human_state'] + + # compute feature matrix X + robot_state_embedings = self.w_r(robot_states) # batch x num x embedding_dim + human_state_embedings = self.w_h(human_states) + X = torch.cat([robot_state_embedings, human_state_embedings], dim=1) + + # compute matrix A + if not self.layerwise_graph: + normalized_A = self.compute_similarity_matrix(X) + self.A = normalized_A[0, :, :].data.cpu().numpy() # total_num x total_num + + # next_H = H = X + + H = X.contiguous().clone() + next_H = H.contiguous().clone() # batch x total_num x embedding_dim + for i in range(self.num_layer): # 2 + if self.layerwise_graph: # False + A = self.compute_similarity_matrix(H) + next_H = nn.functional.relu(torch.matmul(torch.matmul(A, H), self.Ws[i])) + else: # (A x H) x W_i + next_H = nn.functional.relu(torch.matmul(torch.matmul(normalized_A, H), self.Ws[i])) + + if self.skip_connection: + # next_H += H + next_H = next_H + H + H = next_H.contiguous().clone() + + return next_H + +class RepresentationNetworkGCN(nn.Module): + + def __init__( + self, + robot_observation_shape: tuple, + human_observation_shape: tuple, + hidden_channels: int = 64, + layer_num: int = 2, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + norm_type: Optional[str] = 'BN', + ) -> torch.Tensor: + """ + Overview: + + Arguments: + - robot_observation_shape (:obj:`tuple`): The shape of robot observation space, e.g. (2, 4). + - human_observation_shape (:obj:`tuple`): The shape of human observation space, e.g. (59, 4). + - hidden_channels (:obj:`int`): The channel of output hidden state. + - activation (:obj:`nn.Module`): The activation function used in network, defaults to nn.ReLU(). \ + Use the inplace operation to speed up. + - last_linear_layer_init_zero (:obj:`bool`): Whether to initialize the last linear layer with zeros, \ + which can provide stable zero outputs in the beginning, defaults to True. + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + """ + super().__init__() + # self.fc_representation = MLP( + # in_channels=observation_shape, + # hidden_channels=hidden_channels, + # out_channels=hidden_channels, + # layer_num=layer_num, + # activation=activation, + # norm_type=norm_type, + # # don't use activation and norm in the last layer of representation network is important for convergence. + # output_activation=False, + # output_norm=False, + # # last_linear_layer_init_zero=True is beneficial for convergence speed. 
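+ # (the commented-out MLP here is the original vector-obs representation; it is replaced by the RGCN-based rgl_representation below)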
+ # last_linear_layer_init_zero=True, + # ) + self.rgl_representation = RGCNLayer( + robot_state_dim=robot_observation_shape[-1], + human_state_dim=human_observation_shape[-1], + similarity_function='embedded_gaussian', + num_layer=layer_num, + X_dim=hidden_channels, + layerwise_graph=False, + skip_connection=True, + wr_dims=[64, hidden_channels], + wh_dims=[64, hidden_channels], + final_state_dim=hidden_channels, + # for mlp + norm_type=norm_type, + last_linear_layer_init_zero=last_linear_layer_init_zero, + activation=activation, + ) + + def forward(self, x: dict) -> torch.Tensor: + """ + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size, N is the length of vector observation. + - output (:obj:`torch.Tensor`): :math:`(B, hidden_channels)`, where B is batch size. + """ + return self.rgl_representation(x) class PredictionNetwork(nn.Module): diff --git a/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py new file mode 100644 index 000000000..e611aa6b1 --- /dev/null +++ b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py @@ -0,0 +1,106 @@ +from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '2' +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(3e5) +reanalyze_ratio = 0. +robot_num = 2 +human_num = 10 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_muzero_config = dict( + exp_name= + f'result/crowd_num_human/CrowdSim_muzero_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', + env=dict( + env_name='CrowdSim-v0', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + robot_observation_shape=(robot_num, 4), + human_observation_shape=(human_num, 4), + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='rgcn', + lstm_hidden_size=256, + latent_state_dim=256, + self_supervised_learning_loss=True, # NOTE: default is False. + discrete_action_encoding_type='one_hot', + res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + ssl_loss_weight=2, # NOTE: default is 0. + grad_clip_value=0.5, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. 
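+ # NOTE: `action_space_size` in the model config above is the joint UAV action space,
+ # len(one_uav_action_space) ** robot_num = 5 ** 2 = 25 discrete joint actions,
+ # which matches the itertools.product enumeration used in the CrowdSim env wrapper.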
+ collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_muzero_config = EasyDict(CrowdSim_muzero_config) +main_config = CrowdSim_muzero_config + +CrowdSim_muzero_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.CrowdSim_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='muzero', + import_names=['lzero.policy.muzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_muzero_create_config = EasyDict(CrowdSim_muzero_create_config) +create_config = CrowdSim_muzero_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. + """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py new file mode 100644 index 000000000..0bdedd202 --- /dev/null +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -0,0 +1,121 @@ +from typing import Union, Optional + +import gym +import numpy as np +from itertools import product +import logging + +from ding.envs import BaseEnv, BaseEnvTimestep +from ding.envs import ObsPlusPrevActRewWrapper +from ding.torch_utils import to_ndarray +from ding.utils import ENV_REGISTRY + +import zoo.CrowdSim.envs.Crowdsim.env + + +@ENV_REGISTRY.register('crowdsim_lightzero') +class CrowdSimEnv(BaseEnv): + + def __init__(self, cfg: dict = {}) -> None: + self._cfg = cfg + self._init_flag = False + self._replay_path = None + self._robot_num = self._cfg.robot_num + self._human_num = self._cfg.human_num + self._observation_space = gym.spaces.Dict({ + 'robot_state': gym.spaces.Box( + low=float("-inf"), + high=float("inf"), + shape=(self._robot_num, 4), + dtype=np.float32 + ), + 'human_state': gym.spaces.Box( + low=float("-inf"), + high=float("inf"), + shape=(self._human_num, 4), + dtype=np.float32 + ) + }) + # action space + # one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] + self.real_action_space = list(product(self._cfg.one_uav_action_space, repeat=self._robot_num)) + one_uav_action_n = len(self._cfg.one_uav_action_space) + self._action_space = gym.spaces.Discrete(one_uav_action_n**self._robot_num) + self._action_space.seed(0) # default seed + self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, ), dtype=np.float32) + self._continuous = False + + def reset(self) -> np.ndarray: + if not self._init_flag: + self._env = gym.make('CrowdSim-v0', dataset = self._cfg.dataset, custom_config = self._cfg) + self._init_flag = True + if hasattr(self, '_seed') and hasattr(self, '_dynamic_seed') and self._dynamic_seed: + np_seed = 100 * np.random.randint(1, 1000) + self._env.seed(self._seed + np_seed) + self._action_space.seed(self._seed + np_seed) + elif hasattr(self, '_seed'): + self._env.seed(self._seed) + self._action_space.seed(self._seed) + self._eval_episode_return = 0 + # process obs + 
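+ # raw_obs.to_tensor() is unpacked into (robot_state, human_state); with this config their shapes are
+ # (robot_num, 4) and (human_num, 4). Every joint action is always legal in the flat Discrete space,
+ # so the action_mask is all ones, and to_play=-1 marks the single-player setting.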
raw_obs = self._env.reset() + obs_list = list(raw_obs.to_tensor()) + obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + action_mask = np.ones(self.action_space.n, 'int8') + obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} + + return obs + + def close(self) -> None: + if self._init_flag: + self._env.close() + self._init_flag = False + + def seed(self, seed: int, dynamic_seed: bool = True) -> None: + self._seed = seed + self._dynamic_seed = dynamic_seed + np.random.seed(self._seed) + + def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: + if isinstance(action, np.ndarray) and action.shape == (1, ): + action = action.squeeze() # 0-dim array + real_action = self.real_action_space[action] + assert isinstance(real_action, tuple) and len(real_action) == self._robot_num, "illegal action!" + raw_obs, rew, done, info = self._env.step(real_action) + obs_list = list(raw_obs.to_array()) + obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + + self._eval_episode_return += rew + if done: + info['eval_episode_return'] = self._eval_episode_return + # logging.INFO('one game finish!') + + action_mask = np.ones(self.action_space.n, 'int8') + obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} + rew = to_ndarray([rew]).astype(np.float32) + return BaseEnvTimestep(obs, rew, done, info) + + def enable_save_replay(self, replay_path: Optional[str] = None) -> None: + if replay_path is None: + replay_path = './video' + self._replay_path = replay_path + + def random_action(self) -> np.ndarray: + random_action = self.action_space.sample() + random_action = to_ndarray([random_action], dtype=np.int64) + return random_action + + @property + def observation_space(self) -> gym.spaces.Space: + return self._observation_space + + @property + def action_space(self) -> gym.spaces.Space: + return self._action_space + + @property + def reward_space(self) -> gym.spaces.Space: + return self._reward_space + + def __repr__(self) -> str: + return "LightZero CrowdSim Env" diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py new file mode 100644 index 000000000..47f483717 --- /dev/null +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -0,0 +1,35 @@ +import numpy as np +import pytest +from easydict import EasyDict +from zoo.CrowdSim.envs.crowdsim_lightzero_env import CrowdSimEnv + +mcfg=EasyDict( + env_name='CrowdSim-v0', + dataset = 'purdue', + robot_num = 2, + human_num = 59, # purdue + one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] + ) + +@ pytest.mark.envtest + +class TestCrowdSimEnv: + def test_naive(self): + env = CrowdSimEnv(mcfg) + env.seed(314) + assert env._seed == 314 + obs = env.reset() + assert isinstance(obs['observation'], dict) + assert obs['observation']['robot_state'].shape == (2, 4) + assert obs['observation']['human_state'].shape == (59, 4) + for i in range(10): + random_action = env.random_action() + timestep = env.step(random_action) + print(timestep) + assert isinstance(timestep.obs['observation'], dict) + assert timestep.obs['observation']['robot_state'].shape == (2, 4) + assert timestep.obs['observation']['human_state'].shape == (59, 4) + assert isinstance(timestep.done, bool) + assert timestep.reward.shape == (1, ) + print(env.observation_space, env.action_space, env.reward_space) + env.close() From dc4a7747b59d4e0c2f3c266dc126e8d2349aa732 Mon Sep 17 00:00:00 2001 From: nighood Date: Wed, 1 May 2024 18:00:31 +0800 Subject: [PATCH 05/16] 
feature(rjy): add obs/action env mode. fix rgcn pipeline. --- lzero/agent/muzero.py | 3 + lzero/mcts/utils.py | 14 +- lzero/model/common.py | 212 ------- lzero/model/common_gcn.py | 261 +++++++++ lzero/model/muzero_model_gcn.py | 457 +++++++++++++++ .../model/sampled_efficientzero_model_gcn.py | 534 ++++++++++++++++++ lzero/model/tests/test_common_gcn.py | 102 ++++ lzero/model/tests/test_rgcn.py | 50 ++ lzero/policy/muzero.py | 10 +- lzero/worker/muzero_collector.py | 2 +- .../config/crowdsim_muzero_rgcn_config.py | 20 +- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 6 +- zoo/CrowdSim/envs/crowdsim_lightzero_env.py | 36 +- .../envs/test_crowdsim_lightzero_env.py | 40 +- 14 files changed, 1516 insertions(+), 231 deletions(-) create mode 100644 lzero/model/common_gcn.py create mode 100644 lzero/model/muzero_model_gcn.py create mode 100644 lzero/model/sampled_efficientzero_model_gcn.py create mode 100644 lzero/model/tests/test_common_gcn.py create mode 100644 lzero/model/tests/test_rgcn.py diff --git a/lzero/agent/muzero.py b/lzero/agent/muzero.py index 55dda5d00..b087377b5 100644 --- a/lzero/agent/muzero.py +++ b/lzero/agent/muzero.py @@ -110,6 +110,9 @@ def __init__( elif self.cfg.policy.model.model_type == 'conv': from lzero.model.muzero_model import MuZeroModel model = MuZeroModel(**self.cfg.policy.model) + elif self.cfg.policy.model.model_type == 'rgcn': + from lzero.model.muzero_model_gcn import MuZeroModelGCN + model = MuZeroModelGCN(**self.cfg.policy.model) else: raise NotImplementedError if self.cfg.policy.cuda and torch.cuda.is_available(): diff --git a/lzero/mcts/utils.py b/lzero/mcts/utils.py index c40052e62..1861cb2a9 100644 --- a/lzero/mcts/utils.py +++ b/lzero/mcts/utils.py @@ -97,7 +97,7 @@ def prepare_observation(observation_list, model_type='conv'): Returns: - np.ndarray: Reshaped array of observations. """ - assert model_type in ['conv', 'mlp'], "model_type must be either 'conv' or 'mlp'" + assert model_type in ['conv', 'mlp', 'rgcn'], "model_type must be either 'conv' or 'mlp'" observation_array = np.array(observation_list) batch_size = observation_array.shape[0] @@ -116,6 +116,18 @@ def prepare_observation(observation_list, model_type='conv'): observation_array = observation_array.reshape(batch_size, -1) else: raise ValueError("For 'mlp' model_type, the observation must have 3 dimensions [B, S, O]") + + elif model_type == 'rgcn': + if observation_array.ndim == 4: + # TODO(rjy): strage process + # observation_array should be reshaped to [B, S*M, O], where M is the agent number + # now observation_array.shape = [B, S, M, O] + observation_array = observation_array.reshape(batch_size, -1, observation_array.shape[-1]) + elif observation_array.ndim == 3: + # Flatten the last two dimensions + observation_array = observation_array.reshape(batch_size, -1) + else: + raise ValueError("For 'rgcn' model_type, the observation must have 3 dimensions [B, S, O] or 4 dimensions [B, S, M, O]") return observation_array diff --git a/lzero/model/common.py b/lzero/model/common.py index 0b97d2b7a..ddf4a5d59 100644 --- a/lzero/model/common.py +++ b/lzero/model/common.py @@ -273,218 +273,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: """ return self.fc_representation(x) -class RGCNLayer(nn.Module): - """ - Overview: - Relational graph convolutional network layer. 
- """ - def __init__( - self, - robot_state_dim, - human_state_dim, - similarity_function, - num_layer = 2, - X_dim = 32, - layerwise_graph = False, - skip_connection = True, - wr_dims = [64, 32], # the last dim should equal to X_dim - wh_dims = [64, 32], # the last dim should equal to X_dim - final_state_dim = 32, # should equal to X_dim - norm_type= 'BN', - last_linear_layer_init_zero=True, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - ): - super().__init__() - - # design choice - # 'gaussian', 'embedded_gaussian', 'cosine', 'cosine_softmax', 'concatenation' - self.similarity_function = similarity_function - self.robot_state_dim = robot_state_dim - self.human_state_dim = human_state_dim - self.num_layer = num_layer - self.X_dim = X_dim - self.layerwise_graph = layerwise_graph - self.skip_connection = skip_connection - - logging.info('Similarity_func: {}'.format(self.similarity_function)) - logging.info('Layerwise_graph: {}'.format(self.layerwise_graph)) - logging.info('Skip_connection: {}'.format(self.skip_connection)) - logging.info('Number of layers: {}'.format(self.num_layer)) - - self.w_r = MLP( - in_channels=robot_state_dim, - hidden_channels=wr_dims[0], - out_channels=wr_dims[1], - layer_num=num_layer, - activation=activation, - norm_type=norm_type, - last_linear_layer_init_zero=last_linear_layer_init_zero, - ) # inputs,64,32 - self.w_h = MLP( - in_channels=human_state_dim, - hidden_channels=wh_dims[0], - out_channels=wh_dims[1], - layer_num=num_layer, - activation=activation, - norm_type=norm_type, - last_linear_layer_init_zero=last_linear_layer_init_zero, - ) # inputs,64,32 - - if self.similarity_function == 'embedded_gaussian': - self.w_a = nn.Parameter(torch.randn(self.X_dim, self.X_dim)) - elif self.similarity_function == 'concatenation': - # TODO: fix the dim size - self.w_a = MLP( - in_channels=2 * X_dim, - hidden_channels=2 * X_dim, - out_channels=1, - layer_num=1, - ) - - embedding_dim = self.X_dim - self.Ws = torch.nn.ParameterList() - for i in range(self.num_layer): - if i == 0: - self.Ws.append(nn.Parameter(torch.randn(self.X_dim, embedding_dim))) - elif i == self.num_layer - 1: - self.Ws.append(nn.Parameter(torch.randn(embedding_dim, final_state_dim))) - else: - self.Ws.append(nn.Parameter(torch.randn(embedding_dim, embedding_dim))) - - # TODO: for visualization - self.A = None - - def compute_similarity_matrix(self, X): - if self.similarity_function == 'embedded_gaussian': - A = torch.matmul(torch.matmul(X, self.w_a), X.permute(0, 2, 1)) - normalized_A = nn.functional.softmax(A, dim=2) - elif self.similarity_function == 'gaussian': - A = torch.matmul(X, X.permute(0, 2, 1)) - normalized_A = nn.functional.softmax(A, dim=2) - elif self.similarity_function == 'cosine': - A = torch.matmul(X, X.permute(0, 2, 1)) - magnitudes = torch.norm(A, dim=2, keepdim=True) - norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) - normalized_A = torch.div(A, norm_matrix) - elif self.similarity_function == 'cosine_softmax': - A = torch.matmul(X, X.permute(0, 2, 1)) - magnitudes = torch.norm(A, dim=2, keepdim=True) - norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) - normalized_A = nn.functional.softmax(torch.div(A, norm_matrix), dim=2) - elif self.similarity_function == 'concatenation': - indices = [pair for pair in itertools.product(list(range(X.size(1))), repeat=2)] - selected_features = torch.index_select(X, dim=1, index=torch.LongTensor(indices).reshape(-1)) - pairwise_features = selected_features.reshape((-1, X.size(1) * X.size(1), 
X.size(2) * 2)) - A = self.w_a(pairwise_features).reshape(-1, X.size(1), X.size(1)) - normalized_A = A - elif self.similarity_function == 'squared': - A = torch.matmul(X, X.permute(0, 2, 1)) - squared_A = A * A - normalized_A = squared_A / torch.sum(squared_A, dim=2, keepdim=True) - elif self.similarity_function == 'equal_attention': - normalized_A = (torch.ones(X.size(1), X.size(1)) / X.size(1)).expand(X.size(0), X.size(1), X.size(1)) - elif self.similarity_function == 'diagonal': - normalized_A = (torch.eye(X.size(1), X.size(1))).expand(X.size(0), X.size(1), X.size(1)) - else: - raise NotImplementedError - - return normalized_A - - def forward(self, state): - robot_states = state['robot_state'] - human_states = state['human_state'] - - # compute feature matrix X - robot_state_embedings = self.w_r(robot_states) # batch x num x embedding_dim - human_state_embedings = self.w_h(human_states) - X = torch.cat([robot_state_embedings, human_state_embedings], dim=1) - - # compute matrix A - if not self.layerwise_graph: - normalized_A = self.compute_similarity_matrix(X) - self.A = normalized_A[0, :, :].data.cpu().numpy() # total_num x total_num - - # next_H = H = X - - H = X.contiguous().clone() - next_H = H.contiguous().clone() # batch x total_num x embedding_dim - for i in range(self.num_layer): # 2 - if self.layerwise_graph: # False - A = self.compute_similarity_matrix(H) - next_H = nn.functional.relu(torch.matmul(torch.matmul(A, H), self.Ws[i])) - else: # (A x H) x W_i - next_H = nn.functional.relu(torch.matmul(torch.matmul(normalized_A, H), self.Ws[i])) - - if self.skip_connection: - # next_H += H - next_H = next_H + H - H = next_H.contiguous().clone() - - return next_H - -class RepresentationNetworkGCN(nn.Module): - - def __init__( - self, - robot_observation_shape: tuple, - human_observation_shape: tuple, - hidden_channels: int = 64, - layer_num: int = 2, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - last_linear_layer_init_zero: bool = True, - norm_type: Optional[str] = 'BN', - ) -> torch.Tensor: - """ - Overview: - - Arguments: - - robot_observation_shape (:obj:`tuple`): The shape of robot observation space, e.g. (2, 4). - - human_observation_shape (:obj:`tuple`): The shape of human observation space, e.g. (59, 4). - - hidden_channels (:obj:`int`): The channel of output hidden state. - - activation (:obj:`nn.Module`): The activation function used in network, defaults to nn.ReLU(). \ - Use the inplace operation to speed up. - - last_linear_layer_init_zero (:obj:`bool`): Whether to initialize the last linear layer with zeros, \ - which can provide stable zero outputs in the beginning, defaults to True. - - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. - """ - super().__init__() - # self.fc_representation = MLP( - # in_channels=observation_shape, - # hidden_channels=hidden_channels, - # out_channels=hidden_channels, - # layer_num=layer_num, - # activation=activation, - # norm_type=norm_type, - # # don't use activation and norm in the last layer of representation network is important for convergence. - # output_activation=False, - # output_norm=False, - # # last_linear_layer_init_zero=True is beneficial for convergence speed. 
- # last_linear_layer_init_zero=True, - # ) - self.rgl_representation = RGCNLayer( - robot_state_dim=robot_observation_shape[-1], - human_state_dim=human_observation_shape[-1], - similarity_function='embedded_gaussian', - num_layer=layer_num, - X_dim=hidden_channels, - layerwise_graph=False, - skip_connection=True, - wr_dims=[64, hidden_channels], - wh_dims=[64, hidden_channels], - final_state_dim=hidden_channels, - # for mlp - norm_type=norm_type, - last_linear_layer_init_zero=last_linear_layer_init_zero, - activation=activation, - ) - - def forward(self, x: dict) -> torch.Tensor: - """ - Shapes: - - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size, N is the length of vector observation. - - output (:obj:`torch.Tensor`): :math:`(B, hidden_channels)`, where B is batch size. - """ - return self.rgl_representation(x) class PredictionNetwork(nn.Module): diff --git a/lzero/model/common_gcn.py b/lzero/model/common_gcn.py new file mode 100644 index 000000000..857890c2f --- /dev/null +++ b/lzero/model/common_gcn.py @@ -0,0 +1,261 @@ +from typing import Optional, Tuple, Dict +import logging +import itertools + +import torch +import torch.nn as nn +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType + +from .utils import renormalize, get_params_mean, get_dynamic_mean, get_reward_mean + +class RGCNLayer(nn.Module): + """ + Overview: + Relational graph convolutional network layer. + """ + def __init__( + self, + robot_num: int, + human_num: int, + robot_state_dim, + human_state_dim, + similarity_function, + num_layer = 2, + X_dim = 32, + layerwise_graph = False, + skip_connection = True, + wr_dims = [64, 32], # the last dim should equal to X_dim + wh_dims = [64, 32], # the last dim should equal to X_dim + final_state_dim = 32, # should equal to X_dim + norm_type= None, + last_linear_layer_init_zero=True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + ): + super().__init__() + + # design choice + # 'gaussian', 'embedded_gaussian', 'cosine', 'cosine_softmax', 'concatenation' + self.similarity_function = similarity_function + self.robot_num = robot_num + self.human_num = human_num + self.robot_state_dim = robot_state_dim + self.human_state_dim = human_state_dim + self.num_layer = num_layer + self.X_dim = X_dim + self.layerwise_graph = layerwise_graph + self.skip_connection = skip_connection + + logging.info('Similarity_func: {}'.format(self.similarity_function)) + logging.info('Layerwise_graph: {}'.format(self.layerwise_graph)) + logging.info('Skip_connection: {}'.format(self.skip_connection)) + logging.info('Number of layers: {}'.format(self.num_layer)) + + self.w_r = MLP( + in_channels=robot_state_dim, + hidden_channels=wr_dims[0], + out_channels=wr_dims[1], + layer_num=num_layer, + activation=activation, + norm_type=norm_type, + last_linear_layer_init_zero=last_linear_layer_init_zero, + ) # inputs,64,32 + self.w_h = MLP( + in_channels=human_state_dim, + hidden_channels=wh_dims[0], + out_channels=wh_dims[1], + layer_num=num_layer, + activation=activation, + norm_type=norm_type, + last_linear_layer_init_zero=last_linear_layer_init_zero, + ) # inputs,64,32 + + if self.similarity_function == 'embedded_gaussian': + self.w_a = nn.Parameter(torch.randn(self.X_dim, self.X_dim)) + elif self.similarity_function == 'concatenation': + # TODO: fix the dim size + self.w_a = MLP( + in_channels=2 * X_dim, + hidden_channels=2 * X_dim, + out_channels=1, + layer_num=1, + ) + + embedding_dim = self.X_dim + self.Ws = torch.nn.ParameterList() + for i in 
range(self.num_layer): + if i == 0: + self.Ws.append(nn.Parameter(torch.randn(self.X_dim, embedding_dim))) + elif i == self.num_layer - 1: + self.Ws.append(nn.Parameter(torch.randn(embedding_dim, final_state_dim))) + else: + self.Ws.append(nn.Parameter(torch.randn(embedding_dim, embedding_dim))) + + # TODO: for visualization + self.A = None + + def compute_similarity_matrix(self, X): + if self.similarity_function == 'embedded_gaussian': + A = torch.matmul(torch.matmul(X, self.w_a), X.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(A, dim=2) + elif self.similarity_function == 'gaussian': + A = torch.matmul(X, X.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(A, dim=2) + elif self.similarity_function == 'cosine': + A = torch.matmul(X, X.permute(0, 2, 1)) + magnitudes = torch.norm(A, dim=2, keepdim=True) + norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) + normalized_A = torch.div(A, norm_matrix) + elif self.similarity_function == 'cosine_softmax': + A = torch.matmul(X, X.permute(0, 2, 1)) + magnitudes = torch.norm(A, dim=2, keepdim=True) + norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(torch.div(A, norm_matrix), dim=2) + elif self.similarity_function == 'concatenation': + indices = [pair for pair in itertools.product(list(range(X.size(1))), repeat=2)] + selected_features = torch.index_select(X, dim=1, index=torch.LongTensor(indices).reshape(-1)) + pairwise_features = selected_features.reshape((-1, X.size(1) * X.size(1), X.size(2) * 2)) + A = self.w_a(pairwise_features).reshape(-1, X.size(1), X.size(1)) + normalized_A = A + elif self.similarity_function == 'squared': + A = torch.matmul(X, X.permute(0, 2, 1)) + squared_A = A * A + normalized_A = squared_A / torch.sum(squared_A, dim=2, keepdim=True) + elif self.similarity_function == 'equal_attention': + normalized_A = (torch.ones(X.size(1), X.size(1)) / X.size(1)).expand(X.size(0), X.size(1), X.size(1)) + elif self.similarity_function == 'diagonal': + normalized_A = (torch.eye(X.size(1), X.size(1))).expand(X.size(0), X.size(1), X.size(1)) + else: + raise NotImplementedError + + return normalized_A + + def forward(self, state): + state = state.to(self.w_r[0].weight.dtype) + if isinstance(state, dict): + robot_states = state['robot_state'] + human_states = state['human_state'] + elif isinstance(state, torch.Tensor): + if state.dim() == 3: + # state shape:(B, stack_num*(robot_num+human_num), state_dim) + stack_num = state.size(1) // (self.robot_num + self.human_num) + # robot_states shape:(B, stack_num*robot_num, state_dim) + robot_states = state[:, :stack_num * self.robot_num, :] + # human_states shape:(B, stack_num*human_num, state_dim) + human_states = state[:, stack_num * self.robot_num:, :] + elif state.dim() == 2: + # state shape:(B, stack_num*(robot_num+human_num)*state_dim) + stack_num = state.size(1) // ((self.robot_num + self.human_num) * self.robot_state_dim) + assert stack_num == 1, "stack_num should be 1 for 1-dim-array obs" + # robot_states shape:(B, stack_num*robot_num, state_dim) + robot_states = state[:, :stack_num * self.robot_num * self.robot_state_dim].reshape(-1, self.robot_num, self.robot_state_dim) + # human_states shape:(B, stack_num*human_num, state_dim) + human_states = state[:, stack_num * self.robot_num * self.robot_state_dim:].reshape(-1, self.human_num, self.human_state_dim) + + # compute feature matrix X + robot_state_embedings = self.w_r(robot_states) # batch x num x embedding_dim + human_state_embedings = 
self.w_h(human_states) + X = torch.cat([robot_state_embedings, human_state_embedings], dim=1) + + # compute matrix A + if not self.layerwise_graph: + normalized_A = self.compute_similarity_matrix(X) + self.A = normalized_A[0, :, :].data.cpu().numpy() # total_num x total_num + + # next_H = H = X + + H = X.contiguous().clone() + next_H = H.contiguous().clone() # batch x total_num x embedding_dim + for i in range(self.num_layer): # 2 + if self.layerwise_graph: # False + A = self.compute_similarity_matrix(H) + next_H = nn.functional.relu(torch.matmul(torch.matmul(A, H), self.Ws[i])) + else: # (A x H) x W_i + next_H = nn.functional.relu(torch.matmul(torch.matmul(normalized_A, H), self.Ws[i])) + + if self.skip_connection: + # next_H += H + next_H = next_H + H + H = next_H.contiguous().clone() + + return next_H + +class RepresentationNetworkGCN(nn.Module): + + def __init__( + self, + robot_state_dim: int, + human_state_dim: int, + robot_num: int, + human_num: int, + hidden_channels: int = 64, + layer_num: int = 2, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + norm_type: Optional[str] = 'BN', + ) -> torch.Tensor: + """ + Overview: + Representation network used in MuZero and derived algorithms. + Arguments: + - robot_state_dim (:obj:`int`): The dimension of robot state. + - human_state_dim (:obj:`int`): The dimension of human state. + - robot_num (:obj:`int`): The number of robots. + - human_num (:obj:`int`): The number of humans. + - num_res_blocks (:obj:`int`): The number of residual blocks. + - hidden_channels (:obj:`int`): The channel of output hidden state. + - downsample (:obj:`bool`): Whether to do downsampling for observations in ``representation_network``, \ + defaults to True. This option is often used in video games like Atari. In board games like go, \ + we don't need this module. + - activation (:obj:`nn.Module`): The activation function used in network, defaults to nn.ReLU(). \ + Use the inplace operation to speed up. + - last_linear_layer_init_zero (:obj:`bool`): Whether to initialize the last linear layer with zeros, \ + which can provide stable zero outputs in the beginning, defaults to True. + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + """ + super().__init__() + self.robot_state_dim = robot_state_dim + self.human_state_dim = human_state_dim + self.hidden_channels = hidden_channels + self.similarity_function = 'embedded_gaussian' + self.robot_num = robot_num + self.human_num = human_num + self.rgcn = RGCNLayer( + robot_num=self.robot_num, + human_num=self.human_num, + robot_state_dim=self.robot_state_dim, + human_state_dim=self.human_state_dim, + similarity_function=self.similarity_function, + num_layer=2, + X_dim=hidden_channels, + final_state_dim=hidden_channels, + wr_dims=[hidden_channels, hidden_channels], # TODO: check dim + wh_dims=[hidden_channels, hidden_channels], + layerwise_graph=False, + skip_connection=True, + norm_type=None, + ) + mlp_input_shape = (robot_num + human_num) * hidden_channels + self.fc_representation = MLP( + in_channels=mlp_input_shape, + hidden_channels=hidden_channels, + out_channels=hidden_channels, + layer_num=layer_num, + activation=activation, + norm_type=norm_type, + # don't use activation and norm in the last layer of representation network is important for convergence. + output_activation=False, + output_norm=False, + # last_linear_layer_init_zero=True is beneficial for convergence speed. 
+ last_linear_layer_init_zero=True, + ) + + def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor: + """ + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size, N is the length of vector observation. + - output (:obj:`torch.Tensor`): :math:`(B, hidden_channels)`, where B is batch size. + """ + gcn_embedding = self.rgcn(x) + gcn_embedding = gcn_embedding.view(gcn_embedding.shape[0], -1) # (B,M,N) -> (B,M*N) + return self.fc_representation(gcn_embedding) \ No newline at end of file diff --git a/lzero/model/muzero_model_gcn.py b/lzero/model/muzero_model_gcn.py new file mode 100644 index 000000000..6e2c808a6 --- /dev/null +++ b/lzero/model/muzero_model_gcn.py @@ -0,0 +1,457 @@ +from typing import Optional, Tuple, Dict + +import torch +import torch.nn as nn +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType + +from .common import MZNetworkOutput, RepresentationNetworkMLP, PredictionNetworkMLP +from .common_gcn import RepresentationNetworkGCN +from .utils import renormalize, get_params_mean, get_dynamic_mean, get_reward_mean + + +@MODEL_REGISTRY.register('MuZeroModelGCN') +class MuZeroModelGCN(nn.Module): + + def __init__( + self, + robot_state_dim: int, + human_state_dim: int, + robot_num: int, + human_num: int, + action_space_size: int, + latent_state_dim: int = 64, + fc_reward_layers: SequenceType = [32], + fc_value_layers: SequenceType = [32], + fc_policy_layers: SequenceType = [32], + reward_support_size: int = 601, + value_support_size: int = 601, + proj_hid: int = 1024, + proj_out: int = 1024, + pred_hid: int = 512, + pred_out: int = 1024, + self_supervised_learning_loss: bool = False, + categorical_distribution: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + state_norm: bool = False, + discrete_action_encoding_type: str = 'one_hot', + norm_type: Optional[str] = 'BN', + res_connection_in_dynamics: bool = False, + *args, + **kwargs + ): + """ + Overview: + The definition of the network model of MuZero, which is a generalization version for 1D vector obs. + The networks are mainly built on fully connected layers. + The representation network is an MLP network which maps the raw observation to a latent state. + The dynamics network is an MLP network which predicts the next latent state, and reward given the current latent state and action. + The prediction network is an MLP network which predicts the value and policy given the current latent state. + Arguments: + - observation_shape (:obj:`int`): Observation space shape, e.g. 8 for Lunarlander. + - action_space_size: (:obj:`int`): Action space size, e.g. 4 for Lunarlander. + - latent_state_dim (:obj:`int`): The dimension of latent state, such as 256. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head). + - fc_policy_layers (:obj:`SequenceType`): The number of hidden layers used in policy head (MLP head). + - reward_support_size (:obj:`int`): The size of categorical reward output + - value_support_size (:obj:`int`): The size of categorical value output. + - proj_hid (:obj:`int`): The size of projection hidden layer. + - proj_out (:obj:`int`): The size of projection output layer. + - pred_hid (:obj:`int`): The size of prediction hidden layer. + - pred_out (:obj:`int`): The size of prediction output layer. 
+ - self_supervised_learning_loss (:obj:`bool`): Whether to use self_supervised_learning related networks in MuZero model, default set it to False. + - categorical_distribution (:obj:`bool`): Whether to use discrete support to represent categorical distribution for value, reward/value_prefix. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - state_norm (:obj:`bool`): Whether to use normalization for latent states, default sets it to True. + - discrete_action_encoding_type (:obj:`str`): The encoding type of discrete action, which can be 'one_hot' or 'not_one_hot'. + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection for dynamics network, default set it to False. + """ + super(MuZeroModelGCN, self).__init__() + self.categorical_distribution = categorical_distribution + if not self.categorical_distribution: + self.reward_support_size = 1 + self.value_support_size = 1 + else: + self.reward_support_size = reward_support_size + self.value_support_size = value_support_size + + self.action_space_size = action_space_size + self.continuous_action_space = False + # The dim of action space. For discrete action space, it is 1. + # For continuous action space, it is the dimension of continuous action. + self.action_space_dim = action_space_size if self.continuous_action_space else 1 + assert discrete_action_encoding_type in ['one_hot', 'not_one_hot'], discrete_action_encoding_type + self.discrete_action_encoding_type = discrete_action_encoding_type + if self.continuous_action_space: + self.action_encoding_dim = action_space_size + else: + if self.discrete_action_encoding_type == 'one_hot': + self.action_encoding_dim = action_space_size + elif self.discrete_action_encoding_type == 'not_one_hot': + self.action_encoding_dim = 1 + + self.latent_state_dim = latent_state_dim + self.proj_hid = proj_hid + self.proj_out = proj_out + self.pred_hid = pred_hid + self.pred_out = pred_out + self.self_supervised_learning_loss = self_supervised_learning_loss + self.last_linear_layer_init_zero = last_linear_layer_init_zero + self.state_norm = state_norm + self.res_connection_in_dynamics = res_connection_in_dynamics + + self.representation_network = RepresentationNetworkGCN( + robot_state_dim = robot_state_dim, + human_state_dim = human_state_dim, + robot_num = robot_num, + human_num = human_num, + hidden_channels=self.latent_state_dim, + layer_num=2, + norm_type=norm_type + ) + + self.dynamics_network = DynamicsNetwork( + action_encoding_dim=self.action_encoding_dim, + num_channels=self.latent_state_dim + self.action_encoding_dim, + common_layer_num=2, + fc_reward_layers=fc_reward_layers, + output_support_size=self.reward_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + res_connection_in_dynamics=self.res_connection_in_dynamics, + ) + + self.prediction_network = PredictionNetworkMLP( + action_space_size=action_space_size, + num_channels=latent_state_dim, + fc_value_layers=fc_value_layers, + fc_policy_layers=fc_policy_layers, + output_support_size=self.value_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type + ) + + if self.self_supervised_learning_loss: 
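+ # The projector is a three-layer MLP with BatchNorm (proj_hid -> proj_hid -> proj_out) and the
+ # prediction head is a two-layer MLP (pred_hid -> pred_out), following the SimSiam-style consistency
+ # loss used in EfficientZero; project() below applies the prediction head when with_grad=True and
+ # returns the detached projection otherwise (the target branch).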
+ # self_supervised_learning_loss related network proposed in EfficientZero + self.projection_input_dim = latent_state_dim + + self.projection = nn.Sequential( + nn.Linear(self.projection_input_dim, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_out), nn.BatchNorm1d(self.proj_out) + ) + self.prediction_head = nn.Sequential( + nn.Linear(self.proj_out, self.pred_hid), + nn.BatchNorm1d(self.pred_hid), + activation, + nn.Linear(self.pred_hid, self.pred_out), + ) + + def initial_inference(self, obs: torch.Tensor) -> MZNetworkOutput: + """ + Overview: + Initial inference of MuZero model, which is the first step of the MuZero model. + To perform the initial inference, we first use the representation network to obtain the "latent_state" of the observation. + Then we use the prediction network to predict the "value" and "policy_logits" of the "latent_state", and + also prepare the zeros-like ``reward`` for the next step of the MuZero model. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns (MZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. \ + In initial inference, we set it to zero vector. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The hidden state of LSTM about reward. In initial inference, \ + we set it to the zeros-like hidden state (H and C). + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + batch_size = obs.size(0) + latent_state = self._representation(obs) + policy_logits, value = self._prediction(latent_state) + return MZNetworkOutput( + value, + [0. for _ in range(batch_size)], + policy_logits, + latent_state, + ) + + def recurrent_inference(self, latent_state: torch.Tensor, action: torch.Tensor) -> MZNetworkOutput: + """ + Overview: + Recurrent inference of MuZero model, which is the rollout step of the MuZero model. + To perform the recurrent inference, we first use the dynamics network to predict ``next_latent_state``, + ``reward`` by the given current ``latent_state`` and ``action``. + We then use the prediction network to predict the ``value`` and ``policy_logits``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input obs. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns (MZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - next_latent_state (:obj:`torch.Tensor`): The predicted next latent state. 
+ Shapes: + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + next_latent_state, reward = self._dynamics(latent_state, action) + policy_logits, value = self._prediction(next_latent_state) + return MZNetworkOutput(value, reward, policy_logits, next_latent_state) + + def _representation(self, observation: torch.Tensor) -> Tuple[torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + latent_state = self.representation_network(observation) + if self.state_norm: + latent_state = renormalize(latent_state) + return latent_state + + def _prediction(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + """ + policy_logits, value = self.prediction_network(latent_state) + return policy_logits, value + + def _dynamics(self, latent_state: torch.Tensor, action: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Concatenate ``latent_state`` and ``action`` and use the dynamics network to predict ``next_latent_state`` + ``reward`` and ``next_reward_hidden_state``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns: + - next_latent_state (:obj:`torch.Tensor`): The predicted latent state of the next timestep. + - next_reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. 
+ - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + """ + # NOTE: the discrete action encoding type is important for some environments + + # discrete action space + if self.discrete_action_encoding_type == 'one_hot': + # Stack latent_state with the one hot encoded action + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + + # transform action to one-hot encoding. + # action_one_hot shape: (batch_size, action_space_size), e.g., (8, 4) + action_one_hot = torch.zeros(action.shape[0], self.action_space_size, device=action.device) + # transform action to torch.int64 + action = action.long() + action_one_hot.scatter_(1, action, 1) + action_encoding = action_one_hot + elif self.discrete_action_encoding_type == 'not_one_hot': + action_encoding = action / self.action_space_size + if len(action_encoding.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action_encoding = action_encoding.unsqueeze(-1) + + action_encoding = action_encoding.to(latent_state.device).float() + # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or + # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type. + state_action_encoding = torch.cat((latent_state, action_encoding), dim=1) + + next_latent_state, reward = self.dynamics_network(state_action_encoding) + + if not self.state_norm: + return next_latent_state, reward + else: + next_latent_state_normalized = renormalize(next_latent_state) + return next_latent_state_normalized, reward + + def project(self, latent_state: torch.Tensor, with_grad=True) -> torch.Tensor: + """ + Overview: + Project the latent state to a lower dimension to calculate the self-supervised loss, which is proposed in EfficientZero. + For more details, please refer to the paper ``Exploring Simple Siamese Representation Learning``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - with_grad (:obj:`bool`): Whether to calculate gradient for the projection result. + Returns: + - proj (:obj:`torch.Tensor`): The result embedding vector of projection operation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - proj (:obj:`torch.Tensor`): :math:`(B, projection_output_dim)`, where B is batch_size. + + Examples: + >>> latent_state = torch.randn(256, 64) + >>> output = self.project(latent_state) + >>> output.shape # (256, 1024) + """ + proj = self.projection(latent_state) + + if with_grad: + # with grad, use prediction_head + return self.prediction_head(proj) + else: + return proj.detach() + + def get_params_mean(self) -> float: + return get_params_mean(self) + + +class DynamicsNetwork(nn.Module): + + def __init__( + self, + action_encoding_dim: int = 2, + num_channels: int = 64, + common_layer_num: int = 2, + fc_reward_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + res_connection_in_dynamics: bool = False, + ): + """ + Overview: + The definition of dynamics network in MuZero algorithm, which is used to predict next latent state + reward by the given current latent state and action. + The networks are mainly built on fully connected layers. 
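+ When ``res_connection_in_dynamics`` is True, the next latent state is fc_dynamics_1(state_action_encoding)
+ plus a residual link to the input latent state, and the reward head reads from fc_dynamics_2 applied to
+ that sum; otherwise a single fc_dynamics MLP produces the next latent state directly.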
+ Arguments: + - action_encoding_dim (:obj:`int`): The dimension of action encoding. + - num_channels (:obj:`int`): The num of channels in latent states. + - common_layer_num (:obj:`int`): The number of common layers in dynamics network. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - output_support_size (:obj:`int`): The size of categorical reward output. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection in dynamics network. + """ + super().__init__() + self.num_channels = num_channels + self.action_encoding_dim = action_encoding_dim + self.latent_state_dim = self.num_channels - self.action_encoding_dim + + self.res_connection_in_dynamics = res_connection_in_dynamics + if self.res_connection_in_dynamics: + self.fc_dynamics_1 = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + self.fc_dynamics_2 = MLP( + in_channels=self.latent_state_dim, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + else: + self.fc_dynamics = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + self.fc_reward_head = MLP( + in_channels=self.latent_state_dim, + hidden_channels=fc_reward_layers[0], + layer_num=2, + out_channels=output_support_size, + activation=activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + def forward(self, state_action_encoding: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Forward computation of the dynamics network. Predict the next latent state given current latent state and action. + Arguments: + - state_action_encoding (:obj:`torch.Tensor`): The state-action encoding, which is the concatenation of \ + latent state and action encoding, with shape (batch_size, num_channels, height, width). + Returns: + - next_latent_state (:obj:`torch.Tensor`): The next latent state, with shape (batch_size, latent_state_dim). + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + """ + if self.res_connection_in_dynamics: + # take the state encoding (e.g. 
latent_state), + # state_action_encoding[:, -self.action_encoding_dim:] is action encoding + latent_state = state_action_encoding[:, :-self.action_encoding_dim] + x = self.fc_dynamics_1(state_action_encoding) + # the residual link: add the latent_state to the state_action encoding + next_latent_state = x + latent_state + next_latent_state_encoding = self.fc_dynamics_2(next_latent_state) + else: + next_latent_state = self.fc_dynamics(state_action_encoding) + next_latent_state_encoding = next_latent_state + + reward = self.fc_reward_head(next_latent_state_encoding) + + return next_latent_state, reward + + def get_dynamic_mean(self) -> float: + return get_dynamic_mean(self) + + def get_reward_mean(self) -> float: + return get_reward_mean(self) + diff --git a/lzero/model/sampled_efficientzero_model_gcn.py b/lzero/model/sampled_efficientzero_model_gcn.py new file mode 100644 index 000000000..5747736eb --- /dev/null +++ b/lzero/model/sampled_efficientzero_model_gcn.py @@ -0,0 +1,534 @@ +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from ding.model.common import ReparameterizationHead +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType + +from .common import EZNetworkOutput +from .common_gcn import RepresentationNetworkGCN +from .efficientzero_model_mlp import DynamicsNetworkMLP +from .utils import renormalize, get_params_mean + + +@MODEL_REGISTRY.register('SampledEfficientZeroModelMLP') +class SampledEfficientZeroModelMLP(nn.Module): + + def __init__( + self, + robot_state_dim: int = 10, + human_state_dim: int = 10, + robot_num: int = 5, + human_num: int = 5, + action_space_size: int = 6, + latent_state_dim: int = 256, + lstm_hidden_size: int = 512, + fc_reward_layers: SequenceType = [32], + fc_value_layers: SequenceType = [32], + fc_policy_layers: SequenceType = [32], + reward_support_size: int = 601, + value_support_size: int = 601, + proj_hid: int = 1024, + proj_out: int = 1024, + pred_hid: int = 512, + pred_out: int = 1024, + self_supervised_learning_loss: bool = True, + categorical_distribution: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + state_norm: bool = False, + # ============================================================== + # specific sampled related config + # ============================================================== + continuous_action_space: bool = False, + num_of_sampled_actions: int = 6, + sigma_type='conditioned', + fixed_sigma_value: float = 0.3, + bound_type: str = None, + norm_type: str = 'BN', + discrete_action_encoding_type: str = 'one_hot', + res_connection_in_dynamics: bool = False, + *args, + **kwargs, + ): + """ + Overview: + The definition of the network model of Sampled EfficientZero, which is a generalization version for 1D vector obs. + The networks are mainly built on fully connected layers. + Sampled EfficientZero model consists of a representation network, a dynamics network and a prediction network. + The representation network is an MLP network which maps the raw observation to a latent state. + The dynamics network is an MLP+LSTM network which predicts the next latent state, reward_hidden_state and value_prefix given the current latent state and action. + The prediction network is an MLP network which predicts the value and policy given the current latent state. + Arguments: + - observation_shape (:obj:`int`): Observation space shape, e.g. 8 for Lunarlander. 
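+ - robot_state_dim (:obj:`int`): The dimension of each robot state vector.
+ - human_state_dim (:obj:`int`): The dimension of each human state vector.
+ - robot_num (:obj:`int`): The number of robots (UAVs) in the scene.
+ - human_num (:obj:`int`): The number of humans in the scene.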
+            - robot_state_dim (:obj:`int`): The dimension of a single robot (UAV) state, e.g. 4 for CrowdSim.
+            - human_state_dim (:obj:`int`): The dimension of a single human state, e.g. 4 for CrowdSim.
+            - robot_num (:obj:`int`): The number of robots (UAVs) in the environment.
+            - human_num (:obj:`int`): The number of humans in the environment.
+            - action_space_size: (:obj:`int`): Action space size, which is an integer number. For discrete action space, it is the number of discrete actions, \
+                e.g. 4 for Lunarlander. For continuous action space, it is the dimension of the continuous action, e.g. 4 for bipedalwalker.
+            - latent_state_dim (:obj:`int`): The dimension of latent state, such as 256.
+            - lstm_hidden_size (:obj:`int`): The hidden size of LSTM in dynamics network to predict value_prefix.
+            - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head).
+            - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head).
+            - fc_policy_layers (:obj:`SequenceType`): The number of hidden layers used in policy head (MLP head).
+            - reward_support_size (:obj:`int`): The size of categorical reward output.
+            - value_support_size (:obj:`int`): The size of categorical value output.
+            - proj_hid (:obj:`int`): The size of projection hidden layer.
+            - proj_out (:obj:`int`): The size of projection output layer.
+            - pred_hid (:obj:`int`): The size of prediction hidden layer.
+            - pred_out (:obj:`int`): The size of prediction output layer.
+            - self_supervised_learning_loss (:obj:`bool`): Whether to use self_supervised_learning related networks in Sampled EfficientZero model, default set it to True.
+            - categorical_distribution (:obj:`bool`): Whether to use discrete support to represent categorical distribution for value, reward/value_prefix.
+            - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often uses in-place \
+                operation to speed up, e.g. ReLU(inplace=True).
+            - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True.
+            - state_norm (:obj:`bool`): Whether to use normalization for latent states, default sets it to False.
+            # ==============================================================
+            # specific sampled related config
+            # ==============================================================
+            - continuous_action_space (:obj:`bool`): The type of action space. Default sets it to False.
+            - num_of_sampled_actions (:obj:`int`): The number of sampled actions, i.e. the K in the original Sampled MuZero paper.
+            # see ``ReparameterizationHead`` in ``ding.model.common.head`` for more details about the following arguments.
+            - sigma_type (:obj:`str`): The type of sigma in policy head of prediction network, options={'conditioned', 'fixed'}.
+            - fixed_sigma_value (:obj:`float`): The fixed sigma value in policy head of prediction network.
+            - bound_type (:obj:`str`): The type of bound in networks. Default sets it to None.
+            - norm_type (:obj:`str`): The type of normalization in networks. Default sets it to 'BN'.
+            - discrete_action_encoding_type (:obj:`str`): The type of encoding for discrete action. Default sets it to 'one_hot'. options = {'one_hot', 'not_one_hot'}
+            - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection for dynamics network, default set it to False.
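+        Examples:
+            >>> # Illustrative usage only; the sizes below are placeholders (2 UAVs, 10 humans, 4-dim states)
+            >>> # and the observation follows the '2-dim-array' layout of shape (B, robot_num + human_num, state_dim).
+            >>> model = SampledEfficientZeroModelMLP(
+            ...     robot_state_dim=4, human_state_dim=4, robot_num=2, human_num=10,
+            ...     action_space_size=25, latent_state_dim=256)
+            >>> obs = torch.randn(8, 2 + 10, 4)
+            >>> output = model.initial_inference(obs)
+            >>> output.latent_state.shape  # torch.Size([8, 256])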
+ """ + super(SampledEfficientZeroModelMLP, self).__init__() + if not categorical_distribution: + self.reward_support_size = 1 + self.value_support_size = 1 + else: + self.reward_support_size = reward_support_size + self.value_support_size = value_support_size + + self.continuous_action_space = continuous_action_space + # self.observation_shape = observation_shape + self.robot_state_dim = robot_state_dim + self.human_state_dim = human_state_dim + self.robot_num = robot_num + self.human_num = human_num + self.action_space_size = action_space_size + # The dim of action space. For discrete action space, it is 1. + # For continuous action space, it is the dimension of continuous action. + self.action_space_dim = action_space_size if self.continuous_action_space else 1 + assert discrete_action_encoding_type in ['one_hot', 'not_one_hot'], discrete_action_encoding_type + self.discrete_action_encoding_type = discrete_action_encoding_type + if self.continuous_action_space: + self.action_encoding_dim = action_space_size + else: + if self.discrete_action_encoding_type == 'one_hot': + self.action_encoding_dim = action_space_size + elif self.discrete_action_encoding_type == 'not_one_hot': + self.action_encoding_dim = 1 + + self.lstm_hidden_size = lstm_hidden_size + self.latent_state_dim = latent_state_dim + self.fc_reward_layers = fc_reward_layers + self.fc_value_layers = fc_value_layers + self.fc_policy_layers = fc_policy_layers + self.proj_hid = proj_hid + self.proj_out = proj_out + self.pred_hid = pred_hid + self.pred_out = pred_out + + self.last_linear_layer_init_zero = last_linear_layer_init_zero + self.state_norm = state_norm + self.self_supervised_learning_loss = self_supervised_learning_loss + + self.sigma_type = sigma_type + self.fixed_sigma_value = fixed_sigma_value + self.bound_type = bound_type + self.norm_type = norm_type + self.num_of_sampled_actions = num_of_sampled_actions + self.res_connection_in_dynamics = res_connection_in_dynamics + + self.representation_network = RepresentationNetworkGCN( + robot_state_dim = robot_state_dim, + human_state_dim = human_state_dim, + robot_num = robot_num, + human_num = human_num, + hidden_channels=self.latent_state_dim, + norm_type=norm_type + ) + + self.dynamics_network = DynamicsNetworkMLP( + action_encoding_dim=self.action_encoding_dim, + num_channels=self.latent_state_dim + self.action_encoding_dim, + common_layer_num=2, + lstm_hidden_size=self.lstm_hidden_size, + fc_reward_layers=self.fc_reward_layers, + output_support_size=self.reward_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + res_connection_in_dynamics=self.res_connection_in_dynamics, + ) + + self.prediction_network = PredictionNetworkMLP( + continuous_action_space=self.continuous_action_space, + action_space_size=self.action_space_size, + num_channels=self.latent_state_dim, + fc_value_layers=self.fc_value_layers, + fc_policy_layers=self.fc_policy_layers, + output_support_size=self.value_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + sigma_type=self.sigma_type, + fixed_sigma_value=self.fixed_sigma_value, + bound_type=self.bound_type, + norm_type=self.norm_type, + ) + + if self.self_supervised_learning_loss: + # self_supervised_learning_loss related network proposed in EfficientZero + self.projection_input_dim = latent_state_dim + self.projection = nn.Sequential( + nn.Linear(self.projection_input_dim, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_hid), 
nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_out), nn.BatchNorm1d(self.proj_out) + ) + self.prediction_head = nn.Sequential( + nn.Linear(self.proj_out, self.pred_hid), + nn.BatchNorm1d(self.pred_hid), + activation, + nn.Linear(self.pred_hid, self.pred_out), + ) + + def initial_inference(self, obs: torch.Tensor) -> EZNetworkOutput: + """ + Overview: + Initial inference of SampledEfficientZero model, which is the first step of the SampledEfficientZero model. + To perform the initial inference, we first use the representation network to obtain the "latent_state" of the observation. + Then we use the prediction network to predict the "value" and "policy_logits" of the "latent_state", and + also prepare the zeros-like ``reward_hidden_state`` for the next step of the Sampled EfficientZero model. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns (EZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. \ + In initial inference, we set it to zero vector. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The hidden state of LSTM about reward. In initial inference, \ + we set it to the zeros-like hidden state (H and C). + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size. + """ + batch_size = obs.size(0) + latent_state = self._representation(obs) + policy_logits, value = self._prediction(latent_state) + # zero initialization for reward hidden states + # (hn, cn), each element shape is (layer_num=1, batch_size, lstm_hidden_size) + reward_hidden_state = ( + torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device), torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device) + ) + return EZNetworkOutput(value, [0. for _ in range(batch_size)], policy_logits, latent_state, reward_hidden_state) + + def recurrent_inference( + self, latent_state: torch.Tensor, reward_hidden_state: torch.Tensor, action: torch.Tensor + ) -> EZNetworkOutput: + """ + Overview: + Recurrent inference of Sampled EfficientZero model, which is the rollout step of the Sampled EfficientZero model. + To perform the recurrent inference, we first use the dynamics network to predict ``next_latent_state``, + ``reward_hidden_state``, ``value_prefix`` by the given current ``latent_state`` and ``action``. + We then use the prediction network to predict the ``value`` and ``policy_logits``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. 
+ Returns (EZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - next_latent_state (:obj:`torch.Tensor`): The predicted next latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + Shapes: + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size. + """ + next_latent_state, reward_hidden_state, value_prefix = self._dynamics(latent_state, reward_hidden_state, action) + policy_logits, value = self._prediction(next_latent_state) + return EZNetworkOutput(value, value_prefix, policy_logits, next_latent_state, reward_hidden_state) + + def _representation(self, observation: torch.Tensor) -> Tuple[torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + latent_state = self.representation_network(observation) + if self.state_norm: + latent_state = renormalize(latent_state) + return latent_state + + def _prediction(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + """ + policy, value = self.prediction_network(latent_state) + return policy, value + + def _dynamics(self, latent_state: torch.Tensor, reward_hidden_state: Tuple, + action: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor], torch.Tensor]: + """ + Overview: + Concatenate ``latent_state`` and ``action`` and use the dynamics network to predict ``next_latent_state`` + ``value_prefix`` and ``next_reward_hidden_state``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. 
+ - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns: + - next_latent_state (:obj:`torch.Tensor`): The predicted latent state of the next timestep. + - next_reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + """ + # NOTE: the discrete action encoding type is important for some environments + + if not self.continuous_action_space: + # discrete action space + if self.discrete_action_encoding_type == 'one_hot': + # Stack latent_state with the one hot encoded action + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + + # transform action to one-hot encoding. + # action_one_hot shape: (batch_size, action_space_size), e.g., (8, 4) + action_one_hot = torch.zeros(action.shape[0], self.action_space_size, device=action.device) + # transform action to torch.int64 + action = action.long() + action_one_hot.scatter_(1, action, 1) + action_encoding = action_one_hot + elif self.discrete_action_encoding_type == 'not_one_hot': + action_encoding = action / self.action_space_size + if len(action_encoding.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action_encoding = action_encoding.unsqueeze(-1) + else: + # continuous action space + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + elif len(action.shape) == 3: + # (batch_size, action_dim, 1) -> (batch_size, action_dim) + # e.g., torch.Size([8, 2, 1]) -> torch.Size([8, 2]) + action = action.squeeze(-1) + + action_encoding = action + + action_encoding = action_encoding.to(latent_state.device).float() + # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or + # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type. + state_action_encoding = torch.cat((latent_state, action_encoding), dim=1) + + next_latent_state, next_reward_hidden_state, value_prefix = self.dynamics_network( + state_action_encoding, reward_hidden_state + ) + + if not self.state_norm: + return next_latent_state, next_reward_hidden_state, value_prefix + else: + next_latent_state_normalized = renormalize(next_latent_state) + return next_latent_state_normalized, next_reward_hidden_state, value_prefix + + def project(self, latent_state: torch.Tensor, with_grad=True) -> torch.Tensor: + """ + Overview: + Project the latent state to a lower dimension to calculate the self-supervised loss, which is proposed in EfficientZero. + For more details, please refer to the paper ``Exploring Simple Siamese Representation Learning``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - with_grad (:obj:`bool`): Whether to calculate gradient for the projection result. 
+        Returns:
+            - proj (:obj:`torch.Tensor`): The result embedding vector of projection operation.
+        Shapes:
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - proj (:obj:`torch.Tensor`): :math:`(B, projection_output_dim)`, where B is batch_size.
+
+        Examples:
+            >>> latent_state = torch.randn(256, 64)
+            >>> output = self.project(latent_state)
+            >>> output.shape # (256, 1024)
+        """
+        proj = self.projection(latent_state)
+
+        if with_grad:
+            # with grad, use prediction_head
+            return self.prediction_head(proj)
+        else:
+            return proj.detach()
+
+    def get_params_mean(self):
+        return get_params_mean(self)
+
+
+class PredictionNetworkMLP(nn.Module):
+
+    def __init__(
+        self,
+        continuous_action_space,
+        action_space_size,
+        num_channels,
+        common_layer_num: int = 2,
+        fc_value_layers: SequenceType = [32],
+        fc_policy_layers: SequenceType = [32],
+        output_support_size: int = 601,
+        last_linear_layer_init_zero: bool = True,
+        activation: Optional[nn.Module] = nn.ReLU(inplace=True),
+        # ==============================================================
+        # specific sampled related config
+        # ==============================================================
+        sigma_type='conditioned',
+        fixed_sigma_value: float = 0.3,
+        bound_type: str = None,
+        norm_type: str = 'BN',
+    ):
+        """
+        Overview:
+            The definition of policy and value prediction network, which is used to predict value and policy by the
+            given latent state.
+            The networks are mainly built on fully connected layers.
+        Arguments:
+            - continuous_action_space (:obj:`bool`): The type of action space. Default sets it to False.
+            - action_space_size: (:obj:`int`): Action space size, usually an integer number. For discrete action \
+                space, it is the number of discrete actions. For continuous action space, it is the dimension of \
+                continuous action.
+            - num_channels (:obj:`int`): The number of channels in latent states.
+            - fc_value_layers (:obj:`SequenceType`): The number of hidden layers of the value prediction head (MLP head).
+            - fc_policy_layers (:obj:`SequenceType`): The number of hidden layers of the policy prediction head (MLP head).
+            - output_support_size (:obj:`int`): The size of categorical value output.
+            - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True.
+            # ==============================================================
+            # specific sampled related config
+            # ==============================================================
+            # see ``ReparameterizationHead`` in ``ding.model.common.head`` for more details about the following arguments.
+            - sigma_type (:obj:`str`): The type of sigma in policy head of prediction network, options={'conditioned', 'fixed'}.
+            - fixed_sigma_value (:obj:`float`): The fixed sigma value in policy head of prediction network.
+            - bound_type (:obj:`str`): The type of bound in networks. Default sets it to None.
+            - norm_type (:obj:`str`): The type of normalization in networks. Default sets it to 'BN'.
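+        Examples:
+            >>> # Illustrative usage with placeholder sizes; not taken from any specific config.
+            >>> net = PredictionNetworkMLP(continuous_action_space=False, action_space_size=25, num_channels=256)
+            >>> latent_state = torch.randn(8, 256)
+            >>> policy, value = net(latent_state)
+            >>> policy.shape, value.shape  # (torch.Size([8, 25]), torch.Size([8, 601]))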
+ """ + super().__init__() + self.num_channels = num_channels + self.continuous_action_space = continuous_action_space + self.norm_type = norm_type + self.sigma_type = sigma_type + self.fixed_sigma_value = fixed_sigma_value + self.bound_type = bound_type + self.action_space_size = action_space_size + if self.continuous_action_space: + self.action_encoding_dim = self.action_space_size + else: + self.action_encoding_dim = 1 + + # ******* common backbone ****** + self.fc_prediction_common = MLP( + in_channels=self.num_channels, + hidden_channels=self.num_channels, + out_channels=self.num_channels, + layer_num=common_layer_num, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + # ******* value and policy head ****** + self.fc_value_head = MLP( + in_channels=self.num_channels, + hidden_channels=fc_value_layers[0], + out_channels=output_support_size, + layer_num=2, + activation=activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + # last_linear_layer_init_zero=True is beneficial for convergence speed. + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + # sampled related core code + if self.continuous_action_space: + self.fc_policy_head = ReparameterizationHead( + input_size=self.num_channels, + output_size=action_space_size, + layer_num=2, + sigma_type=self.sigma_type, + fixed_sigma_value=self.fixed_sigma_value, + activation=nn.ReLU(), + norm_type=None, + bound_type=self.bound_type + ) + else: + self.fc_policy_head = MLP( + in_channels=self.num_channels, + hidden_channels=fc_policy_layers[0], + out_channels=action_space_size, + layer_num=2, + activation=activation, + norm_type=self.norm_type, + output_activation=False, + output_norm=False, + # last_linear_layer_init_zero=True is beneficial for convergence speed. + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + def forward(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Forward computation of the prediction network. + Arguments: + - latent_state (:obj:`torch.Tensor`): input tensor with shape (B, in_channels). + Returns: + - policy (:obj:`torch.Tensor`): policy tensor. If action space is discrete, shape is (B, action_space_size). + If action space is continuous, shape is (B, action_space_size * 2). + - value (:obj:`torch.Tensor`): value tensor with shape (B, output_support_size). + """ + x_prediction_common = self.fc_prediction_common(latent_state) + value = self.fc_value_head(x_prediction_common) + + # sampled related core code + policy = self.fc_policy_head(x_prediction_common) + if self.continuous_action_space: + policy = torch.cat([policy['mu'], policy['sigma']], dim=-1) + + return policy, value diff --git a/lzero/model/tests/test_common_gcn.py b/lzero/model/tests/test_common_gcn.py new file mode 100644 index 000000000..a7574b3b8 --- /dev/null +++ b/lzero/model/tests/test_common_gcn.py @@ -0,0 +1,102 @@ +import torch +import numpy as np +from torch import nn +from lzero.model.common_gcn import RepresentationNetworkGCN, RGCNLayer + +# ... + +class TestLightZeroEnvWrapper: + + # ... 
+ def test_representation_network_gcn_with_dict_obs(self): + robot_state_dim = 10 + human_state_dim = 5 + robot_num = 3 + human_num = 2 + hidden_channels = 64 + layer_num = 2 + activation = nn.ReLU(inplace=True) + last_linear_layer_init_zero = True + norm_type = 'BN' + + representation_network = RepresentationNetworkGCN( + robot_state_dim=robot_state_dim, + human_state_dim=human_state_dim, + robot_num=robot_num, + human_num=human_num, + hidden_channels=hidden_channels, + layer_num=layer_num, + activation=activation, + last_linear_layer_init_zero=last_linear_layer_init_zero, + norm_type=norm_type, + ) + + # Create dummy input + batch_size = 4 + x = { + 'robot_state': torch.randn(batch_size, robot_num, robot_state_dim), + 'human_state': torch.randn(batch_size, human_num, human_state_dim) + } + + # Forward pass + output = representation_network(x) + + # Check output shape + assert output.shape == (batch_size, hidden_channels) + + # Check output type + assert isinstance(output, torch.Tensor) + + # Check intermediate shape + assert representation_network.rgcn(x).shape == (batch_size, robot_num + human_num, hidden_channels) + + # Check intermediate type + assert isinstance(representation_network.rgcn(x), torch.Tensor) + + def test_representation_network_gcn_with_2d_array_obs(self): + robot_state_dim = 10 + human_state_dim = 10 # 2d_array_obs, so the dimensions must be the same + robot_num = 3 + human_num = 2 + hidden_channels = 64 + layer_num = 2 + activation = nn.ReLU(inplace=True) + last_linear_layer_init_zero = True + norm_type = 'BN' + + representation_network = RepresentationNetworkGCN( + robot_state_dim=robot_state_dim, + human_state_dim=human_state_dim, + robot_num=robot_num, + human_num=human_num, + hidden_channels=hidden_channels, + layer_num=layer_num, + activation=activation, + last_linear_layer_init_zero=last_linear_layer_init_zero, + norm_type=norm_type, + ) + + # Create dummy input + batch_size = 4 + x = torch.randn(batch_size, robot_num + human_num, robot_state_dim) + + # Forward pass + output = representation_network(x) + + # Check output shape + assert output.shape == (batch_size, hidden_channels) + + # Check output type + assert isinstance(output, torch.Tensor) + + # Check intermediate shape + assert representation_network.rgcn(x).shape == (batch_size, robot_num + human_num, hidden_channels) + + # Check intermediate type + assert isinstance(representation_network.rgcn(x), torch.Tensor) + +if __name__ == '__main__': + test = TestLightZeroEnvWrapper() + test.test_representation_network_gcn_with_dict_obs() + test.test_representation_network_gcn_with_2d_array_obs() + print("All tests passed.") \ No newline at end of file diff --git a/lzero/model/tests/test_rgcn.py b/lzero/model/tests/test_rgcn.py new file mode 100644 index 000000000..32b90b6bb --- /dev/null +++ b/lzero/model/tests/test_rgcn.py @@ -0,0 +1,50 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +from itertools import product +import unittest +from lzero.model.common_gcn import RGCNLayer + +class TestRGCNLayer(unittest.TestCase): + def setUp(self): + self.robot_state_dim = 10 + self.human_state_dim = 10 + self.similarity_function = 'embedded_gaussian' + self.batch_size = 4 + self.num_nodes = 5 # Suppose 5 robots and 5 humans + + # Create a RGCNLayer object + self.rgcn_layer = RGCNLayer( + robot_state_dim=self.robot_state_dim, + human_state_dim=self.human_state_dim, + similarity_function=self.similarity_function, + num_layer=2, + X_dim=32, + layerwise_graph=False, + skip_connection=True + ) + + 
# Creating dummy inputs + self.state = { + 'robot_state': torch.randn(self.batch_size, self.num_nodes, self.robot_state_dim), + 'human_state': torch.randn(self.batch_size, self.num_nodes, self.human_state_dim) + } + + def test_forward_shape(self): + # Forward pass + output = self.rgcn_layer(self.state) + expected_shape = (self.batch_size, self.num_nodes * 2, 32) # Since final_state_dim is set to X_dim + self.assertEqual(output.shape, expected_shape, "Output shape is incorrect.") + + def test_similarity_function(self): + # Check if the similarity matrix computation is working as expected + # This only checks for one similarity function due to space constraints + if self.similarity_function == 'embedded_gaussian': + X = torch.randn(self.batch_size, self.num_nodes * 2, 32) + A = self.rgcn_layer.compute_similarity_matrix(X) + self.assertEqual(A.shape, (self.batch_size, self.num_nodes * 2, self.num_nodes * 2), "Similarity matrix shape is incorrect.") + self.assertTrue(torch.all(A >= 0) and torch.all(A <= 1), "Similarity matrix values should be normalized.") + +# Running the tests +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/lzero/policy/muzero.py b/lzero/policy/muzero.py index 0acc66b07..6e12c3c3c 100644 --- a/lzero/policy/muzero.py +++ b/lzero/policy/muzero.py @@ -222,6 +222,8 @@ def default_model(self) -> Tuple[str, List[str]]: return 'MuZeroModel', ['lzero.model.muzero_model'] elif self._cfg.model.model_type == "mlp": return 'MuZeroModelMLP', ['lzero.model.muzero_model_mlp'] + elif self._cfg.model.model_type == "rgcn": + return 'MuZeroModelGCN', ['lzero.model.muzero_model_gcn'] else: raise ValueError("model type {} is not supported".format(self._cfg.model.model_type)) @@ -644,6 +646,9 @@ def _get_target_obs_index_in_step_k(self, step): elif self._cfg.model.model_type == 'mlp': beg_index = self._cfg.model.observation_shape * step end_index = self._cfg.model.observation_shape * (step + self._cfg.model.frame_stack_num) + elif self._cfg.model.model_type == 'rgcn': + beg_index = self._cfg.model.observation_shape * step + end_index = self._cfg.model.observation_shape * (step + self._cfg.model.frame_stack_num) return beg_index, end_index def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1, @@ -670,7 +675,10 @@ def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1 ``visit_count_distribution_entropy``, ``value``, ``pred_value``, ``policy_logits``. """ self._eval_model.eval() - active_eval_env_num = data.shape[0] + if type(data) is dict: + active_eval_env_num = data['robot_state'].shape[0] + else: + active_eval_env_num = data.shape[0] with torch.no_grad(): # data shape [B, S x C, W, H], e.g. 
{Tensor:(B, 12, 96, 96)} network_output = self._collect_model.initial_inference(data) diff --git a/lzero/worker/muzero_collector.py b/lzero/worker/muzero_collector.py index 55f0e9f85..ae516960c 100644 --- a/lzero/worker/muzero_collector.py +++ b/lzero/worker/muzero_collector.py @@ -414,7 +414,7 @@ def collect(self, stack_obs = prepare_observation(stack_obs, self.policy_config.model.model_type) # stack_obs = torch.from_numpy(stack_obs).to(self.policy_config.device).float() - stack_obs = torch.from_numpy(stack_obs).to(self.policy_config.device) + stack_obs = torch.from_numpy(stack_obs).to(self.policy_config.device).float() # ============================================================== # policy forward diff --git a/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py index e611aa6b1..bb6b9ced3 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py @@ -13,7 +13,7 @@ max_env_step = int(3e5) reanalyze_ratio = 0. robot_num = 2 -human_num = 10 # purdue +human_num = 59 # purdue # human_num = 33 # NCSU # human_num = 92 # KAIST one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] @@ -23,9 +23,11 @@ CrowdSim_muzero_config = dict( exp_name= - f'result/crowd_num_human/CrowdSim_muzero_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', + f'result/CrowdSim_muzerogcn_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( + obs_mode='1-dim-array', env_name='CrowdSim-v0', + dataset = 'purdue', robot_num = robot_num, human_num = human_num, one_uav_action_space = one_uav_action_space, @@ -38,8 +40,14 @@ ), policy=dict( model=dict( - robot_observation_shape=(robot_num, 4), - human_observation_shape=(human_num, 4), + # robot_observation_shape=(robot_num, 4), + # human_observation_shape=(human_num, 4), + observation_shape=(robot_num + human_num)*4, + obs_mode='1-dim-array', + robot_state_dim = 4, + human_state_dim = 4, + robot_num = robot_num, + human_num = human_num, action_space_size=(len(one_uav_action_space))**robot_num, model_type='rgcn', lstm_hidden_size=256, @@ -75,9 +83,9 @@ CrowdSim_muzero_create_config = dict( env=dict( type='crowdsim_lightzero', - import_names=['zoo.CrowdSim.envs.CrowdSim_env'], + import_names=['zoo.CrowdSim.envs.crowdsim_lightzero_env'], ), - env_manager=dict(type='subprocess'), + env_manager=dict(type='base'), policy=dict( type='muzero', import_names=['lzero.policy.muzero'], diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index c27a888d5..fe058d8fd 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -3,10 +3,10 @@ import logging import random import gym -from shapely.geometry import Point +# from shapely.geometry import Point import numpy as np -import folium -from folium.plugins import TimestampedGeoJson, AntPath +# import folium +# from folium.plugins import TimestampedGeoJson, AntPath from zoo.CrowdSim.envs.Crowdsim.env.model.utils import * from zoo.CrowdSim.envs.Crowdsim.env.model.mdp import HumanState, RobotState, JointState diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py index 0bdedd202..e28a1baac 100644 --- a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -44,6 +44,16 @@ def __init__(self, cfg: dict = {}) -> None: self._action_space.seed(0) # default seed 
self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, ), dtype=np.float32) self._continuous = False + # obs_mode 'dict': {'robot_state': robot_state, 'human_state': human_state} + # obs_mode '2-dim-array': np.concatenate((robot_state, human_state), axis=0) + # obs_mode '1-dim-array': np.concatenate((robot_state, human_state), axis=0).flatten() + self.obs_mode = self._cfg.get('obs_mode', '2-dim-array') + assert self.obs_mode in ['dict', '2-dim-array', '1-dim-array'], "obs_mode should be 'dict' or '2-dim-array' or '1-dim-array'!" + # action_mode 'combine': combine all robot actions into one action, action space size = one_uav_action_n**robot_num + # action_mode 'separate': separate robot actions, shape (robot_num,), for each robot action space size = one_uav_action_n + self.action_mode = self._cfg.get('action_mode', 'combine') + assert self.action_mode in ['combine', 'separate'], "action_mode should be 'combine' or 'separate'!" + def reset(self) -> np.ndarray: if not self._init_flag: @@ -60,7 +70,13 @@ def reset(self) -> np.ndarray: # process obs raw_obs = self._env.reset() obs_list = list(raw_obs.to_tensor()) - obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + if self.obs_mode == 'dict': + obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + elif self.obs_mode == '2-dim-array': + # robot_state: (robot_num, 4), human_state: (human_num, 4) + obs = np.concatenate((obs_list[0], obs_list[1]), axis=0) + elif self.obs_mode == '1-dim-array': + obs = np.concatenate((obs_list[0], obs_list[1]), axis=0).flatten() action_mask = np.ones(self.action_space.n, 'int8') obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} @@ -77,13 +93,23 @@ def seed(self, seed: int, dynamic_seed: bool = True) -> None: np.random.seed(self._seed) def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: - if isinstance(action, np.ndarray) and action.shape == (1, ): - action = action.squeeze() # 0-dim array - real_action = self.real_action_space[action] + if self.action_mode == 'combine': + if isinstance(action, np.ndarray) and action.shape == (1, ): + action = action.squeeze() + real_action = self.real_action_space[action] + elif self.action_mode == 'separate': + assert isinstance(action, np.ndarray) and action.shape == (self._robot_num, ), "illegal action!" + real_action = tuple([self._cfg.one_uav_action_space[action[i]] for i in range(self._robot_num)]) assert isinstance(real_action, tuple) and len(real_action) == self._robot_num, "illegal action!" 
raw_obs, rew, done, info = self._env.step(real_action) obs_list = list(raw_obs.to_array()) - obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + if self.obs_mode == 'dict': + obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + elif self.obs_mode == '2-dim-array': + # robot_state: (robot_num, 4), human_state: (human_num, 4) + obs = np.concatenate((obs_list[0], obs_list[1]), axis=0) + elif self.obs_mode == '1-dim-array': + obs = np.concatenate((obs_list[0], obs_list[1]), axis=0).flatten() self._eval_episode_return += rew if done: diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py index 47f483717..b5721d0c3 100644 --- a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -8,13 +8,15 @@ dataset = 'purdue', robot_num = 2, human_num = 59, # purdue - one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] + one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]], + obs_mode = '2-dim-array', ) @ pytest.mark.envtest class TestCrowdSimEnv: - def test_naive(self): + def test_obs_dict(self): + mcfg['obs_mode'] = 'dict' env = CrowdSimEnv(mcfg) env.seed(314) assert env._seed == 314 @@ -33,3 +35,37 @@ def test_naive(self): assert timestep.reward.shape == (1, ) print(env.observation_space, env.action_space, env.reward_space) env.close() + + def test_obs_2_dim_array(self): + mcfg['obs_mode'] = '2-dim-array' + env = CrowdSimEnv(mcfg) + env.seed(314) + assert env._seed == 314 + obs = env.reset() + assert obs['observation'].shape == (61, 4) + for i in range(10): + random_action = env.random_action() + timestep = env.step(random_action) + print(timestep) + assert timestep.obs['observation'].shape == (61, 4) + assert isinstance(timestep.done, bool) + assert timestep.reward.shape == (1, ) + print(env.observation_space, env.action_space, env.reward_space) + env.close() + + def test_obs_1_dim_array(self): + mcfg['obs_mode'] = '1-dim-array' + env = CrowdSimEnv(mcfg) + env.seed(314) + assert env._seed == 314 + obs = env.reset() + assert obs['observation'].shape == (244, ) + for i in range(10): + random_action = env.random_action() + timestep = env.step(random_action) + print(timestep) + assert timestep.obs['observation'].shape == (244, ) + assert isinstance(timestep.done, bool) + assert timestep.reward.shape == (1, ) + print(env.observation_space, env.action_space, env.reward_space) + env.close() \ No newline at end of file From c99db40f0b134110b46434d6cd304bd95828202b Mon Sep 17 00:00:00 2001 From: nighood Date: Fri, 3 May 2024 14:57:46 +0800 Subject: [PATCH 06/16] feature(rjy): add multi-head policy(combine logits) --- lzero/agent/muzero.py | 3 + lzero/mcts/utils.py | 4 +- lzero/model/muzero_model_md.py | 558 ++++++++++++++++++ lzero/policy/muzero.py | 4 +- lzero/worker/muzero_evaluator.py | 30 +- .../config/crowdsim_muzero_md_config.py | 117 ++++ 6 files changed, 698 insertions(+), 18 deletions(-) create mode 100644 lzero/model/muzero_model_md.py create mode 100644 zoo/CrowdSim/config/crowdsim_muzero_md_config.py diff --git a/lzero/agent/muzero.py b/lzero/agent/muzero.py index b087377b5..26ffc6f04 100644 --- a/lzero/agent/muzero.py +++ b/lzero/agent/muzero.py @@ -113,6 +113,9 @@ def __init__( elif self.cfg.policy.model.model_type == 'rgcn': from lzero.model.muzero_model_gcn import MuZeroModelGCN model = MuZeroModelGCN(**self.cfg.policy.model) + elif self.cfg.policy.model.model_type == 'mlp_md': + from lzero.model.muzero_model_md import 
MuZeroModelMD + model = MuZeroModelMD(**self.cfg.policy.model) else: raise NotImplementedError if self.cfg.policy.cuda and torch.cuda.is_available(): diff --git a/lzero/mcts/utils.py b/lzero/mcts/utils.py index 1861cb2a9..80e3f588e 100644 --- a/lzero/mcts/utils.py +++ b/lzero/mcts/utils.py @@ -97,7 +97,7 @@ def prepare_observation(observation_list, model_type='conv'): Returns: - np.ndarray: Reshaped array of observations. """ - assert model_type in ['conv', 'mlp', 'rgcn'], "model_type must be either 'conv' or 'mlp'" + assert model_type in ['conv', 'mlp', 'rgcn', 'mlp_md'], "model_type must be either 'conv', 'mlp', 'rgcn' or 'mlp_md'" observation_array = np.array(observation_list) batch_size = observation_array.shape[0] @@ -110,7 +110,7 @@ def prepare_observation(observation_list, model_type='conv'): _, stack_num, channels, width, height = observation_array.shape observation_array = observation_array.reshape(batch_size, stack_num * channels, width, height) - elif model_type == 'mlp': + elif model_type == 'mlp' or model_type == 'mlp_md': if observation_array.ndim == 3: # Flatten the last two dimensions observation_array = observation_array.reshape(batch_size, -1) diff --git a/lzero/model/muzero_model_md.py b/lzero/model/muzero_model_md.py new file mode 100644 index 000000000..187509ac8 --- /dev/null +++ b/lzero/model/muzero_model_md.py @@ -0,0 +1,558 @@ +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType +from ding.model.common.head import MultiHead, DiscreteHead + +from .common import MZNetworkOutput, RepresentationNetworkMLP +from .utils import renormalize, get_params_mean, get_dynamic_mean, get_reward_mean + + +@MODEL_REGISTRY.register('MuZeroModelMD') +class MuZeroModelMD(nn.Module): + + def __init__( + self, + agent_num: int, + output_separate_logit: bool = False, + observation_shape: int = 2, + single_agent_action_size: int = 5, + action_space_size: int = 6, + latent_state_dim: int = 256, + fc_reward_layers: SequenceType = [32], + fc_value_layers: SequenceType = [32], + reward_support_size: int = 601, + value_support_size: int = 601, + proj_hid: int = 1024, + proj_out: int = 1024, + pred_hid: int = 512, + pred_out: int = 1024, + self_supervised_learning_loss: bool = False, + categorical_distribution: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + state_norm: bool = False, + discrete_action_encoding_type: str = 'one_hot', + norm_type: Optional[str] = 'BN', + res_connection_in_dynamics: bool = False, + *args, + **kwargs + ): + """ + Overview: + The definition of the network model of MuZero, which is a generalization version for 1D vector obs. + The networks are mainly built on fully connected layers. + The representation network is an MLP network which maps the raw observation to a latent state. + The dynamics network is an MLP network which predicts the next latent state, and reward given the current latent state and action. + The prediction network is an network with agent_num multihead which predicts the value and policy given the current latent state. + Arguments: + - agent_num (:obj:`int`): The number of agents in the environment. + - output_separate_logit (:obj:`bool`): Whether to output separate logit for each action. + - observation_shape (:obj:`int`): Observation space shape, e.g. 8 for Lunarlander. + - action_space_size: (:obj:`int`): Combinational action space size. 
+ - single_agent_action_size: (:obj:`int`): The size of action space for single agent. + - latent_state_dim (:obj:`int`): The dimension of latent state, such as 256. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head). + - reward_support_size (:obj:`int`): The size of categorical reward output + - value_support_size (:obj:`int`): The size of categorical value output. + - proj_hid (:obj:`int`): The size of projection hidden layer. + - proj_out (:obj:`int`): The size of projection output layer. + - pred_hid (:obj:`int`): The size of prediction hidden layer. + - pred_out (:obj:`int`): The size of prediction output layer. + - self_supervised_learning_loss (:obj:`bool`): Whether to use self_supervised_learning related networks in MuZero model, default set it to False. + - categorical_distribution (:obj:`bool`): Whether to use discrete support to represent categorical distribution for value, reward/value_prefix. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - state_norm (:obj:`bool`): Whether to use normalization for latent states, default sets it to True. + - discrete_action_encoding_type (:obj:`str`): The encoding type of discrete action, which can be 'one_hot' or 'not_one_hot'. + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection for dynamics network, default set it to False. + """ + super(MuZeroModelMD, self).__init__() + self.categorical_distribution = categorical_distribution + if not self.categorical_distribution: + self.reward_support_size = 1 + self.value_support_size = 1 + else: + self.reward_support_size = reward_support_size + self.value_support_size = value_support_size + + self.action_space_size = action_space_size + self.continuous_action_space = False + # The dim of action space. For discrete action space, it is 1. + # For continuous action space, it is the dimension of continuous action. 
+ self.action_space_dim = action_space_size if self.continuous_action_space else 1 + assert discrete_action_encoding_type in ['one_hot', 'not_one_hot'], discrete_action_encoding_type + self.discrete_action_encoding_type = discrete_action_encoding_type + if self.continuous_action_space: + self.action_encoding_dim = action_space_size + else: + if self.discrete_action_encoding_type == 'one_hot': + self.action_encoding_dim = action_space_size + elif self.discrete_action_encoding_type == 'not_one_hot': + self.action_encoding_dim = 1 + + self.latent_state_dim = latent_state_dim + self.proj_hid = proj_hid + self.proj_out = proj_out + self.pred_hid = pred_hid + self.pred_out = pred_out + self.self_supervised_learning_loss = self_supervised_learning_loss + self.last_linear_layer_init_zero = last_linear_layer_init_zero + self.state_norm = state_norm + self.res_connection_in_dynamics = res_connection_in_dynamics + + self.representation_network = RepresentationNetworkMLP( + observation_shape=observation_shape, hidden_channels=self.latent_state_dim, norm_type=norm_type + ) + + self.dynamics_network = DynamicsNetwork( + action_encoding_dim=self.action_encoding_dim, + num_channels=self.latent_state_dim + self.action_encoding_dim, + common_layer_num=2, + fc_reward_layers=fc_reward_layers, + output_support_size=self.reward_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + res_connection_in_dynamics=self.res_connection_in_dynamics, + ) + + self.prediction_network = PredictionNetworkMD( + agent_num=agent_num, + single_agent_action_size=single_agent_action_size, + num_channels=latent_state_dim, + fc_value_layers=fc_value_layers, + output_support_size=self.value_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + output_separate_logit=output_separate_logit, + ) + + if self.self_supervised_learning_loss: + # self_supervised_learning_loss related network proposed in EfficientZero + self.projection_input_dim = latent_state_dim + + self.projection = nn.Sequential( + nn.Linear(self.projection_input_dim, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_out), nn.BatchNorm1d(self.proj_out) + ) + self.prediction_head = nn.Sequential( + nn.Linear(self.proj_out, self.pred_hid), + nn.BatchNorm1d(self.pred_hid), + activation, + nn.Linear(self.pred_hid, self.pred_out), + ) + + def initial_inference(self, obs: torch.Tensor) -> MZNetworkOutput: + """ + Overview: + Initial inference of MuZero model, which is the first step of the MuZero model. + To perform the initial inference, we first use the representation network to obtain the "latent_state" of the observation. + Then we use the prediction network to predict the "value" and "policy_logits" of the "latent_state", and + also prepare the zeros-like ``reward`` for the next step of the MuZero model. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns (MZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. \ + In initial inference, we set it to zero vector. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. 
+ - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The hidden state of LSTM about reward. In initial inference, \ + we set it to the zeros-like hidden state (H and C). + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + batch_size = obs.size(0) + latent_state = self._representation(obs) + policy_logits, value = self._prediction(latent_state) + return MZNetworkOutput( + value, + [0. for _ in range(batch_size)], + policy_logits, + latent_state, + ) + + def recurrent_inference(self, latent_state: torch.Tensor, action: torch.Tensor) -> MZNetworkOutput: + """ + Overview: + Recurrent inference of MuZero model, which is the rollout step of the MuZero model. + To perform the recurrent inference, we first use the dynamics network to predict ``next_latent_state``, + ``reward`` by the given current ``latent_state`` and ``action``. + We then use the prediction network to predict the ``value`` and ``policy_logits``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input obs. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns (MZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - next_latent_state (:obj:`torch.Tensor`): The predicted next latent state. + Shapes: + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + next_latent_state, reward = self._dynamics(latent_state, action) + policy_logits, value = self._prediction(next_latent_state) + return MZNetworkOutput(value, reward, policy_logits, next_latent_state) + + def _representation(self, observation: torch.Tensor) -> Tuple[torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. 
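+        Examples:
+            >>> # Illustrative only: 244 = (robot_num + human_num) * 4 under the CrowdSim '1-dim-array' obs_mode,
+            >>> # and the output shape assumes the default latent_state_dim=256.
+            >>> obs = torch.randn(8, 244)
+            >>> latent_state = self._representation(obs)
+            >>> latent_state.shape  # torch.Size([8, 256])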
+ """ + observation = observation.float() + latent_state = self.representation_network(observation) + if self.state_norm: + latent_state = renormalize(latent_state) + return latent_state + + def _prediction(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + """ + policy_logits, value = self.prediction_network(latent_state) + return policy_logits, value + + def _dynamics(self, latent_state: torch.Tensor, action: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Concatenate ``latent_state`` and ``action`` and use the dynamics network to predict ``next_latent_state`` + ``reward`` and ``next_reward_hidden_state``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns: + - next_latent_state (:obj:`torch.Tensor`): The predicted latent state of the next timestep. + - next_reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + """ + # NOTE: the discrete action encoding type is important for some environments + + # discrete action space + if self.discrete_action_encoding_type == 'one_hot': + # Stack latent_state with the one hot encoded action + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + + # transform action to one-hot encoding. + # action_one_hot shape: (batch_size, action_space_size), e.g., (8, 4) + action_one_hot = torch.zeros(action.shape[0], self.action_space_size, device=action.device) + # transform action to torch.int64 + action = action.long() + action_one_hot.scatter_(1, action, 1) + action_encoding = action_one_hot + elif self.discrete_action_encoding_type == 'not_one_hot': + action_encoding = action / self.action_space_size + if len(action_encoding.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action_encoding = action_encoding.unsqueeze(-1) + + action_encoding = action_encoding.to(latent_state.device).float() + # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or + # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type. 
+ state_action_encoding = torch.cat((latent_state, action_encoding), dim=1) + + next_latent_state, reward = self.dynamics_network(state_action_encoding) + + if not self.state_norm: + return next_latent_state, reward + else: + next_latent_state_normalized = renormalize(next_latent_state) + return next_latent_state_normalized, reward + + def project(self, latent_state: torch.Tensor, with_grad=True) -> torch.Tensor: + """ + Overview: + Project the latent state to a lower dimension to calculate the self-supervised loss, which is proposed in EfficientZero. + For more details, please refer to the paper ``Exploring Simple Siamese Representation Learning``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - with_grad (:obj:`bool`): Whether to calculate gradient for the projection result. + Returns: + - proj (:obj:`torch.Tensor`): The result embedding vector of projection operation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - proj (:obj:`torch.Tensor`): :math:`(B, projection_output_dim)`, where B is batch_size. + + Examples: + >>> latent_state = torch.randn(256, 64) + >>> output = self.project(latent_state) + >>> output.shape # (256, 1024) + """ + proj = self.projection(latent_state) + + if with_grad: + # with grad, use prediction_head + return self.prediction_head(proj) + else: + return proj.detach() + + def get_params_mean(self) -> float: + return get_params_mean(self) + + +class DynamicsNetwork(nn.Module): + + def __init__( + self, + action_encoding_dim: int = 2, + num_channels: int = 64, + common_layer_num: int = 2, + fc_reward_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + res_connection_in_dynamics: bool = False, + ): + """ + Overview: + The definition of dynamics network in MuZero algorithm, which is used to predict next latent state + reward by the given current latent state and action. + The networks are mainly built on fully connected layers. + Arguments: + - action_encoding_dim (:obj:`int`): The dimension of action encoding. + - num_channels (:obj:`int`): The num of channels in latent states. + - common_layer_num (:obj:`int`): The number of common layers in dynamics network. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - output_support_size (:obj:`int`): The size of categorical reward output. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection in dynamics network. 
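+        Examples:
+            >>> # Illustrative sketch with placeholder sizes (latent_state_dim=256, action_encoding_dim=25).
+            >>> net = DynamicsNetwork(action_encoding_dim=25, num_channels=256 + 25,
+            ...     fc_reward_layers=[32], output_support_size=601)
+            >>> state_action_encoding = torch.randn(8, 256 + 25)
+            >>> next_latent_state, reward = net(state_action_encoding)
+            >>> next_latent_state.shape, reward.shape  # (torch.Size([8, 256]), torch.Size([8, 601]))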
+ """ + super().__init__() + self.num_channels = num_channels + self.action_encoding_dim = action_encoding_dim + self.latent_state_dim = self.num_channels - self.action_encoding_dim + + self.res_connection_in_dynamics = res_connection_in_dynamics + if self.res_connection_in_dynamics: + self.fc_dynamics_1 = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + self.fc_dynamics_2 = MLP( + in_channels=self.latent_state_dim, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + else: + self.fc_dynamics = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + self.fc_reward_head = MLP( + in_channels=self.latent_state_dim, + hidden_channels=fc_reward_layers[0], + layer_num=2, + out_channels=output_support_size, + activation=activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + def forward(self, state_action_encoding: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Forward computation of the dynamics network. Predict the next latent state given current latent state and action. + Arguments: + - state_action_encoding (:obj:`torch.Tensor`): The state-action encoding, which is the concatenation of \ + latent state and action encoding, with shape (batch_size, num_channels, height, width). + Returns: + - next_latent_state (:obj:`torch.Tensor`): The next latent state, with shape (batch_size, latent_state_dim). + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + """ + if self.res_connection_in_dynamics: + # take the state encoding (e.g. 
latent_state), + # state_action_encoding[:, -self.action_encoding_dim:] is action encoding + latent_state = state_action_encoding[:, :-self.action_encoding_dim] + x = self.fc_dynamics_1(state_action_encoding) + # the residual link: add the latent_state to the state_action encoding + next_latent_state = x + latent_state + next_latent_state_encoding = self.fc_dynamics_2(next_latent_state) + else: + next_latent_state = self.fc_dynamics(state_action_encoding) + next_latent_state_encoding = next_latent_state + + reward = self.fc_reward_head(next_latent_state_encoding) + + return next_latent_state, reward + + def get_dynamic_mean(self) -> float: + return get_dynamic_mean(self) + + def get_reward_mean(self) -> float: + return get_reward_mean(self) + + + +class PredictionNetworkMD(nn.Module): + + def __init__( + self, + agent_num: int, + single_agent_action_size, + num_channels, + common_layer_num: int = 2, + fc_value_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + output_separate_logit: bool = False, + ): + """ + Overview: + The definition of policy and value prediction network with Multi-Layer Perceptron (MLP), + which is used to predict value and policy by the given latent state. Policy network is a multihead network, + which predicts the policy for each agent. + Arguments: + - agent_num (:obj:`int`): The number of agents in the environment. + - single_agent_action_size: (:obj:`int`): Action space size for single agent. + - num_channels (:obj:`int`): The channels of latent states. + - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head). + - output_support_size (:obj:`int`): The size of categorical value output. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of \ + dynamics/prediction mlp, default sets it to True. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + """ + super().__init__() + self.num_channels = num_channels + + # ******* common backbone ****** + self.fc_prediction_common = MLP( + in_channels=self.num_channels, + hidden_channels=self.num_channels, + out_channels=self.num_channels, + layer_num=common_layer_num, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + # ******* value and policy head ****** + self.fc_value_head = MLP( + in_channels=self.num_channels, + hidden_channels=fc_value_layers[0], + out_channels=output_support_size, + layer_num=len(fc_value_layers) + 1, + activation=activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + # last_linear_layer_init_zero=True is beneficial for convergence speed. + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + self.policy_multi_head = MultiHead( + head_cls=DiscreteHead, + hidden_size=self.num_channels, + output_size_list=[single_agent_action_size for _ in range(agent_num)], + ) + self.output_separate_logit = output_separate_logit + + def forward(self, latent_state: torch.Tensor): + """ + Overview: + Forward computation of the prediction network. 
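+            Each agent has its own discrete policy head. When ``output_separate_logit`` is False, the per-agent
+            logits are combined sample by sample (Cartesian product followed by a product over agents) into
+            joint-action logits over the product action space of size ``single_agent_action_size ** agent_num``.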
+ Arguments: + - latent_state (:obj:`torch.Tensor`): input tensor with shape (B, latent_state_dim). + Returns: + - policy (:obj:`torch.Tensor`): policy tensor with shape (B, action_space_size). + - value (:obj:`torch.Tensor`): value tensor with shape (B, output_support_size). + """ + latent_state = latent_state.to(torch.float32) + x_prediction_common = self.fc_prediction_common(latent_state) + + value = self.fc_value_head(x_prediction_common) + # policy_list: {'logit': [policy1, policy2, ...],} + # policyi shape: (B, action_space_size) + policy_list = self.policy_multi_head(x_prediction_common)['logit'] + if not self.output_separate_logit: + # The joint action space policy is the product of each agent policy + # policy shape: (B, action_space_size^^agent_num) + batch_size = latent_state.size(0) + joint_logits_batches = [] + for i in range(batch_size): + current_batch = [policy[i] for policy in policy_list] + cartesian_prod_result = torch.cartesian_prod(*current_batch) + joint_logits = cartesian_prod_result.prod(dim=1) + joint_logits_batches.append(joint_logits) + policy = torch.stack(joint_logits_batches) + else: + # policy_list: [policy1, policy2, ...] + # policy sahpe: (B, agent_num, action_space_size) + policy = torch.stack(policy_list, dim=1) + return policy, value diff --git a/lzero/policy/muzero.py b/lzero/policy/muzero.py index 6e12c3c3c..d4e49c30f 100644 --- a/lzero/policy/muzero.py +++ b/lzero/policy/muzero.py @@ -224,6 +224,8 @@ def default_model(self) -> Tuple[str, List[str]]: return 'MuZeroModelMLP', ['lzero.model.muzero_model_mlp'] elif self._cfg.model.model_type == "rgcn": return 'MuZeroModelGCN', ['lzero.model.muzero_model_gcn'] + elif self._cfg.model.model_type == "mlp_md": + return 'MuZeroModelMD', ['lzero.model.muzero_model_md'] else: raise ValueError("model type {} is not supported".format(self._cfg.model.model_type)) @@ -643,7 +645,7 @@ def _get_target_obs_index_in_step_k(self, step): if self._cfg.model.model_type == 'conv': beg_index = self._cfg.model.image_channel * step end_index = self._cfg.model.image_channel * (step + self._cfg.model.frame_stack_num) - elif self._cfg.model.model_type == 'mlp': + elif self._cfg.model.model_type == 'mlp' or self._cfg.model.model_type == 'mlp_md': beg_index = self._cfg.model.observation_shape * step end_index = self._cfg.model.observation_shape * (step + self._cfg.model.frame_stack_num) elif self._cfg.model.model_type == 'rgcn': diff --git a/lzero/worker/muzero_evaluator.py b/lzero/worker/muzero_evaluator.py index 6d871fc33..a718d33b7 100644 --- a/lzero/worker/muzero_evaluator.py +++ b/lzero/worker/muzero_evaluator.py @@ -337,22 +337,22 @@ def eval( action_mask_dict[env_id] = to_ndarray(obs['action_mask']) to_play_dict[env_id] = to_ndarray(obs['to_play']) - dones[env_id] = done - if t.done: - # Env reset is done by env_manager automatically. - self._policy.reset([env_id]) - reward = t.info['eval_episode_return'] - # 'performance_info' and 'episode_info' only choose one - if 'performance_info' in t.info: - eval_monitor.update_info(env_id, t.info['performance_info']) - elif 'episode_info' in t.info: - eval_monitor.update_info(env_id, t.info['episode_info']) - eval_monitor.update_reward(env_id, reward) - self._logger.info( - "[EVALUATOR]env {} finish episode, final reward: {}, current episode: {}".format( - env_id, eval_monitor.get_latest_reward(env_id), eval_monitor.get_current_episode() + dones[env_id] = done + if t.done: + # Env reset is done by env_manager automatically. 
+ self._policy.reset([env_id]) + reward = t.info['eval_episode_return'] + # 'performance_info' and 'episode_info' only choose one + if 'performance_info' in t.info: + eval_monitor.update_info(env_id, t.info['performance_info']) + elif 'episode_info' in t.info: + eval_monitor.update_info(env_id, t.info['episode_info']) + eval_monitor.update_reward(env_id, reward) + self._logger.info( + "[EVALUATOR]env {} finish episode, final reward: {}, current episode: {}".format( + env_id, eval_monitor.get_latest_reward(env_id), eval_monitor.get_current_episode() + ) ) - ) # reset the finished env and init game_segments if n_episode > self._env_num: diff --git a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py new file mode 100644 index 000000000..ec4453ff8 --- /dev/null +++ b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py @@ -0,0 +1,117 @@ +from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '2' +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(3e5) +reanalyze_ratio = 0. +robot_num = 2 +human_num = 59 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_muzero_config = dict( + exp_name= + f'result/CrowdSim_muzeromd_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + env=dict( + obs_mode='1-dim-array', + env_name='CrowdSim-v0', + dataset = 'purdue', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + # robot_observation_shape=(robot_num, 4), + # human_observation_shape=(human_num, 4), + agent_num = robot_num, + observation_shape=(robot_num + human_num)*4, + obs_mode='1-dim-array', + robot_state_dim = 4, + human_state_dim = 4, + robot_num = robot_num, + human_num = human_num, + single_agent_action_size=len(one_uav_action_space), + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. + lstm_hidden_size=256, + latent_state_dim=256, + self_supervised_learning_loss=True, # NOTE: default is False. + discrete_action_encoding_type='one_hot', + res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + ssl_loss_weight=2, # NOTE: default is 0. + grad_clip_value=0.5, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. 
+ collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_muzero_config = EasyDict(CrowdSim_muzero_config) +main_config = CrowdSim_muzero_config + +CrowdSim_muzero_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.crowdsim_lightzero_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='muzero', + import_names=['lzero.policy.muzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_muzero_create_config = EasyDict(CrowdSim_muzero_create_config) +create_config = CrowdSim_muzero_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. + """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) From 61831f1c3c5b374f5d418666497272ababc1e43a Mon Sep 17 00:00:00 2001 From: nighood Date: Sat, 4 May 2024 01:56:31 +0800 Subject: [PATCH 07/16] feature(rjy): modify new env with transmitted data --- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 59 +++++++++++-------- zoo/CrowdSim/envs/Crowdsim/env/model/agent.py | 53 +++++++++++++++-- zoo/CrowdSim/envs/Crowdsim/env/model/utils.py | 2 +- 3 files changed, 83 insertions(+), 31 deletions(-) diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index fe058d8fd..f05567891 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -37,12 +37,13 @@ def __init__(self, dataset, custom_config=None): self.max_uav_energy = self.config.max_uav_energy # self.action_space = gym.spaces.Discrete(4**self.robot_num) # for each robot have 4 actions(up, down, left, right), then product self.action_space = gym.spaces.Discrete(len(self.config.one_uav_action_space)) - # human obs: [px, py, theta, aoi] + # human obs: [px, py, remaining_data_amount, aoi] # robot obs: [px, py, theta, energy] # self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(4), dtype=np.float32) self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(self.robot_num+self.human_num, 4), dtype=np.float32) # load_dataset + self.transmit_v = 5 # 5*0.3Mb/s self.nlon = self.config.nlon self.nlat = self.config.nlat self.lower_left = self.config.lower_left @@ -64,18 +65,20 @@ def __init__(self, dataset, custom_config=None): self.human_df['t'] = pd.to_datetime(self.human_df['timestamp'], unit='s') # 's' stands for second self.human_df['aoi'] = -1 # 加入aoi记录aoi + self.human_df['data_amount'] = -1 # record the remaining data amount of each human self.human_df['energy'] = -1 # 加入energy记录energy logging.info('human number: {}'.format(self.human_num)) logging.info('Robot number: {}'.format(self.robot_num)) # for debug - self.current_human_aoi_list = np.ones([self.human_num, ]) + self.current_human_aoi_list = np.zeros([self.human_num, ]) self.mean_aoi_timelist = np.ones([self.config.num_timestep + 1, ]) + 
self.cur_data_amount_timelist = np.zeros([self.human_num, ]) self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_x_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.update_human_timelist = np.zeros([self.config.num_timestep, ]) - self.data_collection = 0 + self.data_transmission = 0 def set_agent(self, agent): self.agent = agent @@ -83,8 +86,8 @@ def set_agent(self, agent): def generate_human(self, human_id, selected_data, selected_next_data): human = Human(human_id, self.config) px, py, theta = get_human_position_from_list(self.current_timestep, human_id, selected_data, selected_next_data, self.config) - # human obs: [px, py, theta, aoi] - human.set(px, py, theta, 1) # initial aoi of human is 1 + # human obs: [px, py, data_amount, aoi] + human.set(px, py, theta, 0, 0) # initial aoi of human is 0 return human def generate_robot(self, robot_id): @@ -93,7 +96,7 @@ def generate_robot(self, robot_id): robot.set(self.nlon / 2, self.nlat / 2, 0, self.max_uav_energy) # robot有energy return robot - def sync_human_df(self, human_id, current_timestep, aoi): + def sync_human_df(self, human_id, current_timestep, aoi, data_amount): """ Overview: Sync the human_df with the current timestep and aoi. @@ -107,6 +110,7 @@ def sync_human_df(self, human_id, current_timestep, aoi): (self.human_df.id == human_id) & (self.human_df.timestamp == current_timestamp)].index # self.human_df.loc[current_index, "aoi"] = aoi # slower self.human_df.iat[current_index.values[0], 9] = aoi # faster + # self.human_df.iat[current_index.values[0], 10] = data_amount def reset(self, phase='test', test_case=None): self.current_timestep = 0 @@ -114,17 +118,21 @@ def reset(self, phase='test', test_case=None): # generate human self.humans = [] selected_data, selected_next_data = get_human_position_list(self.current_timestep, self.human_df, self.config) + self.generate_data_amount_per_step = 0 + self.total_generated_data_amount = 0 for human_id in range(self.human_num): self.humans.append(self.generate_human(human_id, selected_data, selected_next_data)) - self.sync_human_df(human_id, self.current_timestep, 1) + self.generate_data_amount_per_step += self.humans[human_id].collect_v + self.sync_human_df(human_id, self.current_timestep, aoi=0, data_amount=0) # generate robot self.robots = [] for robot_id in range(self.robot_num): self.robots.append(self.generate_robot(robot_id)) - self.current_human_aoi_list = np.ones([self.human_num, ]) - self.mean_aoi_timelist = np.ones([self.config.num_timestep + 1, ]) + self.cur_data_amount_timelist = np.zeros([self.human_num, ]) + self.current_human_aoi_list = np.zeros([self.human_num, ]) + self.mean_aoi_timelist = np.zeros([self.config.num_timestep + 1, ]) self.mean_aoi_timelist[self.current_timestep] = np.mean(self.current_human_aoi_list) self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_energy_timelist[self.current_timestep, :] = self.max_uav_energy @@ -133,7 +141,7 @@ def reset(self, phase='test', test_case=None): self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_y_timelist[self.current_timestep, :] = self.nlat / 2 self.update_human_timelist = np.zeros([self.config.num_timestep, ]) - self.data_collection = 0 + self.data_transmission = 0 # for visualization self.plot_states = [] @@ -182,33 +190,31 @@ def step(self, action): robot.set(new_robot_px, new_robot_py, 
robot_theta, energy=new_energy) selected_data, selected_next_data = get_human_position_list(self.current_timestep + 1, self.human_df, self.config) - delta_human_aoi_list = np.zeros_like(self.current_human_aoi_list) # 0 means no update + human_transmit_data_list = np.zeros_like(self.cur_data_amount_timelist) # 0 means no update for human_id, human in enumerate(self.humans): next_px, next_py, next_theta = get_human_position_from_list(self.current_timestep + 1, human_id, selected_data, selected_next_data, self.config) should_reset = judge_aoi_update([next_px, next_py], new_robot_position, self.config) if should_reset: - # if the human is in the range of the robot, then reset the aoi of the human - if human.aoi > 1: - delta_human_aoi_list[human_id] = human.aoi - else: - delta_human_aoi_list[human_id] = 1 - - human.set(next_px, next_py, next_theta, aoi=1) + # if the human is in the range of the robot, then part of human's data will be transmitted + last_data_amount = human.data_amount + human.update(next_px, next_py, next_theta, transmitted_data=self.transmit_v) + human_transmit_data_list[human_id] = min(last_data_amount, self.transmit_v) num_updated_human += 1 else: - # if the human is not in the range of the robot, then update the aoi of the human - delta_human_aoi_list[human_id] = 0 - new_aoi = human.aoi + 1 - human.set(next_px, next_py, next_theta, aoi=new_aoi) + # if the human is not in the range of the robot, then no data will be transmitted, \ + # and update aoi and caculate new collected data amount + human_transmit_data_list[human_id] = 0 + human.update(next_px, next_py, next_theta, transmitted_data=0) self.current_human_aoi_list[human_id] = human.aoi - self.sync_human_df(human_id, self.current_timestep + 1, human.aoi) + self.sync_human_df(human_id, self.current_timestep + 1, human.aoi, human.data_amount) self.mean_aoi_timelist[self.current_timestep + 1] = np.mean(self.current_human_aoi_list) self.update_human_timelist[self.current_timestep] = num_updated_human - delta_sum_aoi = np.sum(delta_human_aoi_list) - self.data_collection += (delta_sum_aoi * 0.3) # Mb, 0.02M/s per person + delta_sum_transmit_data = np.sum(human_transmit_data_list) + self.data_transmission += (delta_sum_transmit_data * 0.3) # Mb, 0.02M/s per person + self.total_generated_data_amount += self.generate_data_amount_per_step # TODO: need to be well-defined reward = self.mean_aoi_timelist[self.current_timestep] - self.mean_aoi_timelist[self.current_timestep + 1] \ @@ -233,9 +239,10 @@ def step(self, action): info = { "performance_info": { "mean_aoi": self.mean_aoi_timelist[self.current_timestep], + "mean_transmit_data": delta_sum_transmit_data / self.human_num, "mean_energy_consumption": 1.0 - ( np.mean(self.robot_energy_timelist[self.current_timestep]) / self.max_uav_energy), - "collected_data_amount": self.data_collection/(self.num_timestep*self.human_num*0.3), + "transmitted_data_ratio": self.data_transmission/(self.total_generated_data_amount*0.3), "human_coverage": np.mean(self.update_human_timelist) / self.human_num }, } diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py index 5201c5b61..aabc184d2 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py @@ -1,4 +1,5 @@ import abc +import random import logging from zoo.CrowdSim.envs.Crowdsim.env.model.mdp import * @@ -25,25 +26,39 @@ def act(self, state, current_timestep): class Human(): + collect_v_prob = {1: 0.3, 2: 0.6} def __init__(self, id, config): 
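+        # ``collect_v`` is drawn once per human via ``random.choices`` with the relative weights in
+        # ``collect_v_prob`` (1 or 2 new data blocks generated per environment step).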
self.id = id self.config = config self.px = None self.py = None self.theta = None - self.aoi = None + self.aoi = 0 + self.data_queue = InformationQueue() + self.data_amount = 0 + self.collect_v = random.choices(list(self.collect_v_prob.keys()), list(self.collect_v_prob.values()))[0] - def set(self, px, py, theta, aoi): - self.px = px # position + def set(self, px, py, theta, aoi, data_amount): + self.px = px self.py = py self.theta = theta self.aoi = aoi + self.data_amount = data_amount + + def update(self, px, py, theta, transmitted_data): + self.px = px # position + self.py = py + self.theta = theta + self.data_queue.update(self.collect_v, transmitted_data) + self.aoi = self.data_queue.total_aoi() + self.data_amount = self.data_queue.total_blocks() # TODO: change state,可能需要归一化 def get_obs(self): + # obs: (px, py, remaining_data, aoi) return HumanState(self.px / self.config.nlon, self.py / self.config.nlat, - self.theta / self.config.rotation_limit, + self.data_amount / self.config.num_timestep, self.aoi / self.config.num_timestep) @@ -70,3 +85,33 @@ def get_obs(self): self.energy / self.config.max_uav_energy) +class InformationQueue: + def __init__(self): + # Initialize the queue to hold the age of each information block + self.queue = [] + + def update(self, arrivals, departures): + # Increase the age of information (aoi) for each block in the queue + self.queue = [age + 1 for age in self.queue] + + # Add new information blocks with aoi of 0 + self.queue.extend([0] * arrivals) + + # Remove the specified number of oldest information blocks + self.queue = self.queue[departures:] if departures <= len(self.queue) else [] + + def total_aoi(self): + # Return the total age of information in the queue + return sum(self.queue) + + def total_blocks(self): + # Return the total number of information blocks in the queue + return len(self.queue) + +# # Example of using the InformationQueue class +# info_queue = InformationQueue() +# info_queue.update(arrivals=5, departures=0) # 5 blocks enter the queue, all with aoi of 0 +# info_queue.update(arrivals=3, departures=2) # 3 new blocks enter, 2 blocks leave +# total_age = info_queue.total_aoi() # Calculate the total age of information in the queue + +# total_age diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py index ce903d9f8..e81ec842f 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py @@ -65,7 +65,7 @@ def get_human_position_from_list(selected_timestep, human_id, selected_data, sel def judge_aoi_update(human_position, robot_position, config): """ Overview: - Judge whether the AoI should be updated + Judge whether the AoI should be updated, i.e., the human is in the sensing range of the robot. Args: - human_position (:obj:`list`): The position of the human. - robot_position (:obj:`list`): The position of the robot. 
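The `InformationQueue` added above is the core of the new per-human bookkeeping: every environment step a human pushes `collect_v` fresh blocks (age 0) into its queue, every queued block ages by one, and a UAV in sensing range drains up to `transmit_v` of the oldest blocks; `human.aoi` is then the total age of the blocks still queued and `human.data_amount` is the queue length. Below is a minimal sketch of this behaviour using only the queue interface defined in this patch (the rates are illustrative values, not the configured CrowdSim ones):

    from zoo.CrowdSim.envs.Crowdsim.env.model.agent import InformationQueue

    q = InformationQueue()
    collect_v = 2    # illustrative per-step generation rate (the env samples 1 or 2 per human)
    transmit_v = 5   # illustrative per-step transmission budget when a UAV is in range

    # Three steps with no UAV in range: data piles up and every queued block ages.
    for _ in range(3):
        q.update(arrivals=collect_v, departures=0)
    print(q.total_blocks(), q.total_aoi())   # -> 6 6

    # One step with a UAV in range: the oldest blocks are transmitted first.
    q.update(arrivals=collect_v, departures=transmit_v)
    print(q.total_blocks(), q.total_aoi())   # -> 3 1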
From 9599faa62df9f98a92e67b36b85b54ac251de7a2 Mon Sep 17 00:00:00 2001
From: nighood
Date: Mon, 6 May 2024 01:00:05 +0800
Subject: [PATCH 08/16] feature(rjy): add rough vis of crowdsim

---
 zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 61 ++++++++++++++++---
 zoo/CrowdSim/envs/crowdsim_lightzero_env.py |  7 +++
 .../envs/test_crowdsim_lightzero_env.py     | 14 ++++-
 3 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py
index f05567891..b84a92ba9 100644
--- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py
+++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py
@@ -43,7 +43,7 @@ def __init__(self, dataset, custom_config=None):
         self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(self.robot_num+self.human_num, 4), dtype=np.float32)
 
         # load_dataset
-        self.transmit_v = 5  # 5*0.3Mb/s
+        self.transmit_v = 20  # 20*0.3Mb/s
         self.nlon = self.config.nlon
         self.nlat = self.config.nlat
         self.lower_left = self.config.lower_left
@@ -110,7 +110,7 @@ def sync_human_df(self, human_id, current_timestep, aoi, data_amount):
             (self.human_df.id == human_id) & (self.human_df.timestamp == current_timestamp)].index
         # self.human_df.loc[current_index, "aoi"] = aoi  # slower
         self.human_df.iat[current_index.values[0], 9] = aoi  # faster
-        # self.human_df.iat[current_index.values[0], 10] = data_amount
+        self.human_df.iat[current_index.values[0], 10] = data_amount
 
     def reset(self, phase='test', test_case=None):
         self.current_timestep = 0
@@ -249,11 +249,52 @@ def step(self, action):
 
         return next_state, reward, done, info
 
-    def render(self, mode='traj', output_file=None, plot_loop=False, moving_line=False):
-        # -------------------------------------------------------------------
-        if mode == 'html':
-            pass
-        elif mode == 'traj':
-            pass
-        else:
-            raise NotImplementedError
\ No newline at end of file
+    def render(self):
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as patches
+        import io
+        import imageio
+
+        map_max_x = self.config.nlon
+        map_max_y = self.config.nlat
+        # create a new figure
+        fig, ax = plt.subplots(figsize=(8, 6))
+
+        # draw the historical trajectories of the robots
+        for timestep in range(len(self.robot_x_timelist)):
+            for robot_id in range(len(self.robot_x_timelist[timestep])):
+                ax.plot(self.robot_x_timelist[timestep][robot_id], self.robot_y_timelist[timestep][robot_id], color='gray', alpha=0.5)
+
+        # draw the current robot positions
+        for robot in self.robots:
+            ax.plot(robot.px, robot.py, marker='o', markersize=5, color='blue')
+
+        # draw the sensing range of each robot
+        for robot in self.robots:
+            robot_x, robot_y = robot.px, robot.py
+            circle = patches.Circle((robot_x, robot_y), self.config.sensing_range, edgecolor='blue', facecolor='none')
+            ax.add_patch(circle)
+
+        # draw the human positions and annotate their current AoI
+        for human in self.humans:
+            human_x, human_y, aoi = human.px, human.py, human.aoi
+            ax.plot(human_x, human_y, marker='x', markersize=5, color='red')
+            ax.text(human_x, human_y, str(aoi), fontsize=8, color='black')
+
+        # set the axis labels
+        # ax.set_xlim(0, map_max_x)
+        # ax.set_ylim(0, map_max_y)
+        ax.set_xlabel('X')
+        ax.set_ylabel('Y')
+
+        # leave some blank margin around the map
+        ax.margins(x=0.1, y=0.1)
+        ax.set_title('Crowd Simulation Visualization')
+
+        # draw the figure and convert it to an RGB array
+        fig.canvas.draw()
+        image = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+        image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+        plt.close()
+
+        return image
\ No newline at end of file
diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py
index e28a1baac..4aca9dd1a 100644
--- 
a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -79,6 +79,8 @@ def reset(self) -> np.ndarray: obs = np.concatenate((obs_list[0], obs_list[1]), axis=0).flatten() action_mask = np.ones(self.action_space.n, 'int8') obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} + if self._replay_path is not None: + self._frame = [] return obs @@ -119,6 +121,11 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: action_mask = np.ones(self.action_space.n, 'int8') obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} rew = to_ndarray([rew]).astype(np.float32) + if self._replay_path is not None: + self._frame.append(self._env.render()) + if done: + import imageio + imageio.mimsave(self._replay_path + '/replay.gif', self._frame) return BaseEnvTimestep(obs, rew, done, info) def enable_save_replay(self, replay_path: Optional[str] = None) -> None: diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py index b5721d0c3..0a5266143 100644 --- a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -57,15 +57,25 @@ def test_obs_1_dim_array(self): mcfg['obs_mode'] = '1-dim-array' env = CrowdSimEnv(mcfg) env.seed(314) + env.enable_save_replay('/home/nighoodRen/LightZero/result/test_replay') assert env._seed == 314 obs = env.reset() assert obs['observation'].shape == (244, ) - for i in range(10): + while True: random_action = env.random_action() timestep = env.step(random_action) print(timestep) assert timestep.obs['observation'].shape == (244, ) assert isinstance(timestep.done, bool) assert timestep.reward.shape == (1, ) + if timestep.done: + break print(env.observation_space, env.action_space, env.reward_space) - env.close() \ No newline at end of file + env.close() + + +if __name__ == '__main__': + test = TestCrowdSimEnv() + # test.test_obs_dict() + # test.test_obs_2_dim_array() + test.test_obs_1_dim_array() \ No newline at end of file From 15d9a44cda18f6cd02697ab8eb970c6676a60e23 Mon Sep 17 00:00:00 2001 From: nighood Date: Mon, 6 May 2024 01:21:12 +0800 Subject: [PATCH 09/16] polish(rjy): fix new env info in collecter --- lzero/worker/muzero_collector.py | 12 ++++++++---- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/lzero/worker/muzero_collector.py b/lzero/worker/muzero_collector.py index ae516960c..3c1a630dd 100644 --- a/lzero/worker/muzero_collector.py +++ b/lzero/worker/muzero_collector.py @@ -573,8 +573,9 @@ def collect(self, reward = timestep.info['eval_episode_return'] if timestep.info.get('performance_info') is not None: mean_aoi = timestep.info['performance_info']['mean_aoi'] + mean_transmit_data = timestep.info['performance_info']['mean_transmit_data'] mean_energy_consumption = timestep.info['performance_info']['mean_energy_consumption'] - collected_data_amount = timestep.info['performance_info']['collected_data_amount'] + transmitted_data_ratio = timestep.info['performance_info']['transmitted_data_ratio'] human_coverage = timestep.info['performance_info']['human_coverage'] info = { 'reward': reward, @@ -582,8 +583,9 @@ def collect(self, 'step': self._env_info[env_id]['step'], 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], 'mean_aoi': mean_aoi, + 'mean_transmit_data': mean_transmit_data, 'mean_energy_consumption': mean_energy_consumption, - 'collected_data_amount': collected_data_amount, + 
'transmitted_data_ratio': transmitted_data_ratio, 'human_coverage': human_coverage, } else: @@ -730,8 +732,9 @@ def _output_log(self, train_iter: int) -> None: if self._episode_info[0].get('mean_aoi') is not None: episode_aoi = [d['mean_aoi'] for d in self._episode_info] episode_energy_consumption = [d['mean_energy_consumption'] for d in self._episode_info] - episode_collected_data_amount = [d['collected_data_amount'] for d in self._episode_info] + episode_transmitted_data_ratio = [d['transmitted_data_ratio'] for d in self._episode_info] episode_human_coverage = [d['human_coverage'] for d in self._episode_info] + mean_transmit_data = [d['mean_transmit_data'] for d in self._episode_info] info = { 'episode_count': episode_count, 'envstep_count': envstep_count, @@ -748,8 +751,9 @@ def _output_log(self, train_iter: int) -> None: 'total_duration': self._total_duration, 'visit_entropy': np.mean(visit_entropy), 'episode_mean_aoi': np.mean(episode_aoi), + 'episode_mean_transmit_data': np.mean(mean_transmit_data), 'episode_mean_energy_consumption': np.mean(episode_energy_consumption), - 'episode_mean_collected_data_amount': np.mean(episode_collected_data_amount), + 'episode_mean_transmitted_data_ratio': np.mean(episode_transmitted_data_ratio), 'episode_mean_human_coverage': np.mean(episode_human_coverage), } else: diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index b84a92ba9..858bbe1a0 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -207,6 +207,7 @@ def step(self, action): human_transmit_data_list[human_id] = 0 human.update(next_px, next_py, next_theta, transmitted_data=0) + self.cur_data_amount_timelist[human_id] = human.data_amount self.current_human_aoi_list[human_id] = human.aoi self.sync_human_df(human_id, self.current_timestep + 1, human.aoi, human.data_amount) From 3c8804d65bb3c6ac8f434ef9f39b7c084281c20b Mon Sep 17 00:00:00 2001 From: nighood Date: Mon, 6 May 2024 21:40:03 +0800 Subject: [PATCH 10/16] feature(rjy): add sez mlp_multi-head --- lzero/agent/sampled_efficientzero.py | 3 + lzero/model/sampled_efficientzero_model_md.py | 547 ++++++++++++++++++ lzero/policy/sampled_efficientzero.py | 2 + zoo/CrowdSim/config/crowdsim_sez_md_config.py | 118 ++++ 4 files changed, 670 insertions(+) create mode 100644 lzero/model/sampled_efficientzero_model_md.py create mode 100644 zoo/CrowdSim/config/crowdsim_sez_md_config.py diff --git a/lzero/agent/sampled_efficientzero.py b/lzero/agent/sampled_efficientzero.py index 079bdd11d..ece54b784 100644 --- a/lzero/agent/sampled_efficientzero.py +++ b/lzero/agent/sampled_efficientzero.py @@ -110,6 +110,9 @@ def __init__( elif self.cfg.policy.model.model_type == 'conv': from lzero.model.sampled_efficientzero_model import SampledEfficientZeroModel model = SampledEfficientZeroModel(**self.cfg.policy.model) + elif self.cfg.policy.model.model_type == 'mlp_md': + from lzero.model.sampled_efficientzero_model_md import SampledEfficientZeroModelMD + model = SampledEfficientZeroModelMD(**self.cfg.policy.model) else: raise NotImplementedError if self.cfg.policy.cuda and torch.cuda.is_available(): diff --git a/lzero/model/sampled_efficientzero_model_md.py b/lzero/model/sampled_efficientzero_model_md.py new file mode 100644 index 000000000..99092de4a --- /dev/null +++ b/lzero/model/sampled_efficientzero_model_md.py @@ -0,0 +1,547 @@ +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from ding.model.common import 
ReparameterizationHead +from ding.model.common.head import MultiHead, DiscreteHead +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType + +from .common import EZNetworkOutput, RepresentationNetworkMLP +from .efficientzero_model_mlp import DynamicsNetworkMLP +from .utils import renormalize, get_params_mean + + +@MODEL_REGISTRY.register('SampledEfficientZeroModelMD') +class SampledEfficientZeroModelMD(nn.Module): + + def __init__( + self, + agent_num: int, + single_agent_action_size: int, + output_separate_logit: bool = False, + observation_shape: int = 2, + action_space_size: int = 6, + latent_state_dim: int = 256, + lstm_hidden_size: int = 512, + fc_reward_layers: SequenceType = [32], + fc_value_layers: SequenceType = [32], + fc_policy_layers: SequenceType = [32], + reward_support_size: int = 601, + value_support_size: int = 601, + proj_hid: int = 1024, + proj_out: int = 1024, + pred_hid: int = 512, + pred_out: int = 1024, + self_supervised_learning_loss: bool = True, + categorical_distribution: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + state_norm: bool = False, + # ============================================================== + # specific sampled related config + # ============================================================== + continuous_action_space: bool = False, + num_of_sampled_actions: int = 6, + sigma_type='conditioned', + fixed_sigma_value: float = 0.3, + bound_type: str = None, + norm_type: str = 'BN', + discrete_action_encoding_type: str = 'one_hot', + res_connection_in_dynamics: bool = False, + *args, + **kwargs, + ): + """ + Overview: + The definition of the network model of Sampled EfficientZero, which is a generalization version for 1D vector obs. + The networks are mainly built on fully connected layers. + Sampled EfficientZero model consists of a representation network, a dynamics network and a prediction network. + The representation network is an MLP network which maps the raw observation to a latent state. + The dynamics network is an MLP+LSTM network which predicts the next latent state, reward_hidden_state and value_prefix given the current latent state and action. + The prediction network is an MLP network which predicts the value and policy given the current latent state. + Arguments: + - observation_shape (:obj:`int`): Observation space shape, e.g. 8 for Lunarlander. + - action_space_size: (:obj:`int`): Action space size, which is an integer number. For discrete action space, it is the num of discrete actions, \ + e.g. 4 for Lunarlander. For continuous action space, it is the dimension of the continuous action, e.g. 4 for bipedalwalker. + - latent_state_dim (:obj:`int`): The dimension of latent state, such as 256. + - lstm_hidden_size (:obj:`int`): The hidden size of LSTM in dynamics network to predict value_prefix. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head). + - fc_policy_layers (:obj:`SequenceType`): The number of hidden layers used in policy head (MLP head). + - reward_support_size (:obj:`int`): The size of categorical reward output + - value_support_size (:obj:`int`): The size of categorical value output. + - proj_hid (:obj:`int`): The size of projection hidden layer. + - proj_out (:obj:`int`): The size of projection output layer. + - pred_hid (:obj:`int`): The size of prediction hidden layer. 
+ - pred_out (:obj:`int`): The size of prediction output layer. + - self_supervised_learning_loss (:obj:`bool`): Whether to use self_supervised_learning related networks in Sampled EfficientZero model, default set it to False. + - categorical_distribution (:obj:`bool`): Whether to use discrete support to represent categorical distribution for value, reward/value_prefix. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - state_norm (:obj:`bool`): Whether to use normalization for latent states, default sets it to True. + # ============================================================== + # specific sampled related config + # ============================================================== + - continuous_action_space (:obj:`bool`): The type of action space. default set it to False. + - num_of_sampled_actions (:obj:`int`): the number of sampled actions, i.e. the K in original Sampled MuZero paper. + # see ``ReparameterizationHead`` in ``ding.model.common.head`` for more details about the following arguments. + - sigma_type (:obj:`str`): the type of sigma in policy head of prediction network, options={'conditioned', 'fixed'}. + - fixed_sigma_value (:obj:`float`): the fixed sigma value in policy head of prediction network, + - bound_type (:obj:`str`): The type of bound in networks. Default sets it to None. + - norm_type (:obj:`str`): The type of normalization in networks. default set it to 'BN'. + - discrete_action_encoding_type (:obj:`str`): The type of encoding for discrete action. Default sets it to 'one_hot'. options = {'one_hot', 'not_one_hot'} + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection for dynamics network, default set it to False. + """ + super(SampledEfficientZeroModelMD, self).__init__() + if not categorical_distribution: + self.reward_support_size = 1 + self.value_support_size = 1 + else: + self.reward_support_size = reward_support_size + self.value_support_size = value_support_size + + self.continuous_action_space = continuous_action_space + self.observation_shape = observation_shape + self.action_space_size = action_space_size + # The dim of action space. For discrete action space, it is 1. + # For continuous action space, it is the dimension of continuous action. 
+ self.action_space_dim = action_space_size if self.continuous_action_space else 1 + assert discrete_action_encoding_type in ['one_hot', 'not_one_hot'], discrete_action_encoding_type + self.discrete_action_encoding_type = discrete_action_encoding_type + if self.continuous_action_space: + self.action_encoding_dim = action_space_size + else: + if self.discrete_action_encoding_type == 'one_hot': + self.action_encoding_dim = action_space_size + elif self.discrete_action_encoding_type == 'not_one_hot': + self.action_encoding_dim = 1 + + self.lstm_hidden_size = lstm_hidden_size + self.latent_state_dim = latent_state_dim + self.fc_reward_layers = fc_reward_layers + self.fc_value_layers = fc_value_layers + self.fc_policy_layers = fc_policy_layers + self.proj_hid = proj_hid + self.proj_out = proj_out + self.pred_hid = pred_hid + self.pred_out = pred_out + + self.last_linear_layer_init_zero = last_linear_layer_init_zero + self.state_norm = state_norm + self.self_supervised_learning_loss = self_supervised_learning_loss + + self.sigma_type = sigma_type + self.fixed_sigma_value = fixed_sigma_value + self.bound_type = bound_type + self.norm_type = norm_type + self.num_of_sampled_actions = num_of_sampled_actions + self.res_connection_in_dynamics = res_connection_in_dynamics + + self.representation_network = RepresentationNetworkMLP( + observation_shape=self.observation_shape, hidden_channels=self.latent_state_dim, norm_type=norm_type + ) + + self.dynamics_network = DynamicsNetworkMLP( + action_encoding_dim=self.action_encoding_dim, + num_channels=self.latent_state_dim + self.action_encoding_dim, + common_layer_num=2, + lstm_hidden_size=self.lstm_hidden_size, + fc_reward_layers=self.fc_reward_layers, + output_support_size=self.reward_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + res_connection_in_dynamics=self.res_connection_in_dynamics, + ) + + self.prediction_network = PredictionNetworkMD( + agent_num=agent_num, + single_agent_action_size=single_agent_action_size, + continuous_action_space=self.continuous_action_space, + action_space_size=self.action_space_size, + num_channels=self.latent_state_dim, + fc_value_layers=self.fc_value_layers, + output_support_size=self.value_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + sigma_type=self.sigma_type, + fixed_sigma_value=self.fixed_sigma_value, + bound_type=self.bound_type, + norm_type=self.norm_type, + output_separate_logit=output_separate_logit, + ) + + if self.self_supervised_learning_loss: + # self_supervised_learning_loss related network proposed in EfficientZero + self.projection_input_dim = latent_state_dim + self.projection = nn.Sequential( + nn.Linear(self.projection_input_dim, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_out), nn.BatchNorm1d(self.proj_out) + ) + self.prediction_head = nn.Sequential( + nn.Linear(self.proj_out, self.pred_hid), + nn.BatchNorm1d(self.pred_hid), + activation, + nn.Linear(self.pred_hid, self.pred_out), + ) + + def initial_inference(self, obs: torch.Tensor) -> EZNetworkOutput: + """ + Overview: + Initial inference of SampledEfficientZero model, which is the first step of the SampledEfficientZero model. + To perform the initial inference, we first use the representation network to obtain the "latent_state" of the observation. 
+ Then we use the prediction network to predict the "value" and "policy_logits" of the "latent_state", and + also prepare the zeros-like ``reward_hidden_state`` for the next step of the Sampled EfficientZero model. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns (EZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. \ + In initial inference, we set it to zero vector. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The hidden state of LSTM about reward. In initial inference, \ + we set it to the zeros-like hidden state (H and C). + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size. + """ + batch_size = obs.size(0) + obs = obs.to(torch.float32) + latent_state = self._representation(obs) + policy_logits, value = self._prediction(latent_state) + # zero initialization for reward hidden states + # (hn, cn), each element shape is (layer_num=1, batch_size, lstm_hidden_size) + reward_hidden_state = ( + torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device), torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device) + ) + return EZNetworkOutput(value, [0. for _ in range(batch_size)], policy_logits, latent_state, reward_hidden_state) + + def recurrent_inference( + self, latent_state: torch.Tensor, reward_hidden_state: torch.Tensor, action: torch.Tensor + ) -> EZNetworkOutput: + """ + Overview: + Recurrent inference of Sampled EfficientZero model, which is the rollout step of the Sampled EfficientZero model. + To perform the recurrent inference, we first use the dynamics network to predict ``next_latent_state``, + ``reward_hidden_state``, ``value_prefix`` by the given current ``latent_state`` and ``action``. + We then use the prediction network to predict the ``value`` and ``policy_logits``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns (EZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - next_latent_state (:obj:`torch.Tensor`): The predicted next latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + Shapes: + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. 
+            - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size.
+            - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size.
+            - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size.
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size.
+        """
+        next_latent_state, reward_hidden_state, value_prefix = self._dynamics(latent_state, reward_hidden_state, action)
+        policy_logits, value = self._prediction(next_latent_state)
+        return EZNetworkOutput(value, value_prefix, policy_logits, next_latent_state, reward_hidden_state)
+
+    def _representation(self, observation: torch.Tensor) -> Tuple[torch.Tensor]:
+        """
+        Overview:
+            Use the representation network to encode the observations into latent state.
+        Arguments:
+            - obs (:obj:`torch.Tensor`): The 1D vector observation data.
+        Returns:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+        Shapes:
+            - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size.
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+        """
+        latent_state = self.representation_network(observation)
+        if self.state_norm:
+            latent_state = renormalize(latent_state)
+        return latent_state
+
+    def _prediction(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Overview:
+            Use the prediction network to predict ``policy`` and ``value`` for the given latent state.
+        Arguments:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+        Returns:
+            - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action.
+            - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation.
+        Shapes:
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size.
+            - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size.
+        """
+        policy, value = self.prediction_network(latent_state)
+        return policy, value
+
+    def _dynamics(self, latent_state: torch.Tensor, reward_hidden_state: Tuple,
+                  action: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor], torch.Tensor]:
+        """
+        Overview:
+            Concatenate ``latent_state`` and ``action`` and use the dynamics network to predict ``next_latent_state``,
+            ``value_prefix`` and ``next_reward_hidden_state``.
+        Arguments:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+            - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward.
+            - action (:obj:`torch.Tensor`): The predicted action to rollout.
+        Returns:
+            - next_latent_state (:obj:`torch.Tensor`): The predicted latent state of the next timestep.
+            - next_reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward.
+            - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state.
+        Shapes:
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+ - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + """ + # NOTE: the discrete action encoding type is important for some environments + + if not self.continuous_action_space: + # discrete action space + if self.discrete_action_encoding_type == 'one_hot': + # Stack latent_state with the one hot encoded action + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + + # transform action to one-hot encoding. + # action_one_hot shape: (batch_size, action_space_size), e.g., (8, 4) + action_one_hot = torch.zeros(action.shape[0], self.action_space_size, device=action.device) + # transform action to torch.int64 + action = action.long() + action_one_hot.scatter_(1, action, 1) + action_encoding = action_one_hot + elif self.discrete_action_encoding_type == 'not_one_hot': + action_encoding = action / self.action_space_size + if len(action_encoding.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action_encoding = action_encoding.unsqueeze(-1) + else: + # continuous action space + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + elif len(action.shape) == 3: + # (batch_size, action_dim, 1) -> (batch_size, action_dim) + # e.g., torch.Size([8, 2, 1]) -> torch.Size([8, 2]) + action = action.squeeze(-1) + + action_encoding = action + + action_encoding = action_encoding.to(latent_state.device).float() + # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or + # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type. + state_action_encoding = torch.cat((latent_state, action_encoding), dim=1) + + next_latent_state, next_reward_hidden_state, value_prefix = self.dynamics_network( + state_action_encoding, reward_hidden_state + ) + + if not self.state_norm: + return next_latent_state, next_reward_hidden_state, value_prefix + else: + next_latent_state_normalized = renormalize(next_latent_state) + return next_latent_state_normalized, next_reward_hidden_state, value_prefix + + def project(self, latent_state: torch.Tensor, with_grad=True) -> torch.Tensor: + """ + Overview: + Project the latent state to a lower dimension to calculate the self-supervised loss, which is proposed in EfficientZero. + For more details, please refer to the paper ``Exploring Simple Siamese Representation Learning``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - with_grad (:obj:`bool`): Whether to calculate gradient for the projection result. + Returns: + - proj (:obj:`torch.Tensor`): The result embedding vector of projection operation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - proj (:obj:`torch.Tensor`): :math:`(B, projection_output_dim)`, where B is batch_size. 
+ + Examples: + >>> latent_state = torch.randn(256, 64) + >>> output = self.project(latent_state) + >>> output.shape # (256, 1024) + """ + proj = self.projection(latent_state) + + if with_grad: + # with grad, use prediction_head + return self.prediction_head(proj) + else: + return proj.detach() + + def get_params_mean(self): + return get_params_mean(self) + + +class PredictionNetworkMD(nn.Module): + + def __init__( + self, + agent_num, + single_agent_action_size, + continuous_action_space, + action_space_size, + num_channels, + common_layer_num: int = 2, + fc_value_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + # ============================================================== + # specific sampled related config + # ============================================================== + sigma_type='conditioned', + fixed_sigma_value: float = 0.3, + bound_type: str = None, + norm_type: str = 'BN', + output_separate_logit: bool = False, + ): + """ + Overview: + The definition of policy and value prediction network, which is used to predict value and policy by the + given latent state. + The networks are mainly built on fully connected layers. + Arguments: + - agent_num (:obj:`int`): The number of agents in the environment. + - single_agent_action_size (:obj:`int`): The number of actions for each agent. + - continuous_action_space (:obj:`bool`): The type of action space. default set it to False. + - action_space_size: (:obj:`int`): Action space size, usually an integer number. For discrete action \ + space, it is the number of discrete actions. For continuous action space, it is the dimension of \ + continuous action. + - num_channels (:obj:`int`): The num of channels in latent states. + - num_res_blocks (:obj:`int`): The number of res blocks. + - fc_value_layers (:obj:`SequenceType`): hidden layers of the value prediction head (MLP head). + - output_support_size (:obj:`int`): dim of value output. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - output_separate_logit (:obj:`bool`): Whether to output separate logit for each action. + # ============================================================== + # specific sampled related config + # ============================================================== + # see ``ReparameterizationHead`` in ``ding.model.common.head`` for more details about thee following arguments. + - sigma_type (:obj:`str`): the type of sigma in policy head of prediction network, options={'conditioned', 'fixed'}. + - fixed_sigma_value (:obj:`float`): the fixed sigma value in policy head of prediction network, + - bound_type (:obj:`str`): The type of bound in networks. default set it to None. + - norm_type (:obj:`str`): The type of normalization in networks. default set it to 'BN'. 
+ """ + super().__init__() + self.num_channels = num_channels + self.continuous_action_space = continuous_action_space + self.norm_type = norm_type + self.sigma_type = sigma_type + self.fixed_sigma_value = fixed_sigma_value + self.bound_type = bound_type + self.action_space_size = action_space_size + if self.continuous_action_space: + self.action_encoding_dim = self.action_space_size + else: + self.action_encoding_dim = 1 + + # ******* common backbone ****** + self.fc_prediction_common = MLP( + in_channels=self.num_channels, + hidden_channels=self.num_channels, + out_channels=self.num_channels, + layer_num=common_layer_num, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + # ******* value and policy head ****** + self.fc_value_head = MLP( + in_channels=self.num_channels, + hidden_channels=fc_value_layers[0], + out_channels=output_support_size, + layer_num=2, + activation=activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + # last_linear_layer_init_zero=True is beneficial for convergence speed. + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + # sampled related core code + if self.continuous_action_space: + self.policy_multi_head = MultiHead( + head_cls=ReparameterizationHead, + hidden_size=self.num_channels, + output_size_list=[single_agent_action_size for _ in range(agent_num)], + layer_num=2, + sigma_type=self.sigma_type, + fixed_sigma_value=self.fixed_sigma_value, + activation=nn.ReLU(), + norm_type=None, + bound_type=self.bound_type + ) + else: + self.policy_multi_head = MultiHead( + head_cls=DiscreteHead, + hidden_size=self.num_channels, + output_size_list=[single_agent_action_size for _ in range(agent_num)], + ) + self.output_separate_logit = output_separate_logit + + def forward(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Forward computation of the prediction network. + Arguments: + - latent_state (:obj:`torch.Tensor`): input tensor with shape (B, in_channels). + Returns: + - policy (:obj:`torch.Tensor`): policy tensor. If action space is discrete, shape is (B, action_space_size). + If action space is continuous, shape is (B, action_space_size * 2). + - value (:obj:`torch.Tensor`): value tensor with shape (B, output_support_size). + """ + x_prediction_common = self.fc_prediction_common(latent_state) + value = self.fc_value_head(x_prediction_common) + + # sampled related core code + if not self.continuous_action_space: + # policy_list: {'logit': [policy1, policy2, ...],} + # policyi shape: (B, action_space_size) + policy_list = self.policy_multi_head(x_prediction_common)['logit'] + if not self.output_separate_logit: + # The joint action space policy is the product of each agent policy + # policy shape: (B, action_space_size^^agent_num) + batch_size = latent_state.size(0) + joint_logits_batches = [] + for i in range(batch_size): + current_batch = [policy[i] for policy in policy_list] + cartesian_prod_result = torch.cartesian_prod(*current_batch) + joint_logits = cartesian_prod_result.prod(dim=1) + joint_logits_batches.append(joint_logits) + policy = torch.stack(joint_logits_batches) + else: + # policy_list: [policy1, policy2, ...] 
+ # policy sahpe: (B, agent_num, action_space_size) + policy = torch.stack(policy_list, dim=1) + elif self.continuous_action_space: + # policy = torch.cat([policy['mu'], policy['sigma']], dim=-1) + # TODO(rjy): complete the continuous action space policy + pass + + return policy, value diff --git a/lzero/policy/sampled_efficientzero.py b/lzero/policy/sampled_efficientzero.py index 7003f6808..687fe631b 100644 --- a/lzero/policy/sampled_efficientzero.py +++ b/lzero/policy/sampled_efficientzero.py @@ -234,6 +234,8 @@ def default_model(self) -> Tuple[str, List[str]]: return 'SampledEfficientZeroModel', ['lzero.model.sampled_efficientzero_model'] elif self._cfg.model.model_type == "mlp": return 'SampledEfficientZeroModelMLP', ['lzero.model.sampled_efficientzero_model_mlp'] + elif self._cfg.model.model_type == "mlp_md": + return 'SampledEfficientZeroModelMD', ['lzero.model.sampled_efficientzero_model_md'] else: raise ValueError("model type {} is not supported".format(self._cfg.model.model_type)) diff --git a/zoo/CrowdSim/config/crowdsim_sez_md_config.py b/zoo/CrowdSim/config/crowdsim_sez_md_config.py new file mode 100644 index 000000000..08ace6a98 --- /dev/null +++ b/zoo/CrowdSim/config/crowdsim_sez_md_config.py @@ -0,0 +1,118 @@ +from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '0' +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(3e5) +reanalyze_ratio = 0. +robot_num = 2 +human_num = 59 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +K = 10 +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_sez_config = dict( + exp_name= + f'result/old_env/new_CrowdSim_sez_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + env=dict( + obs_mode='1-dim-array', + env_name='CrowdSim-v0', + dataset = 'purdue', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + agent_num = robot_num, + observation_shape=(robot_num + human_num)*4, + obs_mode='1-dim-array', + robot_state_dim = 4, + human_state_dim = 4, + robot_num = robot_num, + human_num = human_num, + single_agent_action_size=len(one_uav_action_space), + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. + continuous_action_space=False, + num_of_sampled_actions=K, + lstm_hidden_size=128, + latent_state_dim=128, + self_supervised_learning_loss=True, # NOTE: default is False. 
+ discrete_action_encoding_type='one_hot', + res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + ssl_loss_weight=2, # NOTE: default is 0. + grad_clip_value=0.5, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_sez_config = EasyDict(CrowdSim_sez_config) +main_config = CrowdSim_sez_config + +CrowdSim_sez_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.crowdsim_lightzero_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='sampled_efficientzero', + import_names=['lzero.policy.sampled_efficientzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_sez_create_config = EasyDict(CrowdSim_sez_create_config) +create_config = CrowdSim_sez_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. 
+ """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) From c6723a013a4b74938f2a294310fc06d1206c776d Mon Sep 17 00:00:00 2001 From: nighood Date: Tue, 7 May 2024 00:01:54 +0800 Subject: [PATCH 11/16] feature(rjy): set the environment to two modes --- .../config/crowdsim_muzero_md_config.py | 5 +- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 48 ++++++++++++++----- .../envs/test_crowdsim_lightzero_env.py | 1 + 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py index ec4453ff8..c0b3b1018 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py @@ -1,6 +1,6 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '2' +os.environ["CUDA_VISIBLE_DEVICES"] = '1' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -23,8 +23,9 @@ CrowdSim_muzero_config = dict( exp_name= - f'result/CrowdSim_muzeromd_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + f'result/new_env/new_CrowdSim_muzeromd_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( + env_mode = 'easy', obs_mode='1-dim-array', env_name='CrowdSim-v0', dataset = 'purdue', diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index 858bbe1a0..ba855b35d 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -29,6 +29,7 @@ def __init__(self, dataset, custom_config=None): self.config = get_selected_config(dataset) self.config.update(custom_config) + self.env_mode = self.config.env_mode # 'easy' or 'hard' self.human_num = self.config.human_num self.robot_num = self.config.robot_num self.num_timestep = self.config.num_timestep # max timestep @@ -67,6 +68,7 @@ def __init__(self, dataset, custom_config=None): self.human_df['aoi'] = -1 # 加入aoi记录aoi self.human_df['data_amount'] = -1 # record the remaining data amount of each human self.human_df['energy'] = -1 # 加入energy记录energy + logging.info('Env mode:', self.env_mode) logging.info('human number: {}'.format(self.human_num)) logging.info('Robot number: {}'.format(self.robot_num)) @@ -195,17 +197,36 @@ def step(self, action): next_px, next_py, next_theta = get_human_position_from_list(self.current_timestep + 1, human_id, selected_data, selected_next_data, self.config) should_reset = judge_aoi_update([next_px, next_py], new_robot_position, self.config) - if should_reset: - # if the human is in the range of the robot, then part of human's data will be transmitted - last_data_amount = human.data_amount - human.update(next_px, next_py, next_theta, transmitted_data=self.transmit_v) - human_transmit_data_list[human_id] = min(last_data_amount, self.transmit_v) - num_updated_human += 1 + if self.env_mode == 'easy': + if should_reset: + # if the human is in the range of the robot, then part of human's data will be transmitted + if human.aoi > 1: + human_transmit_data_list[human_id] = human.aoi + else: + human_transmit_data_list[human_id] = 1 + + human.set(next_px, next_py, next_theta, aoi=1, data_amount=human.aoi) + num_updated_human += 1 + else: + # if the human is not in the range of the robot, then update the aoi of the human + 
human_transmit_data_list[human_id] = 0 + new_aoi = human.aoi + 1 + human.set(next_px, next_py, next_theta, aoi=new_aoi, data_amount=human.aoi) + + elif self.env_mode == 'hard': + if should_reset: + # if the human is in the range of the robot, then part of human's data will be transmitted + last_data_amount = human.data_amount + human.update(next_px, next_py, next_theta, transmitted_data=self.transmit_v) + human_transmit_data_list[human_id] = min(last_data_amount, self.transmit_v) + num_updated_human += 1 + else: + # if the human is not in the range of the robot, then no data will be transmitted, \ + # and update aoi and caculate new collected data amount + human_transmit_data_list[human_id] = 0 + human.update(next_px, next_py, next_theta, transmitted_data=0) else: - # if the human is not in the range of the robot, then no data will be transmitted, \ - # and update aoi and caculate new collected data amount - human_transmit_data_list[human_id] = 0 - human.update(next_px, next_py, next_theta, transmitted_data=0) + raise ValueError("env_mode should be 'easy' or 'hard'") self.cur_data_amount_timelist[human_id] = human.data_amount self.current_human_aoi_list[human_id] = human.aoi @@ -215,7 +236,12 @@ def step(self, action): self.update_human_timelist[self.current_timestep] = num_updated_human delta_sum_transmit_data = np.sum(human_transmit_data_list) self.data_transmission += (delta_sum_transmit_data * 0.3) # Mb, 0.02M/s per person - self.total_generated_data_amount += self.generate_data_amount_per_step + if self.env_mode == 'easy': + # in easy mode, the data amount generated per step is equal to the number of humans + self.total_generated_data_amount = self.num_timestep*self.human_num + elif self.env_mode == 'hard': + # in hard mode, the data amount generated per step is equal to the sum of the data amount of all humans + self.total_generated_data_amount += self.generate_data_amount_per_step # TODO: need to be well-defined reward = self.mean_aoi_timelist[self.current_timestep] - self.mean_aoi_timelist[self.current_timestep + 1] \ diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py index 0a5266143..09f36d6b7 100644 --- a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -10,6 +10,7 @@ human_num = 59, # purdue one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]], obs_mode = '2-dim-array', + env_mode = 'easy', ) @ pytest.mark.envtest From c4e9d584ef8ac5f0adc11c7c7c66c2706eb9a442 Mon Sep 17 00:00:00 2001 From: nighood Date: Tue, 7 May 2024 16:46:37 +0800 Subject: [PATCH 12/16] feature(rjy): add ez multi-head model --- lzero/agent/efficientzero.py | 3 + lzero/model/efficientzero_model_md.py | 479 ++++++++++++++++++ lzero/policy/efficientzero.py | 2 + .../crowdsim_efficientzero_md_config.py | 114 +++++ 4 files changed, 598 insertions(+) create mode 100644 lzero/model/efficientzero_model_md.py create mode 100644 zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py diff --git a/lzero/agent/efficientzero.py b/lzero/agent/efficientzero.py index 421cea881..f06cd844b 100644 --- a/lzero/agent/efficientzero.py +++ b/lzero/agent/efficientzero.py @@ -110,6 +110,9 @@ def __init__( elif self.cfg.policy.model.model_type == 'conv': from lzero.model.efficientzero_model import EfficientZeroModel model = EfficientZeroModel(**self.cfg.policy.model) + elif self.cfg.policy.model.model_type == 'mlp_md': + from lzero.model.efficientzero_model_md import EfficientZeroModelMD + model = 
EfficientZeroModelMD(**self.cfg.policy.model) else: raise NotImplementedError if self.cfg.policy.cuda and torch.cuda.is_available(): diff --git a/lzero/model/efficientzero_model_md.py b/lzero/model/efficientzero_model_md.py new file mode 100644 index 000000000..d520995d8 --- /dev/null +++ b/lzero/model/efficientzero_model_md.py @@ -0,0 +1,479 @@ +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType +from numpy import ndarray + +from .common import EZNetworkOutput, RepresentationNetworkMLP +from .muzero_model_md import PredictionNetworkMD +from .utils import renormalize, get_params_mean, get_dynamic_mean, get_reward_mean + + +@MODEL_REGISTRY.register('EfficientZeroModelMD') +class EfficientZeroModelMD(nn.Module): + + def __init__( + self, + agent_num: int, + output_separate_logit: bool = False, + observation_shape: int = 2, + single_agent_action_size: int = 5, + action_space_size: int = 6, + lstm_hidden_size: int = 512, + latent_state_dim: int = 256, + fc_reward_layers: SequenceType = [32], + fc_value_layers: SequenceType = [32], + reward_support_size: int = 601, + value_support_size: int = 601, + proj_hid: int = 1024, + proj_out: int = 1024, + pred_hid: int = 512, + pred_out: int = 1024, + self_supervised_learning_loss: bool = True, + categorical_distribution: bool = True, + last_linear_layer_init_zero: bool = True, + state_norm: bool = False, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + discrete_action_encoding_type: str = 'one_hot', + res_connection_in_dynamics: bool = False, + *args, + **kwargs, + ): + """ + Overview: + The definition of the network model of EfficientZero, which is a generalization version for 1D vector obs. + The networks are mainly built on fully connected layers. + Sampled EfficientZero model consists of a representation network, a dynamics network and a prediction network. + The representation network is an MLP network which maps the raw observation to a latent state. + The dynamics network is an MLP+LSTM network which predicts the next latent state, reward_hidden_state and value_prefix given the current latent state and action. + The prediction network is an MLP network which predicts the value and policy given the current latent state. + Arguments: + - observation_shape (:obj:`int`): Observation space shape, e.g. 8 for Lunarlander. + - action_space_size: (:obj:`int`): Action space size, e.g. 4 for Lunarlander. + - lstm_hidden_size (:obj:`int`): The hidden size of LSTM in dynamics network to predict value_prefix. + - latent_state_dim (:obj:`int`): The dimension of latent state, such as 256. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head). + - fc_policy_layers (:obj:`SequenceType`): The number of hidden layers used in policy head (MLP head). + - reward_support_size (:obj:`int`): The size of categorical reward output + - value_support_size (:obj:`int`): The size of categorical value output. + - proj_hid (:obj:`int`): The size of projection hidden layer. + - proj_out (:obj:`int`): The size of projection output layer. + - pred_hid (:obj:`int`): The size of prediction hidden layer. + - pred_out (:obj:`int`): The size of prediction output layer. 
+ - self_supervised_learning_loss (:obj:`bool`): Whether to use self_supervised_learning related networks in Sampled EfficientZero model, default set it to False. + - categorical_distribution (:obj:`bool`): Whether to use discrete support to represent categorical distribution for value, reward/value_prefix. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - state_norm (:obj:`bool`): Whether to use normalization for latent states, default sets it to True. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - discrete_action_encoding_type (:obj:`str`): The type of encoding for discrete action. Default sets it to 'one_hot'. options = {'one_hot', 'not_one_hot'} + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection for dynamics network, default set it to False. + """ + super(EfficientZeroModelMD, self).__init__() + if not categorical_distribution: + self.reward_support_size = 1 + self.value_support_size = 1 + else: + self.reward_support_size = reward_support_size + self.value_support_size = value_support_size + + self.action_space_size = action_space_size + self.continuous_action_space = False + # The dim of action space. For discrete action space, it is 1. + # For continuous action space, it is the dimension of continuous action. + self.action_space_dim = action_space_size if self.continuous_action_space else 1 + assert discrete_action_encoding_type in ['one_hot', 'not_one_hot'], discrete_action_encoding_type + self.discrete_action_encoding_type = discrete_action_encoding_type + if self.continuous_action_space: + self.action_encoding_dim = action_space_size + else: + if self.discrete_action_encoding_type == 'one_hot': + self.action_encoding_dim = action_space_size + elif self.discrete_action_encoding_type == 'not_one_hot': + self.action_encoding_dim = 1 + + self.lstm_hidden_size = lstm_hidden_size + self.proj_hid = proj_hid + self.proj_out = proj_out + self.pred_hid = pred_hid + self.pred_out = pred_out + self.self_supervised_learning_loss = self_supervised_learning_loss + self.last_linear_layer_init_zero = last_linear_layer_init_zero + self.state_norm = state_norm + self.res_connection_in_dynamics = res_connection_in_dynamics + + self.representation_network = RepresentationNetworkMLP( + observation_shape=observation_shape, hidden_channels=latent_state_dim, norm_type=norm_type + ) + + self.dynamics_network = DynamicsNetworkMLP( + action_encoding_dim=self.action_encoding_dim, + num_channels=latent_state_dim + self.action_encoding_dim, + common_layer_num=2, + lstm_hidden_size=lstm_hidden_size, + fc_reward_layers=fc_reward_layers, + output_support_size=self.reward_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + res_connection_in_dynamics=self.res_connection_in_dynamics, + ) + + self.prediction_network = PredictionNetworkMD( + agent_num=agent_num, + single_agent_action_size=single_agent_action_size, + num_channels=latent_state_dim, + fc_value_layers=fc_value_layers, + output_support_size=self.value_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + output_separate_logit=output_separate_logit, + ) + + if self.self_supervised_learning_loss: + # self_supervised_learning_loss 
related network proposed in EfficientZero + self.projection_input_dim = latent_state_dim + + self.projection = nn.Sequential( + nn.Linear(self.projection_input_dim, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_out), nn.BatchNorm1d(self.proj_out) + ) + self.prediction_head = nn.Sequential( + nn.Linear(self.proj_out, self.pred_hid), + nn.BatchNorm1d(self.pred_hid), + activation, + nn.Linear(self.pred_hid, self.pred_out), + ) + + def initial_inference(self, obs: torch.Tensor) -> EZNetworkOutput: + """ + Overview: + Initial inference of EfficientZero model, which is the first step of the EfficientZero model. + To perform the initial inference, we first use the representation network to obtain the "latent_state" of the observation. + Then we use the prediction network to predict the "value" and "policy_logits" of the "latent_state", and + also prepare the zeros-like ``reward_hidden_state`` for the next step of the EfficientZero model. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns (EZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. \ + In initial inference, we set it to zero vector. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The hidden state of LSTM about reward. In initial inference, \ + we set it to the zeros-like hidden state (H and C). + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size. + """ + batch_size = obs.size(0) + latent_state = self._representation(obs) + policy_logits, value = self._prediction(latent_state) + # zero initialization for reward hidden states + # (hn, cn), each element shape is (layer_num=1, batch_size, lstm_hidden_size) + reward_hidden_state = ( + torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device), torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device) + ) + return EZNetworkOutput(value, [0. for _ in range(batch_size)], policy_logits, latent_state, reward_hidden_state) + + def recurrent_inference( + self, latent_state: torch.Tensor, reward_hidden_state: torch.Tensor, action: torch.Tensor + ) -> EZNetworkOutput: + """ + Overview: + Recurrent inference of EfficientZero model, which is the rollout step of the EfficientZero model. + To perform the recurrent inference, we first use the dynamics network to predict ``next_latent_state``, + ``reward_hidden_state``, ``value_prefix`` by the given current ``latent_state`` and ``action``. + We then use the prediction network to predict the ``value`` and ``policy_logits``. 
+        Arguments:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+            - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward.
+            - action (:obj:`torch.Tensor`): The predicted action to rollout.
+        Returns (EZNetworkOutput):
+            - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation.
+            - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state.
+            - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action.
+            - next_latent_state (:obj:`torch.Tensor`): The predicted next latent state.
+            - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward.
+        Shapes:
+            - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size.
+            - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size.
+            - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size.
+            - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size.
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size.
+        """
+        next_latent_state, reward_hidden_state, value_prefix = self._dynamics(latent_state, reward_hidden_state, action)
+        policy_logits, value = self._prediction(next_latent_state)
+        return EZNetworkOutput(value, value_prefix, policy_logits, next_latent_state, reward_hidden_state)
+
+    def _representation(self, observation: torch.Tensor) -> Tuple[torch.Tensor]:
+        """
+        Overview:
+            Use the representation network to encode the observations into latent state.
+        Arguments:
+            - obs (:obj:`torch.Tensor`): The 1D vector observation data.
+        Returns:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+        Shapes:
+            - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size.
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+        """
+        observation = observation.float()
+        latent_state = self.representation_network(observation)
+        if self.state_norm:
+            latent_state = renormalize(latent_state)
+        return latent_state
+
+    def _prediction(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor]:
+        """
+        Overview:
+            Use the prediction network to predict the value and policy given the latent state.
+        Arguments:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+        Returns:
+            - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action.
+            - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation.
+        Shapes:
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size.
+            - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size.
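+        Examples:
+            >>> # Illustrative shape check; the sizes below are assumed values
+            >>> # (latent_state_dim=128, value_support_size=601, a joint action space of
+            >>> # 25, i.e. 2 UAVs x 5 actions, with output_separate_logit=False).
+            >>> latent_state = torch.randn(8, 128)
+            >>> policy_logits, value = self._prediction(latent_state)
+            >>> policy_logits.shape, value.shape  # (torch.Size([8, 25]), torch.Size([8, 601]))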
+ """ + policy_logits, value = self.prediction_network(latent_state) + return policy_logits, value + + def _dynamics(self, latent_state: torch.Tensor, reward_hidden_state: Tuple, + action: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor], torch.Tensor]: + """ + Overview: + Concatenate ``latent_state`` and ``action`` and use the dynamics network to predict ``next_latent_state`` + ``value_prefix`` and ``next_reward_hidden_state``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns: + - next_latent_state (:obj:`torch.Tensor`): The predicted latent state of the next timestep. + - next_reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + """ + # NOTE: the discrete action encoding type is important for some environments + + # discrete action space + if self.discrete_action_encoding_type == 'one_hot': + # Stack latent_state with the one hot encoded action + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + + # transform action to one-hot encoding. + # action_one_hot shape: (batch_size, action_space_size), e.g., (8, 4) + action_one_hot = torch.zeros(action.shape[0], self.action_space_size, device=action.device) + # transform action to torch.int64 + action = action.long() + action_one_hot.scatter_(1, action, 1) + action_encoding = action_one_hot + elif self.discrete_action_encoding_type == 'not_one_hot': + action_encoding = action / self.action_space_size + if len(action_encoding.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action_encoding = action_encoding.unsqueeze(-1) + + action_encoding = action_encoding.to(latent_state.device).float() + # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or + # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type. + state_action_encoding = torch.cat((latent_state, action_encoding), dim=1) + + # NOTE: the key difference with MuZero + next_latent_state, next_reward_hidden_state, value_prefix = self.dynamics_network( + state_action_encoding, reward_hidden_state + ) + + if self.state_norm: + next_latent_state = renormalize(next_latent_state) + return next_latent_state, next_reward_hidden_state, value_prefix + + def project(self, latent_state: torch.Tensor, with_grad=True): + """ + Overview: + Project the latent state to a lower dimension to calculate the self-supervised loss, which is proposed in EfficientZero. + For more details, please refer to the paper ``Exploring Simple Siamese Representation Learning``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. 
+ - with_grad (:obj:`bool`): Whether to calculate gradient for the projection result. + Returns: + - proj (:obj:`torch.Tensor`): The result embedding vector of projection operation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - proj (:obj:`torch.Tensor`): :math:`(B, projection_output_dim)`, where B is batch_size. + + Examples: + >>> latent_state = torch.randn(256, 64) + >>> output = self.project(latent_state) + >>> output.shape # (256, 1024) + """ + proj = self.projection(latent_state) + + if with_grad: + # with grad, use prediction_head + return self.prediction_head(proj) + else: + return proj.detach() + + def get_params_mean(self) -> float: + return get_params_mean(self) + + +class DynamicsNetworkMLP(nn.Module): + + def __init__( + self, + action_encoding_dim: int = 2, + num_channels: int = 64, + common_layer_num: int = 2, + fc_reward_layers: SequenceType = [32], + output_support_size: int = 601, + lstm_hidden_size: int = 512, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + res_connection_in_dynamics: bool = False, + ): + """ + Overview: + The definition of dynamics network in EfficientZero algorithm, which is used to predict next latent state + value_prefix and reward_hidden_state by the given current latent state and action. + The networks are mainly built on fully connected layers. + Arguments: + - action_encoding_dim (:obj:`int`): The dimension of action encoding. + - num_channels (:obj:`int`): The num of channels in latent states. + - common_layer_num (:obj:`int`): The number of common layers in dynamics network. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - output_support_size (:obj:`int`): The size of categorical reward output. + - lstm_hidden_size (:obj:`int`): The hidden size of lstm in dynamics network. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializationss for the last layer of value/policy head, default sets it to True. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection in dynamics network. 
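+        Examples:
+            >>> # Illustrative construction; the argument values are assumptions matching the
+            >>> # CrowdSim multi-discrete config (a 128-dim latent state concatenated with a
+            >>> # one-hot joint action encoding of size 25).
+            >>> dynamics = DynamicsNetworkMLP(
+            ...     action_encoding_dim=25, num_channels=128 + 25, lstm_hidden_size=128,
+            ...     fc_reward_layers=[32], output_support_size=601
+            ... )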
+ """ + super().__init__() + assert num_channels > action_encoding_dim, f'num_channels:{num_channels} <= action_encoding_dim:{action_encoding_dim}' + + self.num_channels = num_channels + self.action_encoding_dim = action_encoding_dim + self.latent_state_dim = self.num_channels - self.action_encoding_dim + self.lstm_hidden_size = lstm_hidden_size + self.activation = activation + self.res_connection_in_dynamics = res_connection_in_dynamics + + if self.res_connection_in_dynamics: + self.fc_dynamics_1 = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + self.fc_dynamics_2 = MLP( + in_channels=self.latent_state_dim, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + else: + self.fc_dynamics = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + # input_shape: (sequence_length,batch_size,input_size) + # output_shape: (sequence_length, batch_size, hidden_size) + self.lstm = nn.LSTM(input_size=self.latent_state_dim, hidden_size=self.lstm_hidden_size) + + self.fc_reward_head = MLP( + in_channels=self.lstm_hidden_size, + hidden_channels=fc_reward_layers[0], + layer_num=2, + out_channels=output_support_size, + activation=self.activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + def forward(self, state_action_encoding: torch.Tensor, reward_hidden_state): + """ + Overview: + Forward computation of the dynamics network. Predict next latent state given current state_action_encoding and reward hidden state. + Arguments: + - state_action_encoding (:obj:`torch.Tensor`): The state-action encoding, which is the concatenation of \ + latent state and action encoding, with shape (batch_size, num_channels, height, width). + - reward_hidden_state (:obj:`Tuple[torch.Tensor, torch.Tensor]`): The input hidden state of LSTM about reward. + Returns: + - next_latent_state (:obj:`torch.Tensor`): The next latent state, with shape (batch_size, latent_state_dim). + - next_reward_hidden_state (:obj:`torch.Tensor`): The input hidden state of LSTM about reward. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. 
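+        Examples:
+            >>> # Illustrative shape check reusing the ``dynamics`` instance sketched in the
+            >>> # constructor example above (latent_state_dim=128, action_encoding_dim=25,
+            >>> # lstm_hidden_size=128, output_support_size=601).
+            >>> state_action_encoding = torch.randn(8, 128 + 25)
+            >>> reward_hidden_state = (torch.zeros(1, 8, 128), torch.zeros(1, 8, 128))
+            >>> next_latent, next_hidden, value_prefix = dynamics(state_action_encoding, reward_hidden_state)
+            >>> next_latent.shape, value_prefix.shape  # (torch.Size([8, 128]), torch.Size([8, 601]))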
+ """ + if self.res_connection_in_dynamics: + # take the state encoding (latent_state), state_action_encoding[:, -self.action_encoding_dim] + # is action encoding + latent_state = state_action_encoding[:, :-self.action_encoding_dim] + x = self.fc_dynamics_1(state_action_encoding) + # the residual link: add state encoding to the state_action encoding + next_latent_state = x + latent_state + next_latent_state_ = self.fc_dynamics_2(next_latent_state) + else: + next_latent_state = self.fc_dynamics(state_action_encoding) + next_latent_state_ = next_latent_state + + next_latent_state_unsqueeze = next_latent_state_.unsqueeze(0) + value_prefix, next_reward_hidden_state = self.lstm(next_latent_state_unsqueeze, reward_hidden_state) + value_prefix = self.fc_reward_head(value_prefix.squeeze(0)) + + return next_latent_state, next_reward_hidden_state, value_prefix + + def get_dynamic_mean(self) -> float: + return get_dynamic_mean(self) + + def get_reward_mean(self) -> Tuple[ndarray, float]: + return get_reward_mean(self) diff --git a/lzero/policy/efficientzero.py b/lzero/policy/efficientzero.py index 3a94baf51..b9e844cbe 100644 --- a/lzero/policy/efficientzero.py +++ b/lzero/policy/efficientzero.py @@ -218,6 +218,8 @@ def default_model(self) -> Tuple[str, List[str]]: return 'EfficientZeroModel', ['lzero.model.efficientzero_model'] elif self._cfg.model.model_type == "mlp": return 'EfficientZeroModelMLP', ['lzero.model.efficientzero_model_mlp'] + elif self._cfg.model.model_type == "mlp_md": + return 'EfficientZeroModelMD', ['lzero.model.efficientzero_model_md'] else: raise ValueError("model type {} is not supported".format(self._cfg.model.model_type)) diff --git a/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py new file mode 100644 index 000000000..16e395606 --- /dev/null +++ b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py @@ -0,0 +1,114 @@ +from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '0' +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(3e5) +reanalyze_ratio = 0. 
+robot_num = 2 +human_num = 59 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_efficientzero_md_config = dict( + exp_name= + f'result/old_env/new_CrowdSim_ez_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + env=dict( + env_mode='easy', + obs_mode='1-dim-array', + env_name='CrowdSim-v0', + dataset = 'purdue', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + agent_num = robot_num, + observation_shape=(robot_num + human_num)*4, + obs_mode='1-dim-array', + robot_state_dim = 4, + human_state_dim = 4, + robot_num = robot_num, + human_num = human_num, + single_agent_action_size=len(one_uav_action_space), + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. + lstm_hidden_size=128, + latent_state_dim=128, + discrete_action_encoding_type='one_hot', + res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + grad_clip_value=0.5, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_efficientzero_md_config = EasyDict(CrowdSim_efficientzero_md_config) +main_config = CrowdSim_efficientzero_md_config + +CrowdSim_efficientzero_md_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.crowdsim_lightzero_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='efficientzero', + import_names=['lzero.policy.efficientzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_efficientzero_md_create_config = EasyDict(CrowdSim_efficientzero_md_create_config) +create_config = CrowdSim_efficientzero_md_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. 
+ """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) From cb044af9a4bb78f8d5d902478c7350f3f60b2f88 Mon Sep 17 00:00:00 2001 From: nighood Date: Tue, 7 May 2024 19:39:53 +0800 Subject: [PATCH 13/16] polish(rjy): add v_trans in config --- .../config/crowdsim_efficientzero_md_config.py | 14 ++++++-------- zoo/CrowdSim/config/crowdsim_muzero_md_config.py | 8 +++++--- zoo/CrowdSim/config/crowdsim_sez_md_config.py | 5 ++++- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 4 ++-- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py index 16e395606..ff2a0913c 100644 --- a/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py @@ -1,6 +1,6 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '0' +os.environ["CUDA_VISIBLE_DEVICES"] = '1' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -17,15 +17,17 @@ # human_num = 33 # NCSU # human_num = 92 # KAIST one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +transmit_v = 20 # ============================================================== # end of the most frequently changed config specified by the user # ============================================================== CrowdSim_efficientzero_md_config = dict( exp_name= - f'result/old_env/new_CrowdSim_ez_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + f'result/new_env/new_CrowdSim_ez_md_ssl_vt{transmit_v}_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( - env_mode='easy', + env_mode='hard', + transmit_v=transmit_v, obs_mode='1-dim-array', env_name='CrowdSim-v0', dataset = 'purdue', @@ -70,7 +72,7 @@ num_simulations=num_simulations, reanalyze_ratio=reanalyze_ratio, n_episode=n_episode, - eval_freq=int(1e3), + eval_freq=int(2e2), replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. 
collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, @@ -90,10 +92,6 @@ type='efficientzero', import_names=['lzero.policy.efficientzero'], ), - collector=dict( - type='episode_muzero', - import_names=['lzero.worker.muzero_collector'], - ) ) CrowdSim_efficientzero_md_create_config = EasyDict(CrowdSim_efficientzero_md_create_config) create_config = CrowdSim_efficientzero_md_create_config diff --git a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py index c0b3b1018..a79eb0bbe 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py @@ -17,15 +17,17 @@ # human_num = 33 # NCSU # human_num = 92 # KAIST one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +transmit_v = 20 # ============================================================== # end of the most frequently changed config specified by the user # ============================================================== CrowdSim_muzero_config = dict( exp_name= - f'result/new_env/new_CrowdSim_muzeromd_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + f'result/new_env/new_CrowdSim_vt{transmit_v}_muzero_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( - env_mode = 'easy', + env_mode = 'hard', + transmit_v=transmit_v, obs_mode='1-dim-array', env_name='CrowdSim-v0', dataset = 'purdue', @@ -74,7 +76,7 @@ num_simulations=num_simulations, reanalyze_ratio=reanalyze_ratio, n_episode=n_episode, - eval_freq=int(1e3), + eval_freq=int(2e2), replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, diff --git a/zoo/CrowdSim/config/crowdsim_sez_md_config.py b/zoo/CrowdSim/config/crowdsim_sez_md_config.py index 08ace6a98..ae2e5074c 100644 --- a/zoo/CrowdSim/config/crowdsim_sez_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_sez_md_config.py @@ -18,15 +18,18 @@ # human_num = 92 # KAIST one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] K = 10 +transmit_v = 20 # ============================================================== # end of the most frequently changed config specified by the user # ============================================================== CrowdSim_sez_config = dict( exp_name= - f'result/old_env/new_CrowdSim_sez_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + f'result/new_env/new_CrowdSim_vt{transmit_v}_sez_md_ssl_K{K}_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( + env_mode='hard', obs_mode='1-dim-array', + transmit_v=transmit_v, env_name='CrowdSim-v0', dataset = 'purdue', robot_num = robot_num, diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index ba855b35d..b590ecb9f 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -44,7 +44,7 @@ def __init__(self, dataset, custom_config=None): self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(self.robot_num+self.human_num, 4), dtype=np.float32) # load_dataset - self.transmit_v = 20 # 5*0.3Mb/s + self.transmit_v = self.config.transmit_v # 5*0.3Mb/s self.nlon = self.config.nlon self.nlat = self.config.nlat self.lower_left = self.config.lower_left @@ -68,7 +68,7 @@ def __init__(self, dataset, custom_config=None): self.human_df['aoi'] = -1 # 加入aoi记录aoi self.human_df['data_amount'] = -1 # record the remaining data 
amount of each human
         self.human_df['energy'] = -1  # add an energy column to record energy
-        logging.info('Env mode:', self.env_mode)
+        logging.info('Env mode: {}'.format(self.env_mode))
         logging.info('human number: {}'.format(self.human_num))
         logging.info('Robot number: {}'.format(self.robot_num))

From 715b5b82134bc5f1450c798d2e4e11551e9f8fc3 Mon Sep 17 00:00:00 2001
From: nighood
Date: Tue, 11 Jun 2024 15:37:52 +0800
Subject: [PATCH 14/16] fix(rjy): fix env bug

---
 lzero/worker/muzero_evaluator.py              |  9 +-
 zoo/CrowdSim/entry/eval_crowdsim.py           | 83 +++++++++++++++++++
 zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py   |  6 +-
 zoo/CrowdSim/envs/Crowdsim/env/model/agent.py |  3 +-
 zoo/CrowdSim/envs/crowdsim_lightzero_env.py   |  6 +-
 5 files changed, 97 insertions(+), 10 deletions(-)
 create mode 100644 zoo/CrowdSim/entry/eval_crowdsim.py

diff --git a/lzero/worker/muzero_evaluator.py b/lzero/worker/muzero_evaluator.py
index a718d33b7..d96cf90f7 100644
--- a/lzero/worker/muzero_evaluator.py
+++ b/lzero/worker/muzero_evaluator.py
@@ -342,11 +342,12 @@ def eval(
                     # Env reset is done by env_manager automatically.
                     self._policy.reset([env_id])
                     reward = t.info['eval_episode_return']
-                    # 'performance_info' and 'episode_info' only choose one
+                    saved_info = {'eval_episode_return': t.info['eval_episode_return']}
                     if 'performance_info' in t.info:
-                        eval_monitor.update_info(env_id, t.info['performance_info'])
-                    elif 'episode_info' in t.info:
-                        eval_monitor.update_info(env_id, t.info['episode_info'])
+                        saved_info.update(t.info['performance_info'])
+                    if 'episode_info' in t.info:
+                        saved_info.update(t.info['episode_info'])
+                    eval_monitor.update_info(env_id, saved_info)
                     eval_monitor.update_reward(env_id, reward)
                     self._logger.info(
                         "[EVALUATOR]env {} finish episode, final reward: {}, current episode: {}".format(
diff --git a/zoo/CrowdSim/entry/eval_crowdsim.py b/zoo/CrowdSim/entry/eval_crowdsim.py
new file mode 100644
index 000000000..13297c76f
--- /dev/null
+++ b/zoo/CrowdSim/entry/eval_crowdsim.py
@@ -0,0 +1,83 @@
+from lzero.entry import eval_muzero
+import numpy as np
+
+if __name__ == "__main__":
+    """
+    Overview:
+        Main script to evaluate the MuZero model on the CrowdSim environment. The script will loop over multiple seeds,
+        evaluating a certain number of episodes per seed. Results are aggregated and printed.
+
+    Variables:
+        - model_path (:obj:`Optional[str]`): The pretrained model path, pointing to the ckpt file of the pretrained model.
+          The path is usually something like ``exp_name/ckpt/ckpt_best.pth.tar``.
+        - seeds (:obj:`List[int]`): List of seeds to use for the evaluations.
+        - num_episodes_each_seed (:obj:`int`): Number of episodes to evaluate for each seed.
+        - total_test_episodes (:obj:`int`): Total number of test episodes, calculated as num_episodes_each_seed * len(seeds).
+        - returns_mean_seeds (:obj:`np.array`): Array of mean return values for each seed.
+        - returns_seeds (:obj:`np.array`): Array of all return values for each seed.
+    """
+    # Importing the necessary configuration files from the CrowdSim MuZero configuration in the zoo directory.
+ # module_path = '/home/nighoodRen/LightZero/result/new_env/new_CrowdSim_vt20_muzero_md_ssl_step300000_uav2__human59_seed0' + # import sys + # if module_path not in sys.path: + # sys.path.append(module_path) + # # 导入模块中的内容 + # from formatted_total_config import main_config, create_config + # from result.new_env.new_CrowdSim_vt20_muzero_md_ssl_step300000_uav2__human59_seed0.formatted_total_config import main_config, create_config + from zoo.CrowdSim.config.crowdsim_muzero_md_config import main_config, create_config + + # model_path is the path to the trained MuZero model checkpoint. + # If no path is provided, the script will use the default model. + model_path = '/home/nighoodRen/LightZero/result/old_env/CrowdSim_muzeromd_ssl_step300000_uav2__human59_seed0_240503_022923/ckpt/ckpt_best.pth.tar' + main_config.exp_name = '/home/nighoodRen/LightZero/result/old_env/CrowdSim_muzeromd_ssl_step300000_uav2__human59_seed0_240503_022923/' + 'eval' # original result folder/eval + # seeds is a list of seed values for the random number generator, used to initialize the environment. + seeds = [0] + # num_episodes_each_seed is the number of episodes to run for each seed. + num_episodes_each_seed = 1 + # total_test_episodes is the total number of test episodes, calculated as the product of the number of seeds and the number of episodes per seed + total_test_episodes = num_episodes_each_seed * len(seeds) + + # Setting the type of the environment manager to 'base' for the visualization purposes. + create_config.env_manager.type = 'base' + # The number of environments to evaluate concurrently. Set to 1 for visualization purposes. + main_config.env.evaluator_env_num = 1 + # The total number of evaluation episodes that should be run. + main_config.env.n_evaluator_episode = total_test_episodes + # A boolean flag indicating whether to render the environments in real-time. + main_config.env.render_mode_human = False + + # A boolean flag indicating whether to save the video of the environment. + main_config.env.save_replay = True + # The path where the recorded video will be saved. + main_config.env.replay_path = main_config.exp_name + '/video' # current result folder/eval + + # The maximum number of steps for each episode during evaluation. This may need to be adjusted based on the specific characteristics of the environment. + main_config.env.eval_max_episode_steps = int(20) + + # These lists will store the mean and total rewards for each seed. + returns_mean_seeds = [] + returns_seeds = [] + + # The main evaluation loop. For each seed, the MuZero model is evaluated and the mean and total rewards are recorded. + for seed in seeds: + returns_mean, returns = eval_muzero( + [main_config, create_config], + seed=seed, + num_episodes_each_seed=num_episodes_each_seed, + print_seed_details=False, + model_path=model_path + ) + print(returns_mean, returns) + returns_mean_seeds.append(returns_mean) + returns_seeds.append(returns) + + # Convert the list of mean and total rewards into numpy arrays for easier statistical analysis. + returns_mean_seeds = np.array(returns_mean_seeds) + returns_seeds = np.array(returns_seeds) + + # Printing the evaluation results. The average reward and the total reward for each seed are displayed, followed by the mean reward across all seeds. + print("=" * 20) + print(f"We evaluated a total of {len(seeds)} seeds. 
For each seed, we evaluated {num_episodes_each_seed} episode(s).") + print(f"For seeds {seeds}, the mean returns are {returns_mean_seeds}, and the returns are {returns_seeds}.") + print("Across all seeds, the mean reward is:", returns_mean_seeds.mean()) + print("=" * 20) \ No newline at end of file diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index b590ecb9f..7ddf4550a 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -74,7 +74,7 @@ def __init__(self, dataset, custom_config=None): # for debug self.current_human_aoi_list = np.zeros([self.human_num, ]) - self.mean_aoi_timelist = np.ones([self.config.num_timestep + 1, ]) + self.mean_aoi_timelist = np.zeros([self.config.num_timestep + 1, ]) self.cur_data_amount_timelist = np.zeros([self.human_num, ]) self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_x_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) @@ -205,7 +205,7 @@ def step(self, action): else: human_transmit_data_list[human_id] = 1 - human.set(next_px, next_py, next_theta, aoi=1, data_amount=human.aoi) + human.set(next_px, next_py, next_theta, aoi=0, data_amount=0) num_updated_human += 1 else: # if the human is not in the range of the robot, then update the aoi of the human @@ -218,7 +218,7 @@ def step(self, action): # if the human is in the range of the robot, then part of human's data will be transmitted last_data_amount = human.data_amount human.update(next_px, next_py, next_theta, transmitted_data=self.transmit_v) - human_transmit_data_list[human_id] = min(last_data_amount, self.transmit_v) + human_transmit_data_list[human_id] = min(last_data_amount + human.collect_v, self.transmit_v) num_updated_human += 1 else: # if the human is not in the range of the robot, then no data will be transmitted, \ diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py index aabc184d2..db6c47b31 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py @@ -26,7 +26,7 @@ def act(self, state, current_timestep): class Human(): - collect_v_prob = {1: 0.3, 2: 0.6} + # collect_v_prob = {1: 1, 2: 0} def __init__(self, id, config): self.id = id self.config = config @@ -36,6 +36,7 @@ def __init__(self, id, config): self.aoi = 0 self.data_queue = InformationQueue() self.data_amount = 0 + self.collect_v_prob = getattr(self.config, 'collect_v_prob', {1: 1, 2: 0}) self.collect_v = random.choices(list(self.collect_v_prob.keys()), list(self.collect_v_prob.values()))[0] def set(self, px, py, theta, aoi, data_amount): diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py index 4aca9dd1a..bcecc89bc 100644 --- a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -19,7 +19,7 @@ class CrowdSimEnv(BaseEnv): def __init__(self, cfg: dict = {}) -> None: self._cfg = cfg self._init_flag = False - self._replay_path = None + self._replay_path = cfg.get('replay_path', None) self._robot_num = self._cfg.robot_num self._human_num = self._cfg.human_num self._observation_space = gym.spaces.Dict({ @@ -124,7 +124,9 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: if self._replay_path is not None: self._frame.append(self._env.render()) if done: - import imageio + import imageio, os + if not os.path.exists(self._replay_path): + 
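# For reference, eval_muzero presumably returns the per-seed mean return and the list of
# individual episode returns, so after the loop returns_mean_seeds has shape (len(seeds),)
# and returns_seeds has shape (len(seeds), num_episodes_each_seed). With the defaults above
# (one seed, one episode) the summary printed here reduces to a single number, e.g. with a
# made-up value:
#
#     returns_mean_seeds = np.array([12.3])
#     returns_seeds = np.array([[12.3]])
#     returns_mean_seeds.mean()  # 12.3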
os.makedirs(self._replay_path) imageio.mimsave(self._replay_path + '/replay.gif', self._frame) return BaseEnvTimestep(obs, rew, done, info) From fecf5d37a3438ffdd5f049398c0737219d49a904 Mon Sep 17 00:00:00 2001 From: nighood Date: Fri, 14 Jun 2024 15:43:26 +0800 Subject: [PATCH 15/16] feature(rjy): add entropy info/set margin --- lzero/worker/muzero_collector.py | 3 ++ .../config/crowdsim_muzero_md_config.py | 11 +++-- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 49 ++++++++++++------- zoo/CrowdSim/envs/Crowdsim/env/model/agent.py | 2 +- zoo/CrowdSim/envs/crowdsim_lightzero_env.py | 2 + .../envs/test_crowdsim_lightzero_env.py | 5 +- 6 files changed, 49 insertions(+), 23 deletions(-) diff --git a/lzero/worker/muzero_collector.py b/lzero/worker/muzero_collector.py index 3c1a630dd..7b8f67ed3 100644 --- a/lzero/worker/muzero_collector.py +++ b/lzero/worker/muzero_collector.py @@ -587,6 +587,7 @@ def collect(self, 'mean_energy_consumption': mean_energy_consumption, 'transmitted_data_ratio': transmitted_data_ratio, 'human_coverage': human_coverage, + "distribution_entropy": timestep.info['performance_info']['distribution_entropy'] } else: info = { @@ -735,6 +736,7 @@ def _output_log(self, train_iter: int) -> None: episode_transmitted_data_ratio = [d['transmitted_data_ratio'] for d in self._episode_info] episode_human_coverage = [d['human_coverage'] for d in self._episode_info] mean_transmit_data = [d['mean_transmit_data'] for d in self._episode_info] + mean_distribution_entropy = [d['distribution_entropy'] for d in self._episode_info] info = { 'episode_count': episode_count, 'envstep_count': envstep_count, @@ -755,6 +757,7 @@ def _output_log(self, train_iter: int) -> None: 'episode_mean_energy_consumption': np.mean(episode_energy_consumption), 'episode_mean_transmitted_data_ratio': np.mean(episode_transmitted_data_ratio), 'episode_mean_human_coverage': np.mean(episode_human_coverage), + 'episode_mean_distribution_entropy': np.mean(mean_distribution_entropy), } else: info = { diff --git a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py index a79eb0bbe..e80476efe 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py @@ -7,8 +7,9 @@ collector_env_num = 8 n_episode = 8 evaluator_env_num = 3 -num_simulations = 25 -update_per_collect = 100 +num_simulations = 50 +# num_simulations = 25 +update_per_collect = 250 batch_size = 256 max_env_step = int(3e5) reanalyze_ratio = 0. @@ -28,6 +29,7 @@ env=dict( env_mode = 'hard', transmit_v=transmit_v, + collect_v_prob = {'1': 1, '2': 0}, obs_mode='1-dim-array', env_name='CrowdSim-v0', dataset = 'purdue', @@ -65,6 +67,7 @@ ), cuda=True, env_type='not_board_games', + # game_segment_length=120, game_segment_length=200, update_per_collect=update_per_collect, batch_size=batch_size, @@ -72,7 +75,7 @@ lr_piecewise_constant_decay=False, learning_rate=0.003, ssl_loss_weight=2, # NOTE: default is 0. 
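# For reference, the replay branch above buffers env.render() frames during the episode and
# writes them out once at the end. A minimal standalone sketch of the same pattern (the
# directory name and frame contents are made up):

import os
import imageio
import numpy as np

replay_path = './replay_demo'
frames = [np.zeros((64, 64, 3), dtype=np.uint8) for _ in range(10)]
os.makedirs(replay_path, exist_ok=True)
imageio.mimsave(os.path.join(replay_path, 'replay.gif'), frames)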
- grad_clip_value=0.5, + grad_clip_value=10, num_simulations=num_simulations, reanalyze_ratio=reanalyze_ratio, n_episode=n_episode, @@ -91,7 +94,7 @@ type='crowdsim_lightzero', import_names=['zoo.CrowdSim.envs.crowdsim_lightzero_env'], ), - env_manager=dict(type='base'), + env_manager=dict(type='subprocess'), policy=dict( type='muzero', import_names=['lzero.policy.muzero'], diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index 7ddf4550a..b8b3bc43b 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -5,6 +5,7 @@ import gym # from shapely.geometry import Point import numpy as np +from scipy.stats import entropy # import folium # from folium.plugins import TimestampedGeoJson, AntPath @@ -51,18 +52,6 @@ def __init__(self, dataset, custom_config=None): self.upper_right = self.config.upper_right self.human_df = pd.read_csv(self.config.dataset_dir) logging.info("Finished reading {} rows".format(len(self.human_df))) - # # for temporarily processing data - # sample_list=np.random.choice(self.human_num, size=[50,], replace=False) - # sample_list=sample_list[np.argsort(sample_list)] - # print(sample_list) - # self.human_df= self.human_df[self.human_df["id"].isin(sample_list)] - # for i,human_id in enumerate(sample_list): - # mask=(self.human_df["id"]==human_id) - # self.human_df.loc[mask,"id"]=i - # self.human_df=self.human_df.sort_values(by=["id","timestamp"],ascending=[True,True]) - # print(self.human_df.head()) - # self.human_df.to_csv("50 users-5.csv",index=False) - # exit(0) self.human_df['t'] = pd.to_datetime(self.human_df['timestamp'], unit='s') # 's' stands for second self.human_df['aoi'] = -1 # 加入aoi记录aoi @@ -81,6 +70,8 @@ def __init__(self, dataset, custom_config=None): self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.update_human_timelist = np.zeros([self.config.num_timestep, ]) self.data_transmission = 0 + self.data_collection_distribution = np.zeros(self.human_num) + self.data_transmission_distribution = np.zeros(self.human_num) def set_agent(self, agent): self.agent = agent @@ -144,11 +135,15 @@ def reset(self, phase='test', test_case=None): self.robot_y_timelist[self.current_timestep, :] = self.nlat / 2 self.update_human_timelist = np.zeros([self.config.num_timestep, ]) self.data_transmission = 0 + self.data_collection_distribution = np.zeros(self.human_num) + self.data_transmission_distribution = np.zeros(self.human_num) # for visualization self.plot_states = [] self.robot_actions = [] self.rewards = [] + self.aoi_rewards = [] + self.energy_rewards = [] self.action_values = [] self.plot_states.append([[robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]]) @@ -178,7 +173,7 @@ def step(self, action): new_energy = robot.energy - consume_energy self.robot_energy_timelist[self.current_timestep + 1][robot_id] = new_energy - if is_collide is True: + if is_collide or (new_robot_px < 0 or new_robot_px > self.nlon or new_robot_py < 0 or new_robot_py > self.nlat): new_robot_position[robot_id][0] = robot.px new_robot_position[robot_id][1] = robot.py self.robot_x_timelist[self.current_timestep + 1][robot_id] = robot.px @@ -231,6 +226,8 @@ def step(self, action): self.cur_data_amount_timelist[human_id] = human.data_amount self.current_human_aoi_list[human_id] = human.aoi self.sync_human_df(human_id, self.current_timestep + 1, human.aoi, human.data_amount) + self.data_collection_distribution[human_id] += 
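# For reference, grad_clip_value is passed to torch.nn.utils.clip_grad_norm_ in the policy's
# learn step, so it bounds the global L2 norm of the gradients rather than clipping each
# element. A standalone sketch of that behaviour:
#
#     import torch
#
#     w = torch.nn.Parameter(torch.ones(3))
#     (100 * w.sum()).backward()                       # grad = [100, 100, 100], norm ~= 173.2
#     torch.nn.utils.clip_grad_norm_([w], max_norm=10)
#     print(w.grad.norm())                             # ~= 10 after rescaling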
human.collect_v + self.data_transmission_distribution[human_id] += human_transmit_data_list[human_id] self.mean_aoi_timelist[self.current_timestep + 1] = np.mean(self.current_human_aoi_list) self.update_human_timelist[self.current_timestep] = num_updated_human @@ -244,13 +241,20 @@ def step(self, action): self.total_generated_data_amount += self.generate_data_amount_per_step # TODO: need to be well-defined - reward = self.mean_aoi_timelist[self.current_timestep] - self.mean_aoi_timelist[self.current_timestep + 1] \ - - self.config.energy_factor * np.sum(current_enenrgy_consume) + aoi_reward = self.mean_aoi_timelist[self.current_timestep] - self.mean_aoi_timelist[self.current_timestep + 1] + energy_reward = np.sum(current_enenrgy_consume) + reward = aoi_reward \ + - self.config.energy_factor * energy_reward # if hasattr(self.agent.policy, 'action_values'): # self.action_values.append(self.agent.policy.action_values) self.robot_actions.append(action) self.rewards.append(reward) + self.aoi_rewards.append(aoi_reward) + self.energy_rewards.append(energy_reward) + distribution_entropy = entropy( + self.data_collection_distribution/ np.sum(self.data_collection_distribution), + self.data_transmission_distribution/np.sum(self.data_transmission_distribution) + 1e-10) self.plot_states.append([[robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]]) @@ -266,11 +270,12 @@ def step(self, action): info = { "performance_info": { "mean_aoi": self.mean_aoi_timelist[self.current_timestep], - "mean_transmit_data": delta_sum_transmit_data / self.human_num, + "mean_transmit_data": self.data_transmission / self.human_num, "mean_energy_consumption": 1.0 - ( np.mean(self.robot_energy_timelist[self.current_timestep]) / self.max_uav_energy), "transmitted_data_ratio": self.data_transmission/(self.total_generated_data_amount*0.3), - "human_coverage": np.mean(self.update_human_timelist) / self.human_num + "human_coverage": np.mean(self.update_human_timelist) / self.human_num, + "distribution_entropy": distribution_entropy # 增加交叉熵信息 }, } @@ -286,6 +291,7 @@ def render(self): map_max_y = self.config.nlat # 创建一个新的图形 fig, ax = plt.subplots(figsize=(8, 6)) + plt.subplots_adjust(right=0.75) # 给数据留白 # 绘制机器人的历史轨迹 for timestep in range(len(self.robot_x_timelist)): @@ -314,6 +320,15 @@ def render(self): ax.set_xlabel('X') ax.set_ylabel('Y') + # 在图的右上角显示reward/aoi_reward/energy_reward/mean_aoi/energy + reward_text = f"Reward: {self.rewards[-1] if self.rewards else 0:.2f}\n" \ + f"AOI Reward: {self.aoi_rewards[-1] if self.aoi_rewards else 0:.2f}\n" \ + f"Energy Reward: {self.energy_rewards[-1] if self.energy_rewards else 0:.2f}\n" \ + f"Mean AOI: {self.mean_aoi_timelist[self.current_timestep] if self.current_timestep < len(self.mean_aoi_timelist) else 0:.2f}\n" \ + f"Energy: {np.mean(self.robot_energy_timelist[self.current_timestep]) if self.current_timestep < len(self.robot_energy_timelist) else 0:.2f}" + plt.text(1.05, 0.95, reward_text, horizontalalignment='left', verticalalignment='top', + transform=ax.transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.6), + clip_on=False) # Ensure text is not clipped # 在地图之外留出一些空白区域 ax.margins(x=0.1, y=0.1) ax.set_title('Crowd Simulation Visualization') diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py index db6c47b31..03a4f5af3 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py @@ -37,7 +37,7 @@ def __init__(self, id, config): 
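# For reference, the per-human collection and transmission tallies accumulated here feed the
# distribution_entropy metric added by this patch, which is computed with scipy.stats.entropy.
# With two arguments that function normalizes both inputs to sum to 1 and returns the KL
# divergence sum(pk * log(pk / qk)); identical distributions give 0. A standalone check with
# made-up numbers:
#
#     import numpy as np
#     from scipy.stats import entropy
#
#     collected = np.array([4.0, 1.0, 1.0])     # data collected per human
#     transmitted = np.array([3.0, 2.0, 1.0])   # data transmitted per human
#     kl = entropy(collected / collected.sum(),
#                  transmitted / transmitted.sum() + 1e-10)
#     assert kl >= 0.0
#     assert abs(entropy(collected, collected)) < 1e-12  # D(p || p) == 0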
self.data_queue = InformationQueue() self.data_amount = 0 self.collect_v_prob = getattr(self.config, 'collect_v_prob', {1: 1, 2: 0}) - self.collect_v = random.choices(list(self.collect_v_prob.keys()), list(self.collect_v_prob.values()))[0] + self.collect_v = random.choices(list(map(int, self.collect_v_prob.keys())), list(self.collect_v_prob.values()))[0] def set(self, px, py, theta, aoi, data_amount): self.px = px diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py index bcecc89bc..59548dbe5 100644 --- a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -128,6 +128,8 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: if not os.path.exists(self._replay_path): os.makedirs(self._replay_path) imageio.mimsave(self._replay_path + '/replay.gif', self._frame) + # save env.human_df as csv + self._env.human_df.to_csv(self._replay_path + '/human_df.csv') return BaseEnvTimestep(obs, rew, done, info) def enable_save_replay(self, replay_path: Optional[str] = None) -> None: diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py index 09f36d6b7..9ff396da5 100644 --- a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -10,7 +10,9 @@ human_num = 59, # purdue one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]], obs_mode = '2-dim-array', - env_mode = 'easy', + env_mode = 'hard', + transmit_v=120, + collect_v_prob = {'1': 1, '2': 0}, ) @ pytest.mark.envtest @@ -72,6 +74,7 @@ def test_obs_1_dim_array(self): if timestep.done: break print(env.observation_space, env.action_space, env.reward_space) + print('episode reward:', timestep.info['eval_episode_return']) env.close() From 63d37a89086dfd8dc7813221a3eca9673cbf97b4 Mon Sep 17 00:00:00 2001 From: nighood Date: Thu, 20 Jun 2024 18:29:00 +0800 Subject: [PATCH 16/16] polish(rjy): polish code according to comments --- lzero/agent/efficientzero.py | 8 +- lzero/agent/muzero.py | 8 +- lzero/agent/sampled_efficientzero.py | 15 +- lzero/mcts/utils.py | 20 +- lzero/model/common.py | 98 ++++--- lzero/model/common_gcn.py | 75 +++--- lzero/model/muzero_model_gcn.py | 9 +- lzero/model/muzero_model_md.py | 23 +- .../model/sampled_efficientzero_model_gcn.py | 8 +- lzero/model/sampled_efficientzero_model_md.py | 8 +- lzero/model/tests/test_common_gcn.py | 6 +- lzero/model/tests/test_rgcn.py | 10 +- lzero/policy/efficientzero.py | 19 +- lzero/policy/muzero.py | 22 +- lzero/policy/sampled_efficientzero.py | 99 ++++--- lzero/worker/muzero_collector.py | 139 +++++----- lzero/worker/muzero_evaluator.py | 16 +- .../config/CrowdSim_efficientzero_config.py | 35 +-- zoo/CrowdSim/config/CrowdSim_muzero_config.py | 26 +- .../crowdsim_efficientzero_md_config.py | 47 ++-- .../config/crowdsim_muzero_md_config.py | 56 ++-- .../config/crowdsim_muzero_rgcn_config.py | 46 ++-- zoo/CrowdSim/config/crowdsim_sez_md_config.py | 47 ++-- zoo/CrowdSim/entry/eval_crowdsim.py | 14 +- zoo/CrowdSim/envs/CrowdSim_env.py | 18 +- .../envs/Crowdsim/env/base_env_config.py | 137 ---------- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 250 +++++++++++++----- .../Crowdsim/env/crowd_sim_base_config.py | 156 +++++++++++ zoo/CrowdSim/envs/Crowdsim/env/model/agent.py | 102 ++++++- zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py | 15 +- zoo/CrowdSim/envs/Crowdsim/env/model/utils.py | 173 +++++++++--- zoo/CrowdSim/envs/crowdsim_lightzero_env.py | 33 ++- 
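# For reference, the map(int, ...) in the line above guards against configs that spell the
# keys as strings (the zoo config uses collect_v_prob={'1': 1, '2': 0}); random.choices only
# needs the candidate speeds and their weights as parallel sequences:
#
#     import random
#
#     collect_v_prob = {'1': 0.7, '2': 0.3}            # made-up probabilities
#     speeds = list(map(int, collect_v_prob.keys()))   # [1, 2]
#     weights = list(collect_v_prob.values())          # [0.7, 0.3]
#     collect_v = random.choices(speeds, weights)[0]   # 1 or 2, drawn with these weights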
zoo/CrowdSim/envs/test_CrowdSim_env.py | 21 +- .../envs/test_crowdsim_lightzero_env.py | 23 +- 34 files changed, 1039 insertions(+), 743 deletions(-) delete mode 100644 zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/crowd_sim_base_config.py diff --git a/lzero/agent/efficientzero.py b/lzero/agent/efficientzero.py index f06cd844b..bd8e6ff7b 100644 --- a/lzero/agent/efficientzero.py +++ b/lzero/agent/efficientzero.py @@ -127,8 +127,8 @@ def __init__( self.env_fn, self.collector_env_cfg, self.evaluator_env_cfg = get_vec_env_setting(self.cfg.env) def train( - self, - step: int = int(1e7), + self, + step: int = int(1e7), ) -> TrainingReturn: """ Overview: @@ -359,8 +359,8 @@ def deploy( return EvalReturn(eval_value=np.mean(reward_list), eval_value_std=np.std(reward_list)) def batch_evaluate( - self, - n_evaluator_episode: int = None, + self, + n_evaluator_episode: int = None, ) -> EvalReturn: """ Overview: diff --git a/lzero/agent/muzero.py b/lzero/agent/muzero.py index 26ffc6f04..dfb691a69 100644 --- a/lzero/agent/muzero.py +++ b/lzero/agent/muzero.py @@ -130,8 +130,8 @@ def __init__( self.env_fn, self.collector_env_cfg, self.evaluator_env_cfg = get_vec_env_setting(self.cfg.env) def train( - self, - step: int = int(1e7), + self, + step: int = int(1e7), ) -> TrainingReturn: """ Overview: @@ -362,8 +362,8 @@ def deploy( return EvalReturn(eval_value=np.mean(reward_list), eval_value_std=np.std(reward_list)) def batch_evaluate( - self, - n_evaluator_episode: int = None, + self, + n_evaluator_episode: int = None, ) -> EvalReturn: """ Overview: diff --git a/lzero/agent/sampled_efficientzero.py b/lzero/agent/sampled_efficientzero.py index ece54b784..a60dae859 100644 --- a/lzero/agent/sampled_efficientzero.py +++ b/lzero/agent/sampled_efficientzero.py @@ -93,7 +93,12 @@ def __init__( cfg.main_config.exp_name = exp_name self.origin_cfg = cfg self.cfg = compile_config( - cfg.main_config, seed=seed, env=None, auto=True, policy=SampledEfficientZeroPolicy, create_cfg=cfg.create_config + cfg.main_config, + seed=seed, + env=None, + auto=True, + policy=SampledEfficientZeroPolicy, + create_cfg=cfg.create_config ) self.exp_name = self.cfg.exp_name @@ -127,8 +132,8 @@ def __init__( self.env_fn, self.collector_env_cfg, self.evaluator_env_cfg = get_vec_env_setting(self.cfg.env) def train( - self, - step: int = int(1e7), + self, + step: int = int(1e7), ) -> TrainingReturn: """ Overview: @@ -359,8 +364,8 @@ def deploy( return EvalReturn(eval_value=np.mean(reward_list), eval_value_std=np.std(reward_list)) def batch_evaluate( - self, - n_evaluator_episode: int = None, + self, + n_evaluator_episode: int = None, ) -> EvalReturn: """ Overview: diff --git a/lzero/mcts/utils.py b/lzero/mcts/utils.py index 80e3f588e..11afad53a 100644 --- a/lzero/mcts/utils.py +++ b/lzero/mcts/utils.py @@ -6,8 +6,9 @@ from graphviz import Digraph -def generate_random_actions_discrete(num_actions: int, action_space_size: int, num_of_sampled_actions: int, - reshape=False): +def generate_random_actions_discrete( + num_actions: int, action_space_size: int, num_of_sampled_actions: int, reshape=False +): """ Overview: Generate a list of random actions. @@ -19,10 +20,7 @@ def generate_random_actions_discrete(num_actions: int, action_space_size: int, n Returns: A list of random actions. 
""" - actions = [ - np.random.randint(0, action_space_size, num_of_sampled_actions).reshape(-1) - for _ in range(num_actions) - ] + actions = [np.random.randint(0, action_space_size, num_of_sampled_actions).reshape(-1) for _ in range(num_actions)] # If num_of_sampled_actions == 1, flatten the actions to a list of numbers if num_of_sampled_actions == 1: @@ -97,7 +95,9 @@ def prepare_observation(observation_list, model_type='conv'): Returns: - np.ndarray: Reshaped array of observations. """ - assert model_type in ['conv', 'mlp', 'rgcn', 'mlp_md'], "model_type must be either 'conv', 'mlp', 'rgcn' or 'mlp_md'" + assert model_type in [ + 'conv', 'mlp', 'rgcn', 'mlp_md' + ], "model_type must be either 'conv', 'mlp', 'rgcn' or 'mlp_md'" observation_array = np.array(observation_list) batch_size = observation_array.shape[0] @@ -116,7 +116,7 @@ def prepare_observation(observation_list, model_type='conv'): observation_array = observation_array.reshape(batch_size, -1) else: raise ValueError("For 'mlp' model_type, the observation must have 3 dimensions [B, S, O]") - + elif model_type == 'rgcn': if observation_array.ndim == 4: # TODO(rjy): strage process @@ -127,7 +127,9 @@ def prepare_observation(observation_list, model_type='conv'): # Flatten the last two dimensions observation_array = observation_array.reshape(batch_size, -1) else: - raise ValueError("For 'rgcn' model_type, the observation must have 3 dimensions [B, S, O] or 4 dimensions [B, S, M, O]") + raise ValueError( + "For 'rgcn' model_type, the observation must have 3 dimensions [B, S, O] or 4 dimensions [B, S, M, O]" + ) return observation_array diff --git a/lzero/model/common.py b/lzero/model/common.py index ddf4a5d59..2b798a163 100644 --- a/lzero/model/common.py +++ b/lzero/model/common.py @@ -38,10 +38,14 @@ class MZNetworkOutput: class DownSample(nn.Module): - - def __init__(self, observation_shape: SequenceType, out_channels: int, activation: nn.Module = nn.ReLU(inplace=True), - norm_type: Optional[str] = 'BN', - ) -> None: + + def __init__( + self, + observation_shape: SequenceType, + out_channels: int, + activation: nn.Module = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + ) -> None: """ Overview: Define downSample convolution network. Encode the observation into hidden state. 
@@ -74,11 +78,7 @@ def __init__(self, observation_shape: SequenceType, out_channels: int, activatio self.resblocks1 = nn.ModuleList( [ ResBlock( - in_channels=out_channels // 2, - activation=activation, - norm_type='BN', - res_type='basic', - bias=False + in_channels=out_channels // 2, activation=activation, norm_type='BN', res_type='basic', bias=False ) for _ in range(1) ] ) @@ -92,17 +92,15 @@ def __init__(self, observation_shape: SequenceType, out_channels: int, activatio ) self.resblocks2 = nn.ModuleList( [ - ResBlock( - in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False - ) for _ in range(1) + ResBlock(in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False) + for _ in range(1) ] ) self.pooling1 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) self.resblocks3 = nn.ModuleList( [ - ResBlock( - in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False - ) for _ in range(1) + ResBlock(in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False) + for _ in range(1) ] ) self.pooling2 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) @@ -176,15 +174,18 @@ def __init__( self.norm = nn.BatchNorm2d(num_channels) elif norm_type == 'LN': if downsample: - self.norm = nn.LayerNorm([num_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)]) + self.norm = nn.LayerNorm( + [num_channels, + math.ceil(observation_shape[-2] / 16), + math.ceil(observation_shape[-1] / 16)] + ) else: self.norm = nn.LayerNorm([num_channels, observation_shape[-2], observation_shape[-1]]) - + self.resblocks = nn.ModuleList( [ - ResBlock( - in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False - ) for _ in range(num_res_blocks) + ResBlock(in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False) + for _ in range(num_res_blocks) ] ) self.activation = activation @@ -225,13 +226,13 @@ def get_param_mean(self) -> float: class RepresentationNetworkMLP(nn.Module): def __init__( - self, - observation_shape: int, - hidden_channels: int = 64, - layer_num: int = 2, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - last_linear_layer_init_zero: bool = True, - norm_type: Optional[str] = 'BN', + self, + observation_shape: int, + hidden_channels: int = 64, + layer_num: int = 2, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + norm_type: Optional[str] = 'BN', ) -> torch.Tensor: """ Overview: @@ -325,26 +326,35 @@ def __init__( self.resblocks = nn.ModuleList( [ - ResBlock( - in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False - ) for _ in range(num_res_blocks) + ResBlock(in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False) + for _ in range(num_res_blocks) ] ) self.conv1x1_value = nn.Conv2d(num_channels, value_head_channels, 1) self.conv1x1_policy = nn.Conv2d(num_channels, policy_head_channels, 1) - + if norm_type == 'BN': self.norm_value = nn.BatchNorm2d(value_head_channels) self.norm_policy = nn.BatchNorm2d(policy_head_channels) elif norm_type == 'LN': if downsample: - self.norm_value = nn.LayerNorm([value_head_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)]) - self.norm_policy = nn.LayerNorm([policy_head_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)]) + 
self.norm_value = nn.LayerNorm( + [value_head_channels, + math.ceil(observation_shape[-2] / 16), + math.ceil(observation_shape[-1] / 16)] + ) + self.norm_policy = nn.LayerNorm( + [ + policy_head_channels, + math.ceil(observation_shape[-2] / 16), + math.ceil(observation_shape[-1] / 16) + ] + ) else: self.norm_value = nn.LayerNorm([value_head_channels, observation_shape[-2], observation_shape[-1]]) self.norm_policy = nn.LayerNorm([policy_head_channels, observation_shape[-2], observation_shape[-1]]) - + self.flatten_output_size_for_value_head = flatten_output_size_for_value_head self.flatten_output_size_for_policy_head = flatten_output_size_for_policy_head self.activation = activation @@ -406,16 +416,16 @@ def forward(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tenso class PredictionNetworkMLP(nn.Module): def __init__( - self, - action_space_size, - num_channels, - common_layer_num: int = 2, - fc_value_layers: SequenceType = [32], - fc_policy_layers: SequenceType = [32], - output_support_size: int = 601, - last_linear_layer_init_zero: bool = True, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - norm_type: Optional[str] = 'BN', + self, + action_space_size, + num_channels, + common_layer_num: int = 2, + fc_value_layers: SequenceType = [32], + fc_policy_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', ): """ Overview: diff --git a/lzero/model/common_gcn.py b/lzero/model/common_gcn.py index 857890c2f..73bd7a20c 100644 --- a/lzero/model/common_gcn.py +++ b/lzero/model/common_gcn.py @@ -9,29 +9,31 @@ from .utils import renormalize, get_params_mean, get_dynamic_mean, get_reward_mean + class RGCNLayer(nn.Module): """ Overview: Relational graph convolutional network layer. 
""" + def __init__( - self, - robot_num: int, - human_num: int, - robot_state_dim, - human_state_dim, - similarity_function, - num_layer = 2, - X_dim = 32, - layerwise_graph = False, - skip_connection = True, - wr_dims = [64, 32], # the last dim should equal to X_dim - wh_dims = [64, 32], # the last dim should equal to X_dim - final_state_dim = 32, # should equal to X_dim - norm_type= None, - last_linear_layer_init_zero=True, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - ): + self, + robot_num: int, + human_num: int, + robot_state_dim, + human_state_dim, + similarity_function, + num_layer=2, + X_dim=32, + layerwise_graph=False, + skip_connection=True, + wr_dims=[64, 32], # the last dim should equal to X_dim + wh_dims=[64, 32], # the last dim should equal to X_dim + final_state_dim=32, # should equal to X_dim + norm_type=None, + last_linear_layer_init_zero=True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + ): super().__init__() # design choice @@ -59,7 +61,7 @@ def __init__( activation=activation, norm_type=norm_type, last_linear_layer_init_zero=last_linear_layer_init_zero, - ) # inputs,64,32 + ) # inputs,64,32 self.w_h = MLP( in_channels=human_state_dim, hidden_channels=wh_dims[0], @@ -68,7 +70,7 @@ def __init__( activation=activation, norm_type=norm_type, last_linear_layer_init_zero=last_linear_layer_init_zero, - ) # inputs,64,32 + ) # inputs,64,32 if self.similarity_function == 'embedded_gaussian': self.w_a = nn.Parameter(torch.randn(self.X_dim, self.X_dim)) @@ -79,7 +81,7 @@ def __init__( hidden_channels=2 * X_dim, out_channels=1, layer_num=1, - ) + ) embedding_dim = self.X_dim self.Ws = torch.nn.ParameterList() @@ -148,10 +150,12 @@ def forward(self, state): stack_num = state.size(1) // ((self.robot_num + self.human_num) * self.robot_state_dim) assert stack_num == 1, "stack_num should be 1 for 1-dim-array obs" # robot_states shape:(B, stack_num*robot_num, state_dim) - robot_states = state[:, :stack_num * self.robot_num * self.robot_state_dim].reshape(-1, self.robot_num, self.robot_state_dim) + robot_states = state[:, :stack_num * self.robot_num * + self.robot_state_dim].reshape(-1, self.robot_num, self.robot_state_dim) # human_states shape:(B, stack_num*human_num, state_dim) - human_states = state[:, stack_num * self.robot_num * self.robot_state_dim:].reshape(-1, self.human_num, self.human_state_dim) - + human_states = state[:, stack_num * self.robot_num * + self.robot_state_dim:].reshape(-1, self.human_num, self.human_state_dim) + # compute feature matrix X robot_state_embedings = self.w_r(robot_states) # batch x num x embedding_dim human_state_embedings = self.w_h(human_states) @@ -180,19 +184,20 @@ def forward(self, state): return next_H + class RepresentationNetworkGCN(nn.Module): def __init__( - self, - robot_state_dim: int, - human_state_dim: int, - robot_num: int, - human_num: int, - hidden_channels: int = 64, - layer_num: int = 2, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - last_linear_layer_init_zero: bool = True, - norm_type: Optional[str] = 'BN', + self, + robot_state_dim: int, + human_state_dim: int, + robot_num: int, + human_num: int, + hidden_channels: int = 64, + layer_num: int = 2, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + norm_type: Optional[str] = 'BN', ) -> torch.Tensor: """ Overview: @@ -229,7 +234,7 @@ def __init__( num_layer=2, X_dim=hidden_channels, final_state_dim=hidden_channels, - wr_dims=[hidden_channels, hidden_channels], # TODO: check dim + 
wr_dims=[hidden_channels, hidden_channels], # TODO: check dim wh_dims=[hidden_channels, hidden_channels], layerwise_graph=False, skip_connection=True, @@ -258,4 +263,4 @@ def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor: """ gcn_embedding = self.rgcn(x) gcn_embedding = gcn_embedding.view(gcn_embedding.shape[0], -1) # (B,M,N) -> (B,M*N) - return self.fc_representation(gcn_embedding) \ No newline at end of file + return self.fc_representation(gcn_embedding) diff --git a/lzero/model/muzero_model_gcn.py b/lzero/model/muzero_model_gcn.py index 6e2c808a6..08770a880 100644 --- a/lzero/model/muzero_model_gcn.py +++ b/lzero/model/muzero_model_gcn.py @@ -106,10 +106,10 @@ def __init__( self.res_connection_in_dynamics = res_connection_in_dynamics self.representation_network = RepresentationNetworkGCN( - robot_state_dim = robot_state_dim, - human_state_dim = human_state_dim, - robot_num = robot_num, - human_num = human_num, + robot_state_dim=robot_state_dim, + human_state_dim=human_state_dim, + robot_num=robot_num, + human_num=human_num, hidden_channels=self.latent_state_dim, layer_num=2, norm_type=norm_type @@ -454,4 +454,3 @@ def get_dynamic_mean(self) -> float: def get_reward_mean(self) -> float: return get_reward_mean(self) - diff --git a/lzero/model/muzero_model_md.py b/lzero/model/muzero_model_md.py index 187509ac8..cca5f863e 100644 --- a/lzero/model/muzero_model_md.py +++ b/lzero/model/muzero_model_md.py @@ -453,21 +453,20 @@ def get_reward_mean(self) -> float: return get_reward_mean(self) - class PredictionNetworkMD(nn.Module): def __init__( - self, - agent_num: int, - single_agent_action_size, - num_channels, - common_layer_num: int = 2, - fc_value_layers: SequenceType = [32], - output_support_size: int = 601, - last_linear_layer_init_zero: bool = True, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - norm_type: Optional[str] = 'BN', - output_separate_logit: bool = False, + self, + agent_num: int, + single_agent_action_size, + num_channels, + common_layer_num: int = 2, + fc_value_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + output_separate_logit: bool = False, ): """ Overview: diff --git a/lzero/model/sampled_efficientzero_model_gcn.py b/lzero/model/sampled_efficientzero_model_gcn.py index 5747736eb..70fe8bf05 100644 --- a/lzero/model/sampled_efficientzero_model_gcn.py +++ b/lzero/model/sampled_efficientzero_model_gcn.py @@ -144,10 +144,10 @@ def __init__( self.res_connection_in_dynamics = res_connection_in_dynamics self.representation_network = RepresentationNetworkGCN( - robot_state_dim = robot_state_dim, - human_state_dim = human_state_dim, - robot_num = robot_num, - human_num = human_num, + robot_state_dim=robot_state_dim, + human_state_dim=human_state_dim, + robot_num=robot_num, + human_num=human_num, hidden_channels=self.latent_state_dim, norm_type=norm_type ) diff --git a/lzero/model/sampled_efficientzero_model_md.py b/lzero/model/sampled_efficientzero_model_md.py index 99092de4a..ec2673a3a 100644 --- a/lzero/model/sampled_efficientzero_model_md.py +++ b/lzero/model/sampled_efficientzero_model_md.py @@ -499,10 +499,10 @@ def __init__( ) else: self.policy_multi_head = MultiHead( - head_cls=DiscreteHead, - hidden_size=self.num_channels, - output_size_list=[single_agent_action_size for _ in range(agent_num)], - ) + head_cls=DiscreteHead, + hidden_size=self.num_channels, + 
output_size_list=[single_agent_action_size for _ in range(agent_num)], + ) self.output_separate_logit = output_separate_logit def forward(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: diff --git a/lzero/model/tests/test_common_gcn.py b/lzero/model/tests/test_common_gcn.py index a7574b3b8..e8ec41d1c 100644 --- a/lzero/model/tests/test_common_gcn.py +++ b/lzero/model/tests/test_common_gcn.py @@ -5,6 +5,7 @@ # ... + class TestLightZeroEnvWrapper: # ... @@ -55,7 +56,7 @@ def test_representation_network_gcn_with_dict_obs(self): def test_representation_network_gcn_with_2d_array_obs(self): robot_state_dim = 10 - human_state_dim = 10 # 2d_array_obs, so the dimensions must be the same + human_state_dim = 10 # 2d_array_obs, so the dimensions must be the same robot_num = 3 human_num = 2 hidden_channels = 64 @@ -95,8 +96,9 @@ def test_representation_network_gcn_with_2d_array_obs(self): # Check intermediate type assert isinstance(representation_network.rgcn(x), torch.Tensor) + if __name__ == '__main__': test = TestLightZeroEnvWrapper() test.test_representation_network_gcn_with_dict_obs() test.test_representation_network_gcn_with_2d_array_obs() - print("All tests passed.") \ No newline at end of file + print("All tests passed.") diff --git a/lzero/model/tests/test_rgcn.py b/lzero/model/tests/test_rgcn.py index 32b90b6bb..836f36b2e 100644 --- a/lzero/model/tests/test_rgcn.py +++ b/lzero/model/tests/test_rgcn.py @@ -5,7 +5,9 @@ import unittest from lzero.model.common_gcn import RGCNLayer + class TestRGCNLayer(unittest.TestCase): + def setUp(self): self.robot_state_dim = 10 self.human_state_dim = 10 @@ -42,9 +44,13 @@ def test_similarity_function(self): if self.similarity_function == 'embedded_gaussian': X = torch.randn(self.batch_size, self.num_nodes * 2, 32) A = self.rgcn_layer.compute_similarity_matrix(X) - self.assertEqual(A.shape, (self.batch_size, self.num_nodes * 2, self.num_nodes * 2), "Similarity matrix shape is incorrect.") + self.assertEqual( + A.shape, (self.batch_size, self.num_nodes * 2, self.num_nodes * 2), + "Similarity matrix shape is incorrect." 
+ ) self.assertTrue(torch.all(A >= 0) and torch.all(A <= 1), "Similarity matrix values should be normalized.") + # Running the tests if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/lzero/policy/efficientzero.py b/lzero/policy/efficientzero.py index b9e844cbe..7c0e50c42 100644 --- a/lzero/policy/efficientzero.py +++ b/lzero/policy/efficientzero.py @@ -370,7 +370,9 @@ def _forward_learn(self, data: torch.Tensor) -> Dict[str, Union[float, int]]: target_normalized_visit_count_masked = torch.index_select( target_normalized_visit_count_init_step, 0, non_masked_indices ) - target_policy_entropy = -((target_normalized_visit_count_masked+1e-6) * (target_normalized_visit_count_masked+1e-6).log()).sum(-1).mean() + target_policy_entropy = -( + (target_normalized_visit_count_masked + 1e-6) * (target_normalized_visit_count_masked + 1e-6).log() + ).sum(-1).mean() else: # Set target_policy_entropy to log(|A|) if all rows are masked target_policy_entropy = torch.log(torch.tensor(target_normalized_visit_count_init_step.shape[-1])) @@ -437,7 +439,9 @@ def _forward_learn(self, data: torch.Tensor) -> Dict[str, Union[float, int]]: target_normalized_visit_count_masked = torch.index_select( target_normalized_visit_count, 0, non_masked_indices ) - target_policy_entropy += -((target_normalized_visit_count_masked+1e-6) * (target_normalized_visit_count_masked+1e-6).log()).sum(-1).mean() + target_policy_entropy += -( + (target_normalized_visit_count_masked + 1e-6) * (target_normalized_visit_count_masked + 1e-6).log() + ).sum(-1).mean() else: # Set target_policy_entropy to log(|A|) if all rows are masked target_policy_entropy += torch.log(torch.tensor(target_normalized_visit_count.shape[-1])) @@ -578,8 +582,7 @@ def _forward_collect( pred_values = self.inverse_scalar_transform_handle(pred_values).detach().cpu().numpy() latent_state_roots = latent_state_roots.detach().cpu().numpy() reward_hidden_state_roots = ( - reward_hidden_state_roots[0].detach().cpu().numpy(), - reward_hidden_state_roots[1].detach().cpu().numpy() + reward_hidden_state_roots[0].detach().cpu().numpy(), reward_hidden_state_roots[1].detach().cpu().numpy() ) policy_logits = policy_logits.detach().cpu().numpy().tolist() @@ -649,7 +652,13 @@ def _init_eval(self) -> None: else: self._mcts_eval = MCTSPtree(self._cfg) - def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: -1, ready_env_id: np.array = None,): + def _forward_eval( + self, + data: torch.Tensor, + action_mask: list, + to_play: -1, + ready_env_id: np.array = None, + ): """ Overview: The forward function for evaluating the current policy in eval mode. Use model to execute MCTS search. diff --git a/lzero/policy/muzero.py b/lzero/policy/muzero.py index d4e49c30f..d5597db57 100644 --- a/lzero/policy/muzero.py +++ b/lzero/policy/muzero.py @@ -445,9 +445,9 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in # ============================================================== # weighted loss with masks (some invalid states which are out of trajectory.) 
loss = ( - self._cfg.ssl_loss_weight * consistency_loss + self._cfg.policy_loss_weight * policy_loss + - self._cfg.value_loss_weight * value_loss + self._cfg.reward_loss_weight * reward_loss + - self._cfg.policy_entropy_loss_weight * policy_entropy_loss + self._cfg.ssl_loss_weight * consistency_loss + self._cfg.policy_loss_weight * policy_loss + + self._cfg.value_loss_weight * value_loss + self._cfg.reward_loss_weight * reward_loss + + self._cfg.policy_entropy_loss_weight * policy_entropy_loss ) weighted_total_loss = (weights * loss).mean() @@ -457,8 +457,9 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in weighted_total_loss.backward() if self._cfg.multi_gpu: self.sync_gradients(self._learn_model) - total_grad_norm_before_clip = torch.nn.utils.clip_grad_norm_(self._learn_model.parameters(), - self._cfg.grad_clip_value) + total_grad_norm_before_clip = torch.nn.utils.clip_grad_norm_( + self._learn_model.parameters(), self._cfg.grad_clip_value + ) self._optimizer.step() if self._cfg.lr_piecewise_constant_decay: self.lr_scheduler.step() @@ -481,7 +482,7 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in 'weighted_total_loss': weighted_total_loss.item(), 'total_loss': loss.mean().item(), 'policy_loss': policy_loss.mean().item(), - 'policy_entropy': - policy_entropy_loss.mean().item() / (self._cfg.num_unroll_steps + 1), + 'policy_entropy': -policy_entropy_loss.mean().item() / (self._cfg.num_unroll_steps + 1), 'reward_loss': reward_loss.mean().item(), 'value_loss': value_loss.mean().item(), 'consistency_loss': consistency_loss.mean().item() / self._cfg.num_unroll_steps, @@ -653,8 +654,13 @@ def _get_target_obs_index_in_step_k(self, step): end_index = self._cfg.model.observation_shape * (step + self._cfg.model.frame_stack_num) return beg_index, end_index - def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1, - ready_env_id: np.array = None, ) -> Dict: + def _forward_eval( + self, + data: torch.Tensor, + action_mask: list, + to_play: int = -1, + ready_env_id: np.array = None, + ) -> Dict: """ Overview: The forward function for evaluating the current policy in eval mode. Use model to execute MCTS search. diff --git a/lzero/policy/sampled_efficientzero.py b/lzero/policy/sampled_efficientzero.py index 687fe631b..761eaa96b 100644 --- a/lzero/policy/sampled_efficientzero.py +++ b/lzero/policy/sampled_efficientzero.py @@ -500,9 +500,9 @@ def _forward_learn(self, data: torch.Tensor) -> Dict[str, Union[float, int]]: # ============================================================== # weighted loss with masks (some invalid states which are out of trajectory.) 
loss = ( - self._cfg.ssl_loss_weight * consistency_loss + self._cfg.policy_loss_weight * policy_loss + - self._cfg.value_loss_weight * value_loss + self._cfg.reward_loss_weight * value_prefix_loss + - self._cfg.policy_entropy_loss_weight * policy_entropy_loss + self._cfg.ssl_loss_weight * consistency_loss + self._cfg.policy_loss_weight * policy_loss + + self._cfg.value_loss_weight * value_loss + self._cfg.reward_loss_weight * value_prefix_loss + + self._cfg.policy_entropy_loss_weight * policy_entropy_loss ) weighted_total_loss = (weights * loss).mean() @@ -554,33 +554,37 @@ def _forward_learn(self, data: torch.Tensor) -> Dict[str, Union[float, int]]: } if self._cfg.model.continuous_action_space: - return_data.update({ - # ============================================================== - # sampled related core code - # ============================================================== - 'policy_mu_max': mu[:, 0].max().item(), - 'policy_mu_min': mu[:, 0].min().item(), - 'policy_mu_mean': mu[:, 0].mean().item(), - 'policy_sigma_max': sigma.max().item(), - 'policy_sigma_min': sigma.min().item(), - 'policy_sigma_mean': sigma.mean().item(), - # take the fist dim in action space - 'target_sampled_actions_max': target_sampled_actions[:, :, 0].max().item(), - 'target_sampled_actions_min': target_sampled_actions[:, :, 0].min().item(), - 'target_sampled_actions_mean': target_sampled_actions[:, :, 0].mean().item(), - 'total_grad_norm_before_clip': total_grad_norm_before_clip.item() - }) + return_data.update( + { + # ============================================================== + # sampled related core code + # ============================================================== + 'policy_mu_max': mu[:, 0].max().item(), + 'policy_mu_min': mu[:, 0].min().item(), + 'policy_mu_mean': mu[:, 0].mean().item(), + 'policy_sigma_max': sigma.max().item(), + 'policy_sigma_min': sigma.min().item(), + 'policy_sigma_mean': sigma.mean().item(), + # take the fist dim in action space + 'target_sampled_actions_max': target_sampled_actions[:, :, 0].max().item(), + 'target_sampled_actions_min': target_sampled_actions[:, :, 0].min().item(), + 'target_sampled_actions_mean': target_sampled_actions[:, :, 0].mean().item(), + 'total_grad_norm_before_clip': total_grad_norm_before_clip.item() + } + ) else: - return_data.update({ - # ============================================================== - # sampled related core code - # ============================================================== - # take the fist dim in action space - 'target_sampled_actions_max': target_sampled_actions[:, :].float().max().item(), - 'target_sampled_actions_min': target_sampled_actions[:, :].float().min().item(), - 'target_sampled_actions_mean': target_sampled_actions[:, :].float().mean().item(), - 'total_grad_norm_before_clip': total_grad_norm_before_clip.item() - }) + return_data.update( + { + # ============================================================== + # sampled related core code + # ============================================================== + # take the fist dim in action space + 'target_sampled_actions_max': target_sampled_actions[:, :].float().max().item(), + 'target_sampled_actions_min': target_sampled_actions[:, :].float().min().item(), + 'target_sampled_actions_mean': target_sampled_actions[:, :].float().mean().item(), + 'total_grad_norm_before_clip': total_grad_norm_before_clip.item() + } + ) return return_data @@ -681,9 +685,9 @@ def _calculate_policy_loss_cont( if self._cfg.policy_loss_type == 'KL': # KL divergence loss: sum( p* log(p/q) ) = 
sum( p*log(p) - p*log(q) )= sum( p*log(p)) - sum( p*log(q) ) policy_loss += ( - torch.exp(target_log_prob_sampled_actions.detach()) * - (target_log_prob_sampled_actions.detach() - log_prob_sampled_actions) - ).sum(-1) * mask_batch[:, unroll_step] + torch.exp(target_log_prob_sampled_actions.detach()) * + (target_log_prob_sampled_actions.detach() - log_prob_sampled_actions) + ).sum(-1) * mask_batch[:, unroll_step] elif self._cfg.policy_loss_type == 'cross_entropy': # cross_entropy loss: - sum(p * log (q) ) policy_loss += -torch.sum( @@ -724,8 +728,9 @@ def _calculate_policy_loss_disc( torch.nonzero(mask_batch[:, unroll_step]).squeeze(-1) ) - target_policy_entropy = -((target_normalized_visit_count_masked + 1e-6) * ( - target_normalized_visit_count_masked + 1e-6).log()).sum(-1).mean() + target_policy_entropy = -( + (target_normalized_visit_count_masked + 1e-6) * (target_normalized_visit_count_masked + 1e-6).log() + ).sum(-1).mean() # shape: (batch_size, num_unroll_steps, num_of_sampled_actions, action_dim) -> (batch_size, # num_of_sampled_actions, action_dim) e.g. (4, 6, 20, 2) -> (4, 20, 2) @@ -769,9 +774,9 @@ def _calculate_policy_loss_disc( if self._cfg.policy_loss_type == 'KL': # KL divergence loss: sum( p* log(p/q) ) = sum( p*log(p) - p*log(q) )= sum( p*log(p)) - sum( p*log(q) ) policy_loss += ( - torch.exp(target_log_prob_sampled_actions.detach()) * - (target_log_prob_sampled_actions.detach() - log_prob_sampled_actions) - ).sum(-1) * mask_batch[:, unroll_step] + torch.exp(target_log_prob_sampled_actions.detach()) * + (target_log_prob_sampled_actions.detach() - log_prob_sampled_actions) + ).sum(-1) * mask_batch[:, unroll_step] elif self._cfg.policy_loss_type == 'cross_entropy': # cross_entropy loss: - sum(p * log (q) ) policy_loss += -torch.sum( @@ -793,8 +798,13 @@ def _init_collect(self) -> None: self._collect_mcts_temperature = 1 def _forward_collect( - self, data: torch.Tensor, action_mask: list = None, temperature: np.ndarray = 1, to_play=-1, - epsilon: float = 0.25, ready_env_id: np.array = None, + self, + data: torch.Tensor, + action_mask: list = None, + temperature: np.ndarray = 1, + to_play=-1, + epsilon: float = 0.25, + ready_env_id: np.array = None, ): """ Overview: @@ -832,8 +842,7 @@ def _forward_collect( pred_values = self.inverse_scalar_transform_handle(pred_values).detach().cpu().numpy() latent_state_roots = latent_state_roots.detach().cpu().numpy() reward_hidden_state_roots = ( - reward_hidden_state_roots[0].detach().cpu().numpy(), - reward_hidden_state_roots[1].detach().cpu().numpy() + reward_hidden_state_roots[0].detach().cpu().numpy(), reward_hidden_state_roots[1].detach().cpu().numpy() ) policy_logits = policy_logits.detach().cpu().numpy().tolist() @@ -933,7 +942,13 @@ def _init_eval(self) -> None: else: self._mcts_eval = MCTSPtree(self._cfg) - def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: -1, ready_env_id: np.array = None,): + def _forward_eval( + self, + data: torch.Tensor, + action_mask: list, + to_play: -1, + ready_env_id: np.array = None, + ): """ Overview: The forward function for evaluating the current policy in eval mode. Use model to execute MCTS search. diff --git a/lzero/worker/muzero_collector.py b/lzero/worker/muzero_collector.py index 3c1a630dd..272c2cec5 100644 --- a/lzero/worker/muzero_collector.py +++ b/lzero/worker/muzero_collector.py @@ -210,24 +210,23 @@ def _compute_priorities(self, i: int, pred_values_lst: List[float], search_value if self.policy_config.use_priority: # Calculate priorities. 
The priorities are the L1 losses between the predicted # values and the search values. We use 'none' as the reduction parameter, which - # means the loss is calculated for each element individually, instead of being summed or averaged. + # means the loss is calculated for each element individually, instead of being summed or averaged. # A small constant (1e-6) is added to the results to avoid zero priorities. This # is done because zero priorities could potentially cause issues in some scenarios. pred_values = torch.from_numpy(np.array(pred_values_lst[i])).to(self.policy_config.device).float().view(-1) search_values = torch.from_numpy(np.array(search_values_lst[i])).to(self.policy_config.device ).float().view(-1) - priorities = L1Loss(reduction='none' - )(pred_values, - search_values).detach().cpu().numpy() + 1e-6 + priorities = L1Loss(reduction='none')(pred_values, search_values).detach().cpu().numpy() + 1e-6 else: # priorities is None -> use the max priority for all newly collected data priorities = None return priorities - def pad_and_save_last_trajectory(self, i: int, last_game_segments: List[GameSegment], - last_game_priorities: List[np.ndarray], - game_segments: List[GameSegment], done: np.ndarray) -> None: + def pad_and_save_last_trajectory( + self, i: int, last_game_segments: List[GameSegment], last_game_priorities: List[np.ndarray], + game_segments: List[GameSegment], done: np.ndarray + ) -> None: """ Overview: Save the game segment to the pool if the current game is finished, padding it if necessary. @@ -270,12 +269,18 @@ def pad_and_save_last_trajectory(self, i: int, last_game_segments: List[GameSegm # pad over and save if self.policy_config.gumbel_algo: - last_game_segments[i].pad_over(pad_obs_lst, pad_reward_lst, pad_root_values_lst, pad_child_visits_lst, - next_segment_improved_policy=pad_improved_policy_prob) + last_game_segments[i].pad_over( + pad_obs_lst, + pad_reward_lst, + pad_root_values_lst, + pad_child_visits_lst, + next_segment_improved_policy=pad_improved_policy_prob + ) else: if self.policy_config.use_ture_chance_label_in_chance_encoder: - last_game_segments[i].pad_over(pad_obs_lst, pad_reward_lst, pad_root_values_lst, pad_child_visits_lst, - next_chances=chance_lst) + last_game_segments[i].pad_over( + pad_obs_lst, pad_reward_lst, pad_root_values_lst, pad_child_visits_lst, next_chances=chance_lst + ) else: last_game_segments[i].pad_over(pad_obs_lst, pad_reward_lst, pad_root_values_lst, pad_child_visits_lst) """ @@ -437,10 +442,7 @@ def collect(self, if self.policy_config.gumbel_algo: improved_policy_dict_no_env_id = {k: v['improved_policy_probs'] for k, v in policy_output.items()} - completed_value_no_env_id = { - k: v['roots_completed_value'] - for k, v in policy_output.items() - } + completed_value_no_env_id = {k: v['roots_completed_value'] for k, v in policy_output.items()} # TODO(pu): subprocess actions = {} distributions_dict = {} @@ -488,8 +490,11 @@ def collect(self, distributions_dict[env_id], value_dict[env_id], root_sampled_actions_dict[env_id] ) elif self.policy_config.gumbel_algo: - game_segments[env_id].store_search_stats(distributions_dict[env_id], value_dict[env_id], - improved_policy=improved_policy_dict[env_id]) + game_segments[env_id].store_search_stats( + distributions_dict[env_id], + value_dict[env_id], + improved_policy=improved_policy_dict[env_id] + ) else: game_segments[env_id].store_search_stats(distributions_dict[env_id], value_dict[env_id]) # append a transition tuple, including a_t, o_{t+1}, r_{t}, action_mask_{t}, to_play_{t} @@ -571,30 
+576,28 @@ def collect(self, self._env_info[env_id]['time'] += self._timer.value + interaction_duration if timestep.done: reward = timestep.info['eval_episode_return'] + info = { + 'reward': reward, + 'time': self._env_info[env_id]['time'], + 'step': self._env_info[env_id]['step'], + 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], + } if timestep.info.get('performance_info') is not None: + # this branch is for the performance evaluation of crowdsim env mean_aoi = timestep.info['performance_info']['mean_aoi'] mean_transmit_data = timestep.info['performance_info']['mean_transmit_data'] mean_energy_consumption = timestep.info['performance_info']['mean_energy_consumption'] transmitted_data_ratio = timestep.info['performance_info']['transmitted_data_ratio'] human_coverage = timestep.info['performance_info']['human_coverage'] - info = { - 'reward': reward, - 'time': self._env_info[env_id]['time'], - 'step': self._env_info[env_id]['step'], - 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], - 'mean_aoi': mean_aoi, - 'mean_transmit_data': mean_transmit_data, - 'mean_energy_consumption': mean_energy_consumption, - 'transmitted_data_ratio': transmitted_data_ratio, - 'human_coverage': human_coverage, - } - else: - info = { - 'reward': reward, - 'time': self._env_info[env_id]['time'], - 'step': self._env_info[env_id]['step'], - 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], - } + info.update( + { + 'mean_aoi': mean_aoi, + 'mean_transmit_data': mean_transmit_data, + 'mean_energy_consumption': mean_energy_consumption, + 'transmitted_data_ratio': transmitted_data_ratio, + 'human_coverage': human_coverage, + } + ) if self.policy_config.gumbel_algo: info['completed_value'] = completed_value_lst[env_id] / eps_steps_lst[env_id] collected_episode += 1 @@ -729,51 +732,39 @@ def _output_log(self, train_iter: int) -> None: if self.policy_config.gumbel_algo: completed_value = [d['completed_value'] for d in self._episode_info] self._total_duration += duration + info = { + 'episode_count': episode_count, + 'envstep_count': envstep_count, + 'avg_envstep_per_episode': envstep_count / episode_count, + 'avg_envstep_per_sec': envstep_count / duration, + 'avg_episode_per_sec': episode_count / duration, + 'collect_time': duration, + 'reward_mean': np.mean(episode_reward), + 'reward_std': np.std(episode_reward), + 'reward_max': np.max(episode_reward), + 'reward_min': np.min(episode_reward), + 'total_envstep_count': self._total_envstep_count, + 'total_episode_count': self._total_episode_count, + 'total_duration': self._total_duration, + 'visit_entropy': np.mean(visit_entropy), + # 'each_reward': episode_reward, + } if self._episode_info[0].get('mean_aoi') is not None: + # this branch is for the performance evaluation of crowdsim env episode_aoi = [d['mean_aoi'] for d in self._episode_info] episode_energy_consumption = [d['mean_energy_consumption'] for d in self._episode_info] episode_transmitted_data_ratio = [d['transmitted_data_ratio'] for d in self._episode_info] episode_human_coverage = [d['human_coverage'] for d in self._episode_info] mean_transmit_data = [d['mean_transmit_data'] for d in self._episode_info] - info = { - 'episode_count': episode_count, - 'envstep_count': envstep_count, - 'avg_envstep_per_episode': envstep_count / episode_count, - 'avg_envstep_per_sec': envstep_count / duration, - 'avg_episode_per_sec': episode_count / duration, - 'collect_time': duration, - 'reward_mean': np.mean(episode_reward), - 'reward_std': np.std(episode_reward), - 
'reward_max': np.max(episode_reward), - 'reward_min': np.min(episode_reward), - 'total_envstep_count': self._total_envstep_count, - 'total_episode_count': self._total_episode_count, - 'total_duration': self._total_duration, - 'visit_entropy': np.mean(visit_entropy), - 'episode_mean_aoi': np.mean(episode_aoi), - 'episode_mean_transmit_data': np.mean(mean_transmit_data), - 'episode_mean_energy_consumption': np.mean(episode_energy_consumption), - 'episode_mean_transmitted_data_ratio': np.mean(episode_transmitted_data_ratio), - 'episode_mean_human_coverage': np.mean(episode_human_coverage), - } - else: - info = { - 'episode_count': episode_count, - 'envstep_count': envstep_count, - 'avg_envstep_per_episode': envstep_count / episode_count, - 'avg_envstep_per_sec': envstep_count / duration, - 'avg_episode_per_sec': episode_count / duration, - 'collect_time': duration, - 'reward_mean': np.mean(episode_reward), - 'reward_std': np.std(episode_reward), - 'reward_max': np.max(episode_reward), - 'reward_min': np.min(episode_reward), - 'total_envstep_count': self._total_envstep_count, - 'total_episode_count': self._total_episode_count, - 'total_duration': self._total_duration, - 'visit_entropy': np.mean(visit_entropy), - # 'each_reward': episode_reward, - } + info.update( + { + 'episode_mean_aoi': np.mean(episode_aoi), + 'episode_mean_transmit_data': np.mean(mean_transmit_data), + 'episode_mean_energy_consumption': np.mean(episode_energy_consumption), + 'episode_mean_transmitted_data_ratio': np.mean(episode_transmitted_data_ratio), + 'episode_mean_human_coverage': np.mean(episode_human_coverage), + } + ) if self.policy_config.gumbel_algo: info['completed_value'] = np.mean(completed_value) self._episode_info.clear() diff --git a/lzero/worker/muzero_evaluator.py b/lzero/worker/muzero_evaluator.py index d96cf90f7..983191f0c 100644 --- a/lzero/worker/muzero_evaluator.py +++ b/lzero/worker/muzero_evaluator.py @@ -235,8 +235,9 @@ def eval( time.sleep(retry_waiting_time) self._logger.info('=' * 10 + 'Wait for all environments (subprocess) to finish resetting.' + '=' * 10) self._logger.info( - 'After sleeping {}s, the current _env_states is {}'.format(retry_waiting_time, - self._env._env_states) + 'After sleeping {}s, the current _env_states is {}'.format( + retry_waiting_time, self._env._env_states + ) ) init_obs = self._env.ready_obs @@ -342,8 +343,9 @@ def eval( # Env reset is done by env_manager automatically. self._policy.reset([env_id]) reward = t.info['eval_episode_return'] - saved_info = {'eval_episode_return': t.info['eval_episode_return']} + saved_info = {'eval_episode_return': t.info['eval_episode_return']} if 'performance_info' in t.info: + # this branch is for crowdsim env saved_info.update(t.info['performance_info']) if 'episode_info' in t.info: saved_info.update(t.info['episode_info']) @@ -369,7 +371,8 @@ def eval( ) time.sleep(retry_waiting_time) self._logger.info( - '=' * 10 + 'Wait for all environments (subprocess) to finish resetting.' + '=' * 10 + '=' * 10 + 'Wait for all environments (subprocess) to finish resetting.' + + '=' * 10 ) self._logger.info( 'After sleeping {}s, the current _env_states is {}'.format( @@ -442,9 +445,8 @@ def eval( stop_flag = episode_return >= self._stop_value and train_iter > 0 if stop_flag: self._logger.info( - "[LightZero serial pipeline] " + - "Current episode_return: {} is greater than stop_value: {}".format(episode_return, - self._stop_value) + + "[LightZero serial pipeline] " + "Current episode_return: {} is greater than stop_value: {}". 
+ format(episode_return, self._stop_value) + ", so your MCTS/RL agent is converged, you can refer to 'log/evaluator/evaluator_logger.txt' for details." ) diff --git a/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py index 0252c64ee..fc81f2c46 100644 --- a/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py +++ b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '2' # ============================================================== # begin of the most frequently changed config specified by the user @@ -14,9 +13,10 @@ max_env_step = int(3e5) reanalyze_ratio = 0. robot_num = 2 -human_num = 10 # purdue -# human_num = 33 # NCSU -# human_num = 92 # KAIST +# different human_num for different datasets +human_num = 10 # purdue dataset +# human_num = 33 # NCSU dataset +# human_num = 92 # KAIST dataset one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] # ============================================================== # end of the most frequently changed config specified by the user @@ -27,9 +27,9 @@ f'result/crowd_num_human/CrowdSim_efficientzero_step{max_env_step}_uav{robot_num}_human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', env=dict( env_name='CrowdSim-v0', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -39,14 +39,14 @@ ), policy=dict( model=dict( - observation_shape=(robot_num+human_num)*4, - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='mlp', + observation_shape=(robot_num + human_num) * 4, + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='mlp', lstm_hidden_size=256, latent_state_dim=256, discrete_action_encoding_type='one_hot', # res_connection_in_dynamics=True, - norm_type='BN', + norm_type='BN', ), cuda=True, env_type='not_board_games', @@ -88,16 +88,5 @@ create_config = CrowdSim_efficientzero_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. 
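# A minimal, self-contained sketch of the per-transition priority computation that the
# muzero_collector changes earlier in this patch preserve: element-wise L1 distance between
# predicted and searched values, plus a small constant so no priority is exactly zero. The value
# arrays below are placeholders; only the L1Loss(reduction='none') call and the 1e-6 offset
# mirror the collector code.
import numpy as np
import torch
from torch.nn import L1Loss

pred_values = torch.from_numpy(np.array([0.10, 0.50, 0.30])).float().view(-1)    # stand-in for pred_values_lst[i]
search_values = torch.from_numpy(np.array([0.20, 0.40, 0.30])).float().view(-1)  # stand-in for search_values_lst[i]

# 'none' keeps one loss value per transition instead of reducing them to a scalar.
priorities = L1Loss(reduction='none')(pred_values, search_values).detach().cpu().numpy() + 1e-6
print(priorities)  # one strictly positive priority per transition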
- """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/CrowdSim_muzero_config.py b/zoo/CrowdSim/config/CrowdSim_muzero_config.py index 8def49e3e..3705d2850 100644 --- a/zoo/CrowdSim/config/CrowdSim_muzero_config.py +++ b/zoo/CrowdSim/config/CrowdSim_muzero_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '2' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -26,9 +25,9 @@ f'result/crowd_num_human/CrowdSim_muzero_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', env=dict( env_name='CrowdSim-v0', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -38,9 +37,9 @@ ), policy=dict( model=dict( - observation_shape=(robot_num+human_num)*4, - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='mlp', + observation_shape=(robot_num + human_num) * 4, + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='mlp', lstm_hidden_size=256, latent_state_dim=256, self_supervised_learning_loss=True, # NOTE: default is False. @@ -90,16 +89,5 @@ create_config = CrowdSim_muzero_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. - """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py index ff2a0913c..85a799a83 100644 --- a/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '1' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -13,9 +12,10 @@ max_env_step = int(3e5) reanalyze_ratio = 0. 
robot_num = 2 -human_num = 59 # purdue -# human_num = 33 # NCSU -# human_num = 92 # KAIST +# different human_num for different datasets +human_num = 59 # purdue dataset +# human_num = 33 # NCSU dataset +# human_num = 92 # KAIST dataset one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] transmit_v = 20 # ============================================================== @@ -30,10 +30,10 @@ transmit_v=transmit_v, obs_mode='1-dim-array', env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + dataset='purdue', + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -43,17 +43,17 @@ ), policy=dict( model=dict( - agent_num = robot_num, - observation_shape=(robot_num + human_num)*4, + agent_num=robot_num, + observation_shape=(robot_num + human_num) * 4, obs_mode='1-dim-array', - robot_state_dim = 4, - human_state_dim = 4, - robot_num = robot_num, - human_num = human_num, + robot_state_dim=4, + human_state_dim=4, + robot_num=robot_num, + human_num=human_num, single_agent_action_size=len(one_uav_action_space), - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='mlp_md', - output_separate_logit=False, # not output separate logit for each action. + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. lstm_hidden_size=128, latent_state_dim=128, discrete_action_encoding_type='one_hot', @@ -97,16 +97,5 @@ create_config = CrowdSim_efficientzero_md_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. - """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py index a79eb0bbe..28c40d9cd 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py @@ -1,6 +1,6 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '1' +os.environ["CUDA_VISIBLE_DEVICES"] = '2' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -10,30 +10,31 @@ num_simulations = 25 update_per_collect = 100 batch_size = 256 -max_env_step = int(3e5) +max_env_step = int(5e5) reanalyze_ratio = 0. 
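# The flat Discrete(action_space_size) action used by these configs is decoded back into one
# (dx, dy) move per UAV by enumerating itertools.product over the per-UAV action list, mirroring
# the real_action_space construction in the CrowdSim env wrapper later in this patch. A minimal
# sketch (the sampled index is arbitrary):
from itertools import product

robot_num = 2
one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]]

real_action_space = list(product(one_uav_action_space, repeat=robot_num))  # 25 joint actions
flat_action = 7                                                            # an index from Discrete(25)
per_uav_moves = real_action_space[flat_action]
print(per_uav_moves)  # ([30, 0], [-30, 0]): UAV 0 moves +30 in x, UAV 1 moves -30 in x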
robot_num = 2 -human_num = 59 # purdue -# human_num = 33 # NCSU -# human_num = 92 # KAIST +# different human_num for different datasets +human_num = 59 # purdue dataset +# human_num = 33 # NCSU dataset +# human_num = 92 # KAIST dataset one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] -transmit_v = 20 +transmit_v = 120 # ============================================================== # end of the most frequently changed config specified by the user # ============================================================== CrowdSim_muzero_config = dict( exp_name= - f'result/new_env/new_CrowdSim_vt{transmit_v}_muzero_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + f'result/new_env2_hard/new_CrowdSim2_hard_womd_vc1_vt{transmit_v}_muzero_md_ssl_step{max_env_step}_uav{robot_num}_human{human_num}_ns{num_simulations}_upc{update_per_collect}_seed0', env=dict( - env_mode = 'hard', + env_mode='hard', transmit_v=transmit_v, obs_mode='1-dim-array', env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + dataset='purdue', + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -45,17 +46,17 @@ model=dict( # robot_observation_shape=(robot_num, 4), # human_observation_shape=(human_num, 4), - agent_num = robot_num, - observation_shape=(robot_num + human_num)*4, + agent_num=robot_num, + observation_shape=(robot_num + human_num) * 4, obs_mode='1-dim-array', - robot_state_dim = 4, - human_state_dim = 4, - robot_num = robot_num, - human_num = human_num, + robot_state_dim=4, + human_state_dim=4, + robot_num=robot_num, + human_num=human_num, single_agent_action_size=len(one_uav_action_space), - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='mlp_md', - output_separate_logit=False, # not output separate logit for each action. + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. lstm_hidden_size=256, latent_state_dim=256, self_supervised_learning_loss=True, # NOTE: default is False. @@ -105,16 +106,5 @@ create_config = CrowdSim_muzero_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. 
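# The muzero_collector changes earlier in this patch log extra CrowdSim metrics by building one
# base info dict and conditionally merging in the env-reported performance_info. The aggregation
# step in _output_log can be sketched in isolation as follows (the per-episode numbers are
# placeholders; the key names follow the collector code):
import numpy as np

episode_info = [  # stand-in for self._episode_info after two finished episodes
    {'reward': 1.2, 'mean_aoi': 12.4, 'mean_transmit_data': 0.9, 'mean_energy_consumption': 0.31,
     'transmitted_data_ratio': 0.62, 'human_coverage': 0.45},
    {'reward': 0.8, 'mean_aoi': 14.1, 'mean_transmit_data': 0.7, 'mean_energy_consumption': 0.28,
     'transmitted_data_ratio': 0.57, 'human_coverage': 0.41},
]

info = {'reward_mean': np.mean([d['reward'] for d in episode_info])}
if episode_info[0].get('mean_aoi') is not None:  # CrowdSim metrics only exist for this env
    info.update({
        'episode_mean_aoi': np.mean([d['mean_aoi'] for d in episode_info]),
        'episode_mean_transmit_data': np.mean([d['mean_transmit_data'] for d in episode_info]),
        'episode_mean_energy_consumption': np.mean([d['mean_energy_consumption'] for d in episode_info]),
        'episode_mean_transmitted_data_ratio': np.mean([d['transmitted_data_ratio'] for d in episode_info]),
        'episode_mean_human_coverage': np.mean([d['human_coverage'] for d in episode_info]),
    })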
- """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py index bb6b9ced3..0f3dcf147 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '2' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -13,24 +12,24 @@ max_env_step = int(3e5) reanalyze_ratio = 0. robot_num = 2 -human_num = 59 # purdue -# human_num = 33 # NCSU -# human_num = 92 # KAIST +# different human_num for different datasets +human_num = 59 # purdue dataset +# human_num = 33 # NCSU dataset +# human_num = 92 # KAIST dataset one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] # ============================================================== # end of the most frequently changed config specified by the user # ============================================================== CrowdSim_muzero_config = dict( - exp_name= - f'result/CrowdSim_muzerogcn_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + exp_name=f'result/CrowdSim_muzerogcn_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( obs_mode='1-dim-array', env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + dataset='purdue', + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -42,14 +41,14 @@ model=dict( # robot_observation_shape=(robot_num, 4), # human_observation_shape=(human_num, 4), - observation_shape=(robot_num + human_num)*4, + observation_shape=(robot_num + human_num) * 4, obs_mode='1-dim-array', - robot_state_dim = 4, - human_state_dim = 4, - robot_num = robot_num, - human_num = human_num, - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='rgcn', + robot_state_dim=4, + human_state_dim=4, + robot_num=robot_num, + human_num=human_num, + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='rgcn', lstm_hidden_size=256, latent_state_dim=256, self_supervised_learning_loss=True, # NOTE: default is False. @@ -99,16 +98,5 @@ create_config = CrowdSim_muzero_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. 
- """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/crowdsim_sez_md_config.py b/zoo/CrowdSim/config/crowdsim_sez_md_config.py index ae2e5074c..a6ac190a8 100644 --- a/zoo/CrowdSim/config/crowdsim_sez_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_sez_md_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '0' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -13,9 +12,10 @@ max_env_step = int(3e5) reanalyze_ratio = 0. robot_num = 2 -human_num = 59 # purdue -# human_num = 33 # NCSU -# human_num = 92 # KAIST +# different human_num for different datasets +human_num = 59 # purdue dataset +# human_num = 33 # NCSU dataset +# human_num = 92 # KAIST dataset one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] K = 10 transmit_v = 20 @@ -31,10 +31,10 @@ obs_mode='1-dim-array', transmit_v=transmit_v, env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + dataset='purdue', + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -44,17 +44,17 @@ ), policy=dict( model=dict( - agent_num = robot_num, - observation_shape=(robot_num + human_num)*4, + agent_num=robot_num, + observation_shape=(robot_num + human_num) * 4, obs_mode='1-dim-array', - robot_state_dim = 4, - human_state_dim = 4, - robot_num = robot_num, - human_num = human_num, + robot_state_dim=4, + human_state_dim=4, + robot_num=robot_num, + human_num=human_num, single_agent_action_size=len(one_uav_action_space), - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='mlp_md', - output_separate_logit=False, # not output separate logit for each action. + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. continuous_action_space=False, num_of_sampled_actions=K, lstm_hidden_size=128, @@ -106,16 +106,5 @@ create_config = CrowdSim_sez_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. - """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/entry/eval_crowdsim.py b/zoo/CrowdSim/entry/eval_crowdsim.py index 13297c76f..46364bdc8 100644 --- a/zoo/CrowdSim/entry/eval_crowdsim.py +++ b/zoo/CrowdSim/entry/eval_crowdsim.py @@ -28,8 +28,8 @@ # model_path is the path to the trained MuZero model checkpoint. # If no path is provided, the script will use the default model. 
- model_path = '/home/nighoodRen/LightZero/result/old_env/CrowdSim_muzeromd_ssl_step300000_uav2__human59_seed0_240503_022923/ckpt/ckpt_best.pth.tar' - main_config.exp_name = '/home/nighoodRen/LightZero/result/old_env/CrowdSim_muzeromd_ssl_step300000_uav2__human59_seed0_240503_022923/' + 'eval' # original result folder/eval + model_path = 'xxx/ckpt_best.pth.tar' + main_config.exp_name = 'xxx' + 'eval' # original result folder/eval # seeds is a list of seed values for the random number generator, used to initialize the environment. seeds = [0] # num_episodes_each_seed is the number of episodes to run for each seed. @@ -49,8 +49,8 @@ # A boolean flag indicating whether to save the video of the environment. main_config.env.save_replay = True # The path where the recorded video will be saved. - main_config.env.replay_path = main_config.exp_name + '/video' # current result folder/eval - + main_config.env.replay_path = main_config.exp_name + '/video' # current result folder/eval + # The maximum number of steps for each episode during evaluation. This may need to be adjusted based on the specific characteristics of the environment. main_config.env.eval_max_episode_steps = int(20) @@ -77,7 +77,9 @@ # Printing the evaluation results. The average reward and the total reward for each seed are displayed, followed by the mean reward across all seeds. print("=" * 20) - print(f"We evaluated a total of {len(seeds)} seeds. For each seed, we evaluated {num_episodes_each_seed} episode(s).") + print( + f"We evaluated a total of {len(seeds)} seeds. For each seed, we evaluated {num_episodes_each_seed} episode(s)." + ) print(f"For seeds {seeds}, the mean returns are {returns_mean_seeds}, and the returns are {returns_seeds}.") print("Across all seeds, the mean reward is:", returns_mean_seeds.mean()) - print("=" * 20) \ No newline at end of file + print("=" * 20) diff --git a/zoo/CrowdSim/envs/CrowdSim_env.py b/zoo/CrowdSim/envs/CrowdSim_env.py index 21d31863e..6cc3a7209 100644 --- a/zoo/CrowdSim/envs/CrowdSim_env.py +++ b/zoo/CrowdSim/envs/CrowdSim_env.py @@ -23,22 +23,20 @@ def __init__(self, cfg: dict = {}) -> None: self._robot_num = self._cfg.robot_num self._human_num = self._cfg.human_num self._observation_space = gym.spaces.Box( - low=float("-inf"), - high=float("inf"), - shape=((self._robot_num+self._human_num)*4,), - dtype=np.float32) + low=float("-inf"), high=float("inf"), shape=((self._robot_num + self._human_num) * 4, ), dtype=np.float32 + ) # action space # one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] self.real_action_space = list(product(self._cfg.one_uav_action_space, repeat=self._robot_num)) one_uav_action_n = len(self._cfg.one_uav_action_space) - self._action_space = gym.spaces.Discrete(one_uav_action_n**self._robot_num) + self._action_space = gym.spaces.Discrete(one_uav_action_n ** self._robot_num) self._action_space.seed(0) # default seed self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, ), dtype=np.float32) self._continuous = False def reset(self) -> np.ndarray: if not self._init_flag: - self._env = gym.make('CrowdSim-v0', dataset = self._cfg.dataset, custom_config = self._cfg) + self._env = gym.make('CrowdSim-v0', dataset=self._cfg.dataset, custom_config=self._cfg) self._init_flag = True if hasattr(self, '_seed') and hasattr(self, '_dynamic_seed') and self._dynamic_seed: np_seed = 100 * np.random.randint(1, 1000) @@ -52,8 +50,8 @@ def reset(self) -> np.ndarray: raw_obs = self._env.reset() obs_list = raw_obs.to_array() # human_obs, robot_obs = obs_list - 
obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) - assert len(obs)==(self._robot_num+self._human_num)*4 + obs = np.concatenate(obs_list, axis=0).flatten() # for 1 dim e.g.(244,) + assert len(obs) == (self._robot_num + self._human_num) * 4 action_mask = np.ones(self.action_space.n, 'int8') obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} @@ -76,8 +74,8 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: assert isinstance(real_action, tuple) and len(real_action) == self._robot_num, "illegal action!" raw_obs, rew, done, info = self._env.step(real_action) obs_list = to_ndarray(raw_obs.to_tensor()) - obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) - assert len(obs)==(self._robot_num+self._human_num)*4 + obs = np.concatenate(obs_list, axis=0).flatten() # for 1 dim e.g.(244,) + assert len(obs) == (self._robot_num + self._human_num) * 4 self._eval_episode_return += rew if done: diff --git a/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py b/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py deleted file mode 100644 index 865ab56fb..000000000 --- a/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py +++ /dev/null @@ -1,137 +0,0 @@ -from easydict import EasyDict - -# define base config -base_config = EasyDict({ - "num_timestep": 120, # 120x15=1800s=30min - "step_time": 15, # second per step - "max_uav_energy": 359640, # 359640 J <-- 359.64 kJ (4500mah, 22.2v) 大疆经纬 - "rotation_limit": 360, - "diameter_of_human_blockers": 0.5, # m - "h_rx": 1.3, # m, height of RX - "h_b": 1.7, # m, height of a human blocker - "velocity": 18, - "frequence_band": 28, # GHz - "h_d": 120, # m, height of drone-BS - "alpha_nlos": 113.63, - "beta_nlos": 1.16, - "zeta_nlos": 2.58, # Frequency 28GHz, sub-urban. 
channel model - "alpha_los": 84.64, - "beta_los": 1.55, - "zeta_los": 0.12, - "g_tx": 0, # dB - "g_rx": 5, # dB - "tallest_locs": None, # obstacle - "no_fly_zone": None, # obstacle - "start_timestamp": 1519894800, - "end_timestamp": 1519896600, - "energy_factor": 3, # TODO: energy factor in reward function - "robot_num": 2, # TODO: 多了要用多进程 - "rollout_num": 1, # 1 2 6 12 15, calculated based on robot_num -}) - -# define all dataset configs -dataset_configs = { - 'purdue': EasyDict({ - "lower_left": [-86.93, 40.4203], # 经纬度 - "upper_right": [-86.9103, 40.4313], - "nlon": 200, - "nlat": 120, - "human_num": 59, - "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/purdue/59 users.csv', - "sensing_range": 23.2, # unit 23.2 - "one_uav_action_space": [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30], [21, 21], [21, -21], [-21, 21], [-21, -21]], - "max_x_distance": 1667, # m - "max_y_distance": 1222, # m - "density_of_human_blockers": 30000 / 1667 / 1222, # block/m2 - }), - 'ncsu': EasyDict({ - "lower_left": [-78.6988, 35.7651], # 经纬度 - "upper_right": [-78.6628, 35.7896], - "nlon": 3600, - "nlat": 2450, - "human_num": 33, - "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/NCSU/33 users.csv', - "sensing_range": 220, # unit 220 - "one_uav_action_space": [[0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210]], - "max_x_distance": 3255.4913305859623, - "max_y_distance": 2718.3945272795013, - "density_of_human_blockers": 30000 / 3255.4913305859623 / 2718.3945272795013, # block/m2 - }), - 'kaist': EasyDict({ - "lower_left": [127.3475, 36.3597], - "upper_right": [127.3709, 36.3793], - "nlon": 2340, - "nlat": 1960, - "human_num": 92, - "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/KAIST/92 users.csv', - "sensing_range": 220, # unit 220 - "one_uav_action_space": [[0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210]], - "max_x_distance": 2100.207579392558, - "max_y_distance": 2174.930950809533, - "density_of_human_blockers": 30000 / 2100.207579392558 / 2174.930950809533, # block/m2 - }), - # ... 
could add more dataset configs here -} - -# get config according to data set name -def get_selected_config(data_set_name): - if data_set_name in dataset_configs: - dataset_config = dataset_configs[data_set_name] - return EasyDict({**base_config, **dataset_config}) - else: - raise ValueError(f"Data set '{data_set_name}' not found.") - -# r:meters, 2d distance -# threshold: dB -def try_sensing_range(r, data_set_name): - import math - config = get_selected_config(data_set_name) - p_los = math.exp( - -config.density_of_human_blockers * config.diameter_of_human_blockers * r * (config.h_b - config.h_rx) / ( - config.h_d - config.h_rx)) - p_nlos = 1 - p_los - PL_los = config.alpha_los + config.beta_los * 10 * math.log10( - math.sqrt(r * r + config.h_d * config.h_d)) + config.zeta_los - PL_nlos = config.alpha_nlos + config.beta_nlos * 10 * math.log10( - math.sqrt(r * r + config.h_d * config.h_d)) + config.zeta_nlos - PL = p_los * PL_los + p_nlos * PL_nlos - CL = PL - config.g_tx - config.g_rx - print(p_los, p_nlos) - print(CL) - - -# Maximum Coupling Loss (110dB is recommended) -# purdue: - -# 123dB -> 560m -> 60.5 range -# 121dB -> 420m -> 45.4 range -# 119dB -> 300m -> 32.4 range -# 117dB -> 215m -> 23.2 range √ -# 115dB -> 140m -> 15 range - -# ncsu: -# 123dB -> 600m -> 600 range -# 121dB -> 435m -> 435 range -# 119dB -> 315m -> 315 range -# 117dB -> 220m -> 220 range √ -# 115dB -> 145m -> 145 range - -# kaist: -# 123dB -> 600m -> 600 range -# 121dB -> 435m -> 435 range -# 119dB -> 315m -> 315 range -# 117dB -> 220m -> 220 range √ -# 115dB -> 145m -> 145 range - -# san: -# 123dB -> 600m -> 600 range -# 121dB -> 450m -> 450 range -# 119dB -> 330m -> 330 range -# 117dB -> 240m -> 240 range √ -# 115dB -> 165m -> 165 range - -if __name__ == '__main__': - # example usage - data_set_name = 'purdue' - selected_config = get_selected_config(data_set_name) - print(selected_config) \ No newline at end of file diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index 7ddf4550a..e31689866 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -10,15 +10,54 @@ from zoo.CrowdSim.envs.Crowdsim.env.model.utils import * from zoo.CrowdSim.envs.Crowdsim.env.model.mdp import HumanState, RobotState, JointState -from zoo.CrowdSim.envs.Crowdsim.env.base_env_config import get_selected_config - +from LightZero.zoo.CrowdSim.envs.Crowdsim.env.crowd_sim_base_config import get_selected_config class CrowdSim(gym.Env): + """ + Overview: + LightZero version of the CrowdSim environment. This class includes methods for resetting, closing, and \ + stepping through the environment, as well as seeding for reproducibility, saving replay videos, and generating \ + random actions. It also includes properties for accessing the observation space, action space, and reward space of the \ + environment. The environment is a grid world with humans and robots moving around. The robots are tasked with \ + minimizing the average age of information (AoI) of the humans by moving to their locations and collecting data from them. \ + The humans generate data at a constant rate, and the robots have a limited energy supply that is consumed by moving. \ + The environment is divided into two modes: 'easy' and 'hard'. In the 'easy' mode, the robots can only collect data from \ + humans when they are within a certain range, and the AoI of a human is reset to 0 when a robot collects data from them. 
\ + In the 'hard' mode, the robots can collect data from humans even when they are not within range, and the AoI of a human \ + is not reset when a robot collects data from them. The environment is initialized with a dataset of human locations and \ + timestamps, and the robots are tasked with collecting data from the humans to minimize the average AoI. The environment \ + is considered solved when the average AoI is minimized to a certain threshold or the time limit is reached. + Interface: + `__init__`, `reset`, `step`, `render`, `sync_human_df`, `generate_human`, `generate_robot`. + """ metadata = {'render.modes': ['human']} def __init__(self, dataset, custom_config=None): - # mcfg should include: + """ + Overview: + Initialize the environment with a dataset and a custom configuration. The dataset contains the locations and \ + timestamps of the humans, and the custom configuration contains the environment mode, number of humans, number \ + of robots, maximum timestep, step time, start timestamp, and maximum UAV energy. The environment is divided into \ + two modes: 'easy' and 'hard'. In the 'easy' mode, the robots can only collect data from humans when they are within \ + a certain range, and the AoI of a human is reset to 0 when a robot collects data from them. In the 'hard' mode, the \ + robots can collect data from humans even when they are not within range, and the AoI of a human is not reset when a \ + robot collects data from them. The environment is initialized with a dataset of human locations and timestamps, and \ + the robots are tasked with collecting data from the humans to minimize the average AoI. The environment is considered \ + solved when the average AoI is minimized to a certain threshold or the time limit is reached. + Args: + - dataset (:obj:`str`): The path to the dataset file. + - custom_config (:obj:`dict`): A dictionary containing the custom configuration for the environment. \ + The custom configuration should include the following keys: + - env_mode (:obj:`str`): The environment mode ('easy' or 'hard'). + - human_num (:obj:`int`): The number of humans in the environment. + - robot_num (:obj:`int`): The number of robots in the environment. + - num_timestep (:obj:`int`): The maximum timestep for the environment. + - step_time (:obj:`float`): The time per step in seconds. + - start_timestamp (:obj:`int`): The start timestamp for the environment. + - max_uav_energy (:obj:`float`): The maximum energy for the UAVs. 
+ """ + # mcfg should include: self.time_limit = None self.robots = None self.humans = None @@ -29,10 +68,10 @@ def __init__(self, dataset, custom_config=None): self.config = get_selected_config(dataset) self.config.update(custom_config) - self.env_mode = self.config.env_mode # 'easy' or 'hard' + self.env_mode = self.config.env_mode # 'easy' or 'hard' self.human_num = self.config.human_num self.robot_num = self.config.robot_num - self.num_timestep = self.config.num_timestep # max timestep + self.num_timestep = self.config.num_timestep # max timestep self.step_time = self.config.step_time # second per step self.start_timestamp = self.config.start_timestamp # fit timpestamp to datetime self.max_uav_energy = self.config.max_uav_energy @@ -40,8 +79,9 @@ def __init__(self, dataset, custom_config=None): self.action_space = gym.spaces.Discrete(len(self.config.one_uav_action_space)) # human obs: [px, py, remaining_data_amount, aoi] # robot obs: [px, py, theta, energy] - # self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(4), dtype=np.float32) - self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(self.robot_num+self.human_num, 4), dtype=np.float32) + self.observation_space = gym.spaces.Box( + low=float("-inf"), high=float("inf"), shape=(self.robot_num + self.human_num, 4), dtype=np.float32 + ) # load_dataset self.transmit_v = self.config.transmit_v # 5*0.3Mb/s @@ -51,48 +91,63 @@ def __init__(self, dataset, custom_config=None): self.upper_right = self.config.upper_right self.human_df = pd.read_csv(self.config.dataset_dir) logging.info("Finished reading {} rows".format(len(self.human_df))) - # # for temporarily processing data - # sample_list=np.random.choice(self.human_num, size=[50,], replace=False) - # sample_list=sample_list[np.argsort(sample_list)] - # print(sample_list) - # self.human_df= self.human_df[self.human_df["id"].isin(sample_list)] - # for i,human_id in enumerate(sample_list): - # mask=(self.human_df["id"]==human_id) - # self.human_df.loc[mask,"id"]=i - # self.human_df=self.human_df.sort_values(by=["id","timestamp"],ascending=[True,True]) - # print(self.human_df.head()) - # self.human_df.to_csv("50 users-5.csv",index=False) - # exit(0) self.human_df['t'] = pd.to_datetime(self.human_df['timestamp'], unit='s') # 's' stands for second self.human_df['aoi'] = -1 # 加入aoi记录aoi - self.human_df['data_amount'] = -1 # record the remaining data amount of each human + self.human_df['data_amount'] = -1 # record the remaining data amount of each human self.human_df['energy'] = -1 # 加入energy记录energy logging.info('Env mode: {}'.format(self.env_mode)) logging.info('human number: {}'.format(self.human_num)) logging.info('Robot number: {}'.format(self.robot_num)) # for debug - self.current_human_aoi_list = np.zeros([self.human_num, ]) - self.mean_aoi_timelist = np.zeros([self.config.num_timestep + 1, ]) - self.cur_data_amount_timelist = np.zeros([self.human_num, ]) + self.current_human_aoi_list = np.zeros([ + self.human_num, + ]) + self.mean_aoi_timelist = np.zeros([ + self.config.num_timestep + 1, + ]) + self.cur_data_amount_timelist = np.zeros([ + self.human_num, + ]) self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_x_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) - self.update_human_timelist = np.zeros([self.config.num_timestep, ]) + self.update_human_timelist = np.zeros([ + 
self.config.num_timestep, + ]) self.data_transmission = 0 - def set_agent(self, agent): - self.agent = agent - def generate_human(self, human_id, selected_data, selected_next_data): + """ + Overview: + Generate a human with the given id, selected data, and selected next data. The human is initialized with \ + the given data and next data, and the remaining data amount is set to 0. The human is also initialized with \ + an AoI of 0. + Argments: + - human_id (:obj:`int`): The id of the human. + - selected_data (:obj:`pd.DataFrame`): The selected data for the current timestep. + - selected_next_data (:obj:`pd.DataFrame`): The selected data for the next timestep. + Returns: + - human (:obj:`Human`): The generated human. + """ human = Human(human_id, self.config) - px, py, theta = get_human_position_from_list(self.current_timestep, human_id, selected_data, selected_next_data, self.config) + px, py, theta = get_human_position_from_list( + self.current_timestep, human_id, selected_data, selected_next_data, self.config + ) # human obs: [px, py, data_amount, aoi] human.set(px, py, theta, 0, 0) # initial aoi of human is 0 return human def generate_robot(self, robot_id): + """ + Overview: + Generate a robot with the given id. The robot is initialized with the given id and the maximum UAV energy. + Argments: + - robot_id (:obj:`int`): The id of the robot. + Returns: + - robot (:obj:`Robot`): The generated robot. + """ robot = Robot(robot_id, self.config) # robot obs: [px, py, theta, energy] robot.set(self.nlon / 2, self.nlat / 2, 0, self.max_uav_energy) # robot有energy @@ -108,13 +163,26 @@ def sync_human_df(self, human_id, current_timestep, aoi, data_amount): - aoi (:obj:`int`): The aoi of the human. """ current_timestamp = self.start_timestamp + current_timestep * self.step_time - current_index = self.human_df[ - (self.human_df.id == human_id) & (self.human_df.timestamp == current_timestamp)].index + current_index = self.human_df[(self.human_df.id == human_id) + & (self.human_df.timestamp == current_timestamp)].index # self.human_df.loc[current_index, "aoi"] = aoi # slower - self.human_df.iat[current_index.values[0], 9] = aoi # faster + self.human_df.iat[current_index.values[0], 9] = aoi # faster self.human_df.iat[current_index.values[0], 10] = data_amount def reset(self, phase='test', test_case=None): + """ + Overview: + Reset the environment to the initial state. The environment is reset to the start timestamp, and the humans \ + and robots are generated with the given data. The humans are initialized with the selected data and next data, \ + and the robots are initialized with the given id. The environment is also initialized with the current timestep, \ + mean AoI, robot energy, robot x, robot y, and update human timelist. The environment is considered solved when \ + the average AoI is minimized to a certain threshold or the time limit is reached. + Argments: + - phase (:obj:`str`): The phase of the environment ('train' or 'test'). + - test_case (:obj:`int`): The test case for the environment. + Returns: + - state (:obj:`JointState`): The initial state of the environment. 
+ """ self.current_timestep = 0 # generate human @@ -132,9 +200,15 @@ def reset(self, phase='test', test_case=None): for robot_id in range(self.robot_num): self.robots.append(self.generate_robot(robot_id)) - self.cur_data_amount_timelist = np.zeros([self.human_num, ]) - self.current_human_aoi_list = np.zeros([self.human_num, ]) - self.mean_aoi_timelist = np.zeros([self.config.num_timestep + 1, ]) + self.cur_data_amount_timelist = np.zeros([ + self.human_num, + ]) + self.current_human_aoi_list = np.zeros([ + self.human_num, + ]) + self.mean_aoi_timelist = np.zeros([ + self.config.num_timestep + 1, + ]) self.mean_aoi_timelist[self.current_timestep] = np.mean(self.current_human_aoi_list) self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_energy_timelist[self.current_timestep, :] = self.max_uav_energy @@ -142,7 +216,9 @@ def reset(self, phase='test', test_case=None): self.robot_x_timelist[self.current_timestep, :] = self.nlon / 2 self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_y_timelist[self.current_timestep, :] = self.nlat / 2 - self.update_human_timelist = np.zeros([self.config.num_timestep, ]) + self.update_human_timelist = np.zeros([ + self.config.num_timestep, + ]) self.data_transmission = 0 # for visualization @@ -150,17 +226,35 @@ def reset(self, phase='test', test_case=None): self.robot_actions = [] self.rewards = [] self.action_values = [] - self.plot_states.append([[robot.get_obs() for robot in self.robots], - [human.get_obs() for human in self.humans]]) - + self.plot_states.append( + [[robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]] + ) + state = JointState([robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]) return state def step(self, action): + """ + Overview: + Perform a step in the environment using the provided action, and return the next state of the environment. \ + The next state is encapsulated in a BaseEnvTimestep object, which includes the new observation, reward, done flag, \ + and info dictionary. The cumulative reward (`_eval_episode_return`) is updated with the reward obtained in this step. \ + If the episode ends (done is True), the total reward for the episode is stored in the info dictionary. + Argments: + - action (:obj:`Union[int, np.ndarray]`): The action to be performed in the environment. If the action is a 1-dimensional \ + numpy array, it is squeezed to a 0-dimension array. + Returns: + - next_state (:obj:`JointState`): The next state of the environment. + - reward (:obj:`float`): The reward obtained in this step. + - done (:obj:`bool`): A flag indicating whether the episode has ended. + - info (:obj:`dict`): A dictionary containing additional information about the environment. 
+ """ new_robot_position = np.zeros([self.robot_num, 2]) - current_enenrgy_consume = np.zeros([self.robot_num, ]) + current_enenrgy_consume = np.zeros([ + self.robot_num, + ]) - num_updated_human = 0 # number of humans whose AoI is updated + num_updated_human = 0 # number of humans whose AoI is updated for robot_id, robot in enumerate(self.robots): new_robot_px = robot.px + action[robot_id][0] @@ -191,11 +285,14 @@ def step(self, action): self.robot_y_timelist[self.current_timestep + 1][robot_id] = new_robot_py robot.set(new_robot_px, new_robot_py, robot_theta, energy=new_energy) - selected_data, selected_next_data = get_human_position_list(self.current_timestep + 1, self.human_df, self.config) - human_transmit_data_list = np.zeros_like(self.cur_data_amount_timelist) # 0 means no update + selected_data, selected_next_data = get_human_position_list( + self.current_timestep + 1, self.human_df, self.config + ) + human_transmit_data_list = np.zeros_like(self.cur_data_amount_timelist) # 0 means no update for human_id, human in enumerate(self.humans): - next_px, next_py, next_theta = get_human_position_from_list(self.current_timestep + 1, human_id, - selected_data, selected_next_data, self.config) + next_px, next_py, next_theta = get_human_position_from_list( + self.current_timestep + 1, human_id, selected_data, selected_next_data, self.config + ) should_reset = judge_aoi_update([next_px, next_py], new_robot_position, self.config) if self.env_mode == 'easy': if should_reset: @@ -212,7 +309,7 @@ def step(self, action): human_transmit_data_list[human_id] = 0 new_aoi = human.aoi + 1 human.set(next_px, next_py, next_theta, aoi=new_aoi, data_amount=human.aoi) - + elif self.env_mode == 'hard': if should_reset: # if the human is in the range of the robot, then part of human's data will be transmitted @@ -238,7 +335,7 @@ def step(self, action): self.data_transmission += (delta_sum_transmit_data * 0.3) # Mb, 0.02M/s per person if self.env_mode == 'easy': # in easy mode, the data amount generated per step is equal to the number of humans - self.total_generated_data_amount = self.num_timestep*self.human_num + self.total_generated_data_amount = self.num_timestep * self.human_num elif self.env_mode == 'hard': # in hard mode, the data amount generated per step is equal to the sum of the data amount of all humans self.total_generated_data_amount += self.generate_data_amount_per_step @@ -251,11 +348,11 @@ def step(self, action): # self.action_values.append(self.agent.policy.action_values) self.robot_actions.append(action) self.rewards.append(reward) - self.plot_states.append([[robot.get_obs() for robot in self.robots], - [human.get_obs() for human in self.humans]]) + self.plot_states.append( + [[robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]] + ) - next_state = JointState([robot.get_obs() for robot in self.robots], - [human.get_obs() for human in self.humans]) + next_state = JointState([robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]) self.current_timestep += 1 # print('This game is on',self.current_timestep,' step\n') @@ -265,18 +362,26 @@ def step(self, action): done = False info = { "performance_info": { - "mean_aoi": self.mean_aoi_timelist[self.current_timestep], - "mean_transmit_data": delta_sum_transmit_data / self.human_num, - "mean_energy_consumption": 1.0 - ( - np.mean(self.robot_energy_timelist[self.current_timestep]) / self.max_uav_energy), - "transmitted_data_ratio": 
self.data_transmission/(self.total_generated_data_amount*0.3), - "human_coverage": np.mean(self.update_human_timelist) / self.human_num + "mean_aoi": self.mean_aoi_timelist[self.current_timestep], + "mean_transmit_data": delta_sum_transmit_data / self.human_num, + "mean_energy_consumption": 1.0 - + (np.mean(self.robot_energy_timelist[self.current_timestep]) / self.max_uav_energy), + "transmitted_data_ratio": self.data_transmission / (self.total_generated_data_amount * 0.3), + "human_coverage": np.mean(self.update_human_timelist) / self.human_num }, } return next_state, reward, done, info def render(self): + """ + Overview: + Render the environment to an image. The image is generated using the matplotlib library, and it includes the \ + historical trajectories of the robots, the current positions of the robots, the sensing range of the robots, the \ + positions of the humans, and their AoI values. The image is returned as a numpy array. + Returns: + - image (:obj:`np.ndarray`): The rendered image of the environment. + """ import matplotlib.pyplot as plt import matplotlib.patches as patches import io @@ -284,44 +389,49 @@ def render(self): map_max_x = self.config.nlon map_max_y = self.config.nlat - # 创建一个新的图形 + # Create a new figure fig, ax = plt.subplots(figsize=(8, 6)) - # 绘制机器人的历史轨迹 + # Plot the historical trajectories of the robots for timestep in range(len(self.robot_x_timelist)): for robot_id in range(len(self.robot_x_timelist[timestep])): - ax.plot(self.robot_x_timelist[timestep][robot_id], self.robot_y_timelist[timestep][robot_id], color='gray', alpha=0.5) - - # 绘制机器人的位置 + ax.plot( + self.robot_x_timelist[timestep][robot_id], + self.robot_y_timelist[timestep][robot_id], + color='gray', + alpha=0.5 + ) + + # Plot the current positions of the robots for robot in self.robots: ax.plot(robot.px, robot.py, marker='o', markersize=5, color='blue') - - # 绘制机器人的感知范围 + + # Plot the sensing range of the robots for robot in self.robots: robot_x, robot_y = robot.px, robot.py circle = patches.Circle((robot_x, robot_y), self.config.sensing_range, edgecolor='blue', facecolor='none') ax.add_patch(circle) - - # 绘制人类的位置和AOI变化 + + # Plot the positions of the humans and their AOI values for human in self.humans: human_x, human_y, aoi = human.px, human.py, human.aoi ax.plot(human_x, human_y, marker='x', markersize=5, color='red') ax.text(human_x, human_y, str(aoi), fontsize=8, color='black') - # 设置图形标题和坐标轴标签 + # Set the title and axis labels # ax.set_xlim(0, map_max_x) # ax.set_ylim(0, map_max_y) ax.set_xlabel('X') ax.set_ylabel('Y') - # 在地图之外留出一些空白区域 + # Leave some margin around the map ax.margins(x=0.1, y=0.1) ax.set_title('Crowd Simulation Visualization') - # 显示图形 + # Render the figure to an image fig.canvas.draw() image = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) - image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + image = image.reshape(fig.canvas.get_width_height()[::-1] + (3, )) plt.close() - return image \ No newline at end of file + return image diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim_base_config.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim_base_config.py new file mode 100644 index 000000000..008fd7e16 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim_base_config.py @@ -0,0 +1,156 @@ +from easydict import EasyDict + +# define base config +base_config = EasyDict( + { + "num_timestep": 120, # 120x15=1800s=30min + "step_time": 15, # seconds per step + "max_uav_energy": 359640, # 359640 J <-- 359.64 kJ (4500mAh, 22.2V) DJI Matrice + 
"rotation_limit": 360, + "diameter_of_human_blockers": 0.5, # meters + "h_rx": 1.3, # meters, height of RX + "h_b": 1.7, # meters, height of a human blocker + "velocity": 18, + "frequence_band": 28, # GHz + "h_d": 120, # meters, height of drone-BS + "alpha_nlos": 113.63, + "beta_nlos": 1.16, + "zeta_nlos": 2.58, # Frequency 28GHz, sub-urban. channel model + "alpha_los": 84.64, + "beta_los": 1.55, + "zeta_los": 0.12, + "g_tx": 0, # dB + "g_rx": 5, # dB + "tallest_locs": None, # obstacle + "no_fly_zone": None, # obstacle + "start_timestamp": 1519894800, + "end_timestamp": 1519896600, + "energy_factor": 3, # TODO: energy factor in reward function + "robot_num": 2, + "rollout_num": 1, # 1 2 6 12 15, calculated based on robot_num + } +) + +# define all dataset configs +dataset_configs = { + 'purdue': EasyDict( + { + "lower_left": [-86.93, 40.4203], # longitude and latitude + "upper_right": [-86.9103, 40.4313], + "nlon": 200, + "nlat": 120, + "human_num": 59, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/purdue/59 users.csv', + "sensing_range": 23.2, # unit 23.2 + "one_uav_action_space": [ + [0, 0], [30, 0], [-30, 0], [0, 30], [0, -30], [21, 21], [21, -21], [-21, 21], [-21, -21] + ], + "max_x_distance": 1667, # meters + "max_y_distance": 1222, # meters + "density_of_human_blockers": 30000 / 1667 / 1222, # blockers/m2 + } + ), + 'ncsu': EasyDict( + { + "lower_left": [-78.6988, 35.7651], # longitude and latitude + "upper_right": [-78.6628, 35.7896], + "nlon": 3600, + "nlat": 2450, + "human_num": 33, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/NCSU/33 users.csv', + "sensing_range": 220, # unit 220 + "one_uav_action_space": [ + [0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210] + ], + "max_x_distance": 3255.4913305859623, # meters + "max_y_distance": 2718.3945272795013, # meters + "density_of_human_blockers": 30000 / 3255.4913305859623 / 2718.3945272795013, # blockers/m2 + } + ), + 'kaist': EasyDict( + { + "lower_left": [127.3475, 36.3597], # longitude and latitude + "upper_right": [127.3709, 36.3793], + "nlon": 2340, + "nlat": 1960, + "human_num": 92, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/KAIST/92 users.csv', + "sensing_range": 220, # unit 220 + "one_uav_action_space": [ + [0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210] + ], + "max_x_distance": 2100.207579392558, # meters + "max_y_distance": 2174.930950809533, # meters + "density_of_human_blockers": 30000 / 2100.207579392558 / 2174.930950809533, # blockers/m2 + } + ), + # ... 
could add more dataset configs here +} + + +# get config according to data set name +def get_selected_config(data_set_name): + if data_set_name in dataset_configs: + dataset_config = dataset_configs[data_set_name] + return EasyDict({**base_config, **dataset_config}) + else: + raise ValueError(f"Data set '{data_set_name}' not found.") + + +# r:meters, 2D distance +# threshold: dB +def try_sensing_range(r, data_set_name): + import math + config = get_selected_config(data_set_name) + p_los = math.exp( + -config.density_of_human_blockers * config.diameter_of_human_blockers * r * (config.h_b - config.h_rx) / + (config.h_d - config.h_rx) + ) + p_nlos = 1 - p_los + PL_los = config.alpha_los + config.beta_los * 10 * math.log10( + math.sqrt(r * r + config.h_d * config.h_d) + ) + config.zeta_los + PL_nlos = config.alpha_nlos + config.beta_nlos * 10 * math.log10( + math.sqrt(r * r + config.h_d * config.h_d) + ) + config.zeta_nlos + PL = p_los * PL_los + p_nlos * PL_nlos + CL = PL - config.g_tx - config.g_rx + print(p_los, p_nlos) + print(CL) + + +# Maximum Coupling Loss (110dB is recommended) +# purdue: + +# 123dB -> 560m -> 60.5 range +# 121dB -> 420m -> 45.4 range +# 119dB -> 300m -> 32.4 range +# 117dB -> 215m -> 23.2 range √ +# 115dB -> 140m -> 15 range + +# ncsu: +# 123dB -> 600m -> 600 range +# 121dB -> 435m -> 435 range +# 119dB -> 315m -> 315 range +# 117dB -> 220m -> 220 range √ +# 115dB -> 145m -> 145 range + +# kaist: +# 123dB -> 600m -> 600 range +# 121dB -> 435m -> 435 range +# 119dB -> 315m -> 315 range +# 117dB -> 220m -> 220 range √ +# 115dB -> 145m -> 145 range + +# san: +# 123dB -> 600m -> 600 range +# 121dB -> 450m -> 450 range +# 119dB -> 330m -> 330 range +# 117dB -> 240m -> 240 range √ +# 115dB -> 165m -> 165 range + +if __name__ == '__main__': + # example usage + data_set_name = 'purdue' + selected_config = get_selected_config(data_set_name) + print(selected_config) diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py index db6c47b31..77e378636 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py @@ -5,6 +5,7 @@ class Agent(): + def __init__(self): """ Base class for robot and human. Have the physical attributes of an agent. @@ -26,6 +27,15 @@ def act(self, state, current_timestep): class Human(): + """ + Overview: + Human class. Have the physical attributes of a human agent. The human agent has a data queue to store the \ + information blocks. The data queue is updated when the human agent moves and transmits data to the robot. \ + The age of information (aoi) is calculated based on the data queue. + Interface: + `__init__`, `set`, `update`, `get_obs`. + """ + # collect_v_prob = {1: 1, 2: 0} def __init__(self, id, config): self.id = id @@ -40,6 +50,16 @@ def __init__(self, id, config): self.collect_v = random.choices(list(self.collect_v_prob.keys()), list(self.collect_v_prob.values()))[0] def set(self, px, py, theta, aoi, data_amount): + """ + Overview: + Set the physical attributes of the human agent. + Arguments: + - px (:obj:`float`): The x-coordinate of the human agent. + - py (:obj:`float`): The y-coordinate of the human agent. + - theta (:obj:`float`): The orientation of the human agent. + - aoi (:obj:`float`): The age of information (aoi) of the human agent. + - data_amount (:obj:`int`): The amount of data blocks in the data queue of the human agent. 
+ """ self.px = px self.py = py self.theta = theta @@ -47,6 +67,16 @@ def set(self, px, py, theta, aoi, data_amount): self.data_amount = data_amount def update(self, px, py, theta, transmitted_data): + """ + Overview: + Update the physical attributes of the human agent and the data queue. The age of information (aoi) is \ + calculated based on the data queue. + Arguments: + - px (:obj:`float`): The x-coordinate of the human agent. + - py (:obj:`float`): The y-coordinate of the human agent. + - theta (:obj:`float`): The orientation of the human agent. + - transmitted_data (:obj:`int`): The number of data blocks transmitted to the robot. + """ self.px = px # position self.py = py self.theta = theta @@ -54,16 +84,29 @@ def update(self, px, py, theta, transmitted_data): self.aoi = self.data_queue.total_aoi() self.data_amount = self.data_queue.total_blocks() - # TODO: change state,可能需要归一化 def get_obs(self): + """ + Overview: + Get the observation of the human agent. The observation includes the position, age of information (aoi), \ + and the amount of data blocks in the data queue. + Returns: + - obs (:obj:`HumanState`): The observation of the human agent. + """ # obs: (px, py, remaining_data, aoi) - return HumanState(self.px / self.config.nlon, - self.py / self.config.nlat, - self.data_amount / self.config.num_timestep, - self.aoi / self.config.num_timestep) + return HumanState( + self.px / self.config.nlon, self.py / self.config.nlat, self.data_amount / self.config.num_timestep, + self.aoi / self.config.num_timestep + ) class Robot(): + """ + Overview: + Robot class. Have the physical attributes of a robot agent. + Interface: + `__init__`, `set`, `get_obs`. + """ + def __init__(self, id, config): self.id = id self.config = config @@ -73,42 +116,75 @@ def __init__(self, id, config): self.energy = None def set(self, px, py, theta, energy): + """ + Overview: + Set the physical attributes of the robot agent. + Arguments: + - px (:obj:`float`): The x-coordinate of the robot agent. + - py (:obj:`float`): The y-coordinate of the robot agent. + - theta (:obj:`float`): The orientation of the robot agent. + - energy (:obj:`float`): The remaining energy of the robot agent. + """ self.px = px # position self.py = py self.theta = theta self.energy = energy - # TODO: change state,可能需要归一化 def get_obs(self): - return RobotState(self.px / self.config.nlon, - self.py / self.config.nlat, - self.theta / self.config.rotation_limit, - self.energy / self.config.max_uav_energy) + """ + Overview: + Get the observation of the robot agent. The observation includes the position, orientation, and the remaining \ + energy of the robot agent. + Returns: + - obs (:obj:`RobotState`): The observation of the robot agent. + """ + return RobotState( + self.px / self.config.nlon, self.py / self.config.nlat, self.theta / self.config.rotation_limit, + self.energy / self.config.max_uav_energy + ) class InformationQueue: + """ + Overview: + Information queue class. The data queue is updated when the human agent moves and transmits data to the robot. \ + The age of information (aoi) is calculated based on the data queue. + + Interface: + `__init__`, `update`, `total_aoi`, `total_blocks`. + """ + def __init__(self): # Initialize the queue to hold the age of each information block self.queue = [] def update(self, arrivals, departures): + """ + Overview: + Update the data queue. Increase the age of information (aoi) for each block in the queue. Add new information \ + blocks with aoi of 0. 
Remove the specified number of oldest information blocks. + Arguments: + - arrivals (:obj:`int`): The number of new information blocks entering the queue. + - departures (:obj:`int`): The number of oldest information blocks leaving the queue. + """ # Increase the age of information (aoi) for each block in the queue self.queue = [age + 1 for age in self.queue] - + # Add new information blocks with aoi of 0 self.queue.extend([0] * arrivals) - + # Remove the specified number of oldest information blocks self.queue = self.queue[departures:] if departures <= len(self.queue) else [] def total_aoi(self): # Return the total age of information in the queue return sum(self.queue) - + def total_blocks(self): # Return the total number of information blocks in the queue return len(self.queue) + # # Example of using the InformationQueue class # info_queue = InformationQueue() # info_queue.update(arrivals=5, departures=0) # 5 blocks enter the queue, all with aoi of 0 diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py b/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py index 72f1b9c4a..9d4d2b770 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py @@ -6,6 +6,7 @@ # State class HumanState(object): + def __init__(self, px, py, theta, aoi): self.px = px self.py = py @@ -24,6 +25,7 @@ def to_tuple(self): class RobotState(object): + def __init__(self, px, py, theta, energy): self.px = px self.py = py @@ -43,6 +45,7 @@ def to_tuple(self): class JointState(object): + def __init__(self, robot_states, human_states): for robot_state in robot_states: assert isinstance(robot_state, RobotState) @@ -53,10 +56,12 @@ def __init__(self, robot_states, human_states): self.human_states = human_states def to_tensor(self, add_batch_size=False, device=None): - robot_states_tensor = torch.tensor([robot_state.to_tuple() for robot_state in self.robot_states], - dtype=torch.float32) - human_states_tensor = torch.tensor([human_state.to_tuple() for human_state in self.human_states], - dtype=torch.float32) + robot_states_tensor = torch.tensor( + [robot_state.to_tuple() for robot_state in self.robot_states], dtype=torch.float32 + ) + human_states_tensor = torch.tensor( + [human_state.to_tuple() for human_state in self.human_states], dtype=torch.float32 + ) if add_batch_size: # True robot_states_tensor = robot_states_tensor.unsqueeze(0) @@ -67,7 +72,7 @@ def to_tensor(self, add_batch_size=False, device=None): human_states_tensor = human_states_tensor.to(device) return robot_states_tensor, human_states_tensor - + def to_array(self): robot_states_array = np.array([robot_state.to_tuple() for robot_state in self.robot_states]) human_states_array = np.array([human_state.to_tuple() for human_state in self.human_states]) diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py index e81ec842f..421b9508a 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py @@ -7,34 +7,72 @@ from shapely.geometry import * -def tensor_to_joint_state(state, config): # 恢复原先尺度 +def tensor_to_joint_state(state, config): + """ + Overview: + Convert the state tensor to the JointState object. The state tensor is a tuple of two tensors, the first one \ + is the robot state tensor, and the second one is the human state tensor. The robot state tensor is a tensor of \ + shape (1, robot_num, 4), and the human state tensor is a tensor of shape (1, human_num, 4). 
+ Arguments: + - state (:obj:`tuple`): The state tensor. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - joint_state (:obj:`JointState`): The JointState object. + """ robot_states, human_states = state robot_states = robot_states.cpu().squeeze(0).data.numpy() - robot_states = [RobotState(robot_state[0] * config.nlon, - robot_state[1] * config.nlat, - robot_state[2] * config.rotation_limit, - robot_state[3] * config.max_uav_energy) for robot_state in robot_states] + robot_states = [ + RobotState( + robot_state[0] * config.nlon, robot_state[1] * config.nlat, robot_state[2] * config.rotation_limit, + robot_state[3] * config.max_uav_energy + ) for robot_state in robot_states + ] human_states = human_states.cpu().squeeze(0).data.numpy() - human_states = [HumanState(human_state[0] * config.nlon, - human_state[1] * config.nlat, - human_state[2] * config.rotation_limit, - human_state[3] * config.num_timestep) for human_state in human_states] + human_states = [ + HumanState( + human_state[0] * config.nlon, human_state[1] * config.nlat, human_state[2] * config.rotation_limit, + human_state[3] * config.num_timestep + ) for human_state in human_states + ] return JointState(robot_states, human_states) def tensor_to_robot_states(robot_state_tensor, config): + """ + Overview: + Convert the robot state tensor to a list of RobotState objects. The robot state tensor is a tensor of shape \ + (1, robot_num, 4). + Arguments: + - robot_state_tensor (:obj:`torch.Tensor`): The robot state tensor. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - robot_states (:obj:`list`): The list of RobotState objects. + """ robot_states = robot_state_tensor.cpu().squeeze(0).data.numpy() - robot_states = [RobotState(robot_state[0] * config.nlon, - robot_state[1] * config.nlat, - robot_state[2] * config.rotation_limit, - robot_state[3] * config.max_uav_energy) for robot_state in robot_states] + robot_states = [ + RobotState( + robot_state[0] * config.nlon, robot_state[1] * config.nlat, robot_state[2] * config.rotation_limit, + robot_state[3] * config.max_uav_energy + ) for robot_state in robot_states + ] return robot_states def get_human_position_list(selected_timestep, human_df, config): + """ + Overview: + Get the human position list at the selected timestep. The human position list is a list of tuples, each tuple \ + contains the x, y, and theta of a human. + Arguments: + - selected_timestep (:obj:`int`): The selected timestep. + - human_df (:obj:`pandas.DataFrame`): The human dataframe. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - human_position_list (:obj:`list`): The human position list. + """ # config.step_time means the time interval between two timesteps selected_timestamp = config.start_timestamp + selected_timestep * config.step_time selected_data = human_df[human_df.timestamp == selected_timestamp] @@ -50,6 +88,21 @@ def get_human_position_list(selected_timestep, human_df, config): def get_human_position_from_list(selected_timestep, human_id, selected_data, selected_next_data, config): + """ + Overview: + Get the human position from the human position list at the selected timestep. The human position is a tuple \ + containing the x, y, and theta of the human. + Arguments: + - selected_timestep (:obj:`int`): The selected timestep. + - human_id (:obj:`int`): The human id. + - selected_data (:obj:`pandas.DataFrame`): The human position list at the selected timestep. 
+ - selected_next_data (:obj:`pandas.DataFrame`): The human position list at the next timestep. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - px (:obj:`float`): The x coordinate of the human. + - py (:obj:`float`): The y coordinate of the human. + - theta (:obj:`float`): The orientation of the human. + """ px, py = selected_data.loc[human_id, ["x", "y"]] if selected_timestep < config.num_timestep: @@ -70,11 +123,15 @@ def judge_aoi_update(human_position, robot_position, config): - human_position (:obj:`list`): The position of the human. - robot_position (:obj:`list`): The position of the robot. - config (:obj:`dict`): The configuration of the environment. + Returns: + - should_update (:obj:`bool`): Whether the AoI should be updated. """ should_reset = False for robot_id in range(config.robot_num): - unit_distance = np.sqrt(np.power(robot_position[robot_id][0] - human_position[0], 2) - + np.power(robot_position[robot_id][1] - human_position[1], 2)) + unit_distance = np.sqrt( + np.power(robot_position[robot_id][0] - human_position[0], 2) + + np.power(robot_position[robot_id][1] - human_position[1], 2) + ) if unit_distance <= config.sensing_range: should_reset = True break @@ -83,6 +140,16 @@ def judge_aoi_update(human_position, robot_position, config): def inPoly(polygon, x, y): + """ + Overview: + Judge whether a point is in a polygon. + Arguments: + - polygon (:obj:`list`): The polygon. + - x (:obj:`float`): The x coordinate of the point. + - y (:obj:`float`): The y coordinate of the point. + Returns: + - in_poly (:obj:`bool`): Whether the point is in the polygon. + """ pt = (x, y) line = LineString(polygon) point = Point(pt) @@ -91,12 +158,33 @@ def inPoly(polygon, x, y): def iscrosses(line1, line2): + """ + Overview: + Judge whether two lines cross each other. + Arguments: + - line1 (:obj:`list`): The first line. + - line2 (:obj:`list`): The second line. + Returns: + - crosses (:obj:`bool`): Whether the two lines cross each other. + """ if LineString(line1).crosses(LineString(line2)): return True return False def crossPoly(square, x1, y1, x2, y2): + """ + Overview: + Judge whether a line crosses a polygon. + Arguments: + - square (:obj:`list`): The polygon. + - x1 (:obj:`float`): The x coordinate of the start point of the line. + - y1 (:obj:`float`): The y coordinate of the start point of the line. + - x2 (:obj:`float`): The x coordinate of the end point of the line. + - y2 (:obj:`float`): The y coordinate of the end point of the line. + Returns: + - crosses (:obj:`bool`): Whether the line crosses the polygon. + """ our_line = LineString([[x1, y1], [x2, y2]]) line1 = LineString([square[0], square[2]]) line2 = LineString([square[1], square[3]]) @@ -107,6 +195,18 @@ def crossPoly(square, x1, y1, x2, y2): def judge_collision(new_robot_px, new_robot_py, old_robot_px, old_robot_py, config): + """ + Overview: + Judge whether a collision happens. A collision happens when the new position of the robot is in the no-fly zone. + Arguments: + - new_robot_px (:obj:`float`): The x coordinate of the new position of the robot. + - new_robot_py (:obj:`float`): The y coordinate of the new position of the robot. + - old_robot_px (:obj:`float`): The x coordinate of the old position of the robot. + - old_robot_py (:obj:`float`): The y coordinate of the old position of the robot. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - collision (:obj:`bool`): Whether a collision happens. 
+ """ if config.no_fly_zone is None: return False @@ -126,6 +226,17 @@ def get_theta(x1, y1, x2, y2): def consume_uav_energy(fly_time, hover_time, config): + """ + Overview: + Calculate the energy consumption of the UAV. The energy consumption is calculated based on the power consumption \ + of the UAV in the flying state and the hovering state. + Arguments: + - fly_time (:obj:`float`): The flying time. + - hover_time (:obj:`float`): The hovering time. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - energy (:obj:`float`): The energy consumption of the UAV. + """ # configs Pu = 0.5 # the average transmitted power of each user, W, e.g. mobile phone P0 = 79.8563 # blade profile power, W @@ -153,20 +264,16 @@ def get_border(ur, lf): lower_right = [ur[0], lf[1]] lower_left = [lf[0], lf[1]] - coordinates = [ - upper_left, - upper_right, - lower_right, - lower_left, - upper_left - ] + coordinates = [upper_left, upper_right, lower_right, lower_left, upper_left] - geo_json = {"type": "FeatureCollection", - "properties": { - "lower_left": lower_left, - "upper_right": upper_right - }, - "features": []} + geo_json = { + "type": "FeatureCollection", + "properties": { + "lower_left": lower_left, + "upper_right": upper_right + }, + "features": [] + } grid_feature = { "type": "Feature", @@ -191,10 +298,7 @@ def traj_to_timestamped_geojson(index, trajectory, robot_num, color): # for Point in GeoJSON type for _, row in point_gdf.iterrows(): - corrent_point_coordinates = [ - row["geometry"].xy[0][0], - row["geometry"].xy[1][0] - ] + corrent_point_coordinates = [row["geometry"].xy[0][0], row["geometry"].xy[1][0]] current_time = [row["time"].isoformat()] if index < robot_num: @@ -231,12 +335,10 @@ def traj_to_timestamped_geojson(index, trajectory, robot_num, color): 'radius': radius, 'weight': 1, }, - "style": { # line "color": color, }, "code": 11, - }, } ) @@ -244,5 +346,4 @@ def traj_to_timestamped_geojson(index, trajectory, robot_num, color): if __name__ == "__main__": - print(judge_collision(new_robot_px=6505, new_robot_py=5130, - old_robot_px=6925, old_robot_py=5130)) + print(judge_collision(new_robot_px=6505, new_robot_py=5130, old_robot_px=6925, old_robot_py=5130)) diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py index bcecc89bc..e4d18047d 100644 --- a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -22,25 +22,21 @@ def __init__(self, cfg: dict = {}) -> None: self._replay_path = cfg.get('replay_path', None) self._robot_num = self._cfg.robot_num self._human_num = self._cfg.human_num - self._observation_space = gym.spaces.Dict({ - 'robot_state': gym.spaces.Box( - low=float("-inf"), - high=float("inf"), - shape=(self._robot_num, 4), - dtype=np.float32 - ), - 'human_state': gym.spaces.Box( - low=float("-inf"), - high=float("inf"), - shape=(self._human_num, 4), - dtype=np.float32 - ) - }) + self._observation_space = gym.spaces.Dict( + { + 'robot_state': gym.spaces.Box( + low=float("-inf"), high=float("inf"), shape=(self._robot_num, 4), dtype=np.float32 + ), + 'human_state': gym.spaces.Box( + low=float("-inf"), high=float("inf"), shape=(self._human_num, 4), dtype=np.float32 + ) + } + ) # action space # one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] self.real_action_space = list(product(self._cfg.one_uav_action_space, repeat=self._robot_num)) one_uav_action_n = len(self._cfg.one_uav_action_space) - self._action_space = 
gym.spaces.Discrete(one_uav_action_n**self._robot_num) + self._action_space = gym.spaces.Discrete(one_uav_action_n ** self._robot_num) self._action_space.seed(0) # default seed self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, ), dtype=np.float32) self._continuous = False @@ -48,16 +44,17 @@ def __init__(self, cfg: dict = {}) -> None: # obs_mode '2-dim-array': np.concatenate((robot_state, human_state), axis=0) # obs_mode '1-dim-array': np.concatenate((robot_state, human_state), axis=0).flatten() self.obs_mode = self._cfg.get('obs_mode', '2-dim-array') - assert self.obs_mode in ['dict', '2-dim-array', '1-dim-array'], "obs_mode should be 'dict' or '2-dim-array' or '1-dim-array'!" + assert self.obs_mode in [ + 'dict', '2-dim-array', '1-dim-array' + ], "obs_mode should be 'dict' or '2-dim-array' or '1-dim-array'!" # action_mode 'combine': combine all robot actions into one action, action space size = one_uav_action_n**robot_num # action_mode 'separate': separate robot actions, shape (robot_num,), for each robot action space size = one_uav_action_n self.action_mode = self._cfg.get('action_mode', 'combine') assert self.action_mode in ['combine', 'separate'], "action_mode should be 'combine' or 'separate'!" - def reset(self) -> np.ndarray: if not self._init_flag: - self._env = gym.make('CrowdSim-v0', dataset = self._cfg.dataset, custom_config = self._cfg) + self._env = gym.make('CrowdSim-v0', dataset=self._cfg.dataset, custom_config=self._cfg) self._init_flag = True if hasattr(self, '_seed') and hasattr(self, '_dynamic_seed') and self._dynamic_seed: np_seed = 100 * np.random.randint(1, 1000) diff --git a/zoo/CrowdSim/envs/test_CrowdSim_env.py b/zoo/CrowdSim/envs/test_CrowdSim_env.py index 25a8bc411..81ac5f7ab 100644 --- a/zoo/CrowdSim/envs/test_CrowdSim_env.py +++ b/zoo/CrowdSim/envs/test_CrowdSim_env.py @@ -2,28 +2,31 @@ from easydict import EasyDict from zoo.CrowdSim.envs.CrowdSim_env import CrowdSimEnv -mcfg=EasyDict( - env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = 2, - human_num = 59, # purdue - one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]]) +mcfg = EasyDict( + env_name='CrowdSim-v0', + dataset='purdue', + robot_num=2, + human_num=59, # purdue + one_uav_action_space=[[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +) + def test_naive(cfg): env = CrowdSimEnv(cfg) env.seed(314) assert env._seed == 314 obs = env.reset() - assert obs['observation'].shape == (244,) + assert obs['observation'].shape == (244, ) for i in range(10): random_action = env.random_action() timestep = env.step(random_action) print(timestep) assert isinstance(timestep.obs['observation'], np.ndarray) assert isinstance(timestep.done, bool) - assert timestep.obs['observation'].shape == (244,) + assert timestep.obs['observation'].shape == (244, ) assert timestep.reward.shape == (1, ) print(env.observation_space, env.action_space, env.reward_space) env.close() -test_naive(mcfg) \ No newline at end of file + +test_naive(mcfg) diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py index 09f36d6b7..ee369d2cd 100644 --- a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -3,19 +3,20 @@ from easydict import EasyDict from zoo.CrowdSim.envs.crowdsim_lightzero_env import CrowdSimEnv -mcfg=EasyDict( - env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = 2, - human_num = 59, # purdue - one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]], - obs_mode = 
'2-dim-array', - env_mode = 'easy', - ) +mcfg = EasyDict( + env_name='CrowdSim-v0', + dataset='purdue', + robot_num=2, + human_num=59, # purdue + one_uav_action_space=[[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]], + obs_mode='2-dim-array', + env_mode='easy', +) -@ pytest.mark.envtest +@pytest.mark.envtest class TestCrowdSimEnv: + def test_obs_dict(self): mcfg['obs_mode'] = 'dict' env = CrowdSimEnv(mcfg) @@ -79,4 +80,4 @@ def test_obs_1_dim_array(self): test = TestCrowdSimEnv() # test.test_obs_dict() # test.test_obs_2_dim_array() - test.test_obs_1_dim_array() \ No newline at end of file + test.test_obs_1_dim_array()
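
Note on the sensing-range calibration: the new config module picks each dataset's `sensing_range` from a Maximum Coupling Loss budget via `try_sensing_range`, mixing LOS and NLOS path loss weighted by the human-blocker LOS probability. The sketch below is a standalone re-derivation of the purdue figure (roughly 117 dB at 215 m, hence `sensing_range = 23.2`); the constants are copied from the purdue entry in the hunk above, and the formula is inlined only because the config module's import path is not part of this patch.

# Standalone sketch of the LOS/NLOS coupling-loss calculation behind `sensing_range`.
# Constants are copied from the purdue entry above; the formula mirrors try_sensing_range.
import math

density_of_human_blockers = 30000 / 1667 / 1222   # blockers / m^2 (purdue)
diameter_of_human_blockers = 0.5                  # m
h_rx, h_b, h_d = 1.3, 1.7, 120                    # RX, blocker, drone-BS heights (m)
alpha_los, beta_los, zeta_los = 84.64, 1.55, 0.12
alpha_nlos, beta_nlos, zeta_nlos = 113.63, 1.16, 2.58
g_tx, g_rx = 0, 5                                 # antenna gains (dB)


def coupling_loss(r: float) -> float:
    """Coupling loss (dB) at 2D ground distance r (m), mixing LOS/NLOS path loss."""
    p_los = math.exp(
        -density_of_human_blockers * diameter_of_human_blockers * r * (h_b - h_rx) / (h_d - h_rx)
    )
    d_3d = math.sqrt(r * r + h_d * h_d)           # 3D TX-RX distance
    pl_los = alpha_los + beta_los * 10 * math.log10(d_3d) + zeta_los
    pl_nlos = alpha_nlos + beta_nlos * 10 * math.log10(d_3d) + zeta_nlos
    pl = p_los * pl_los + (1 - p_los) * pl_nlos
    return pl - g_tx - g_rx


if __name__ == "__main__":
    # ~116.9 dB at 215 m, i.e. the "117dB -> 215m -> 23.2 range" row quoted
    # in the calibration comments above.
    print(round(coupling_loss(215), 2))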
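
The `InformationQueue` added to agent.py is what drives the per-human age-of-information bookkeeping: `update` ages every queued block by one step, appends `arrivals` fresh blocks with aoi 0, and drops the `departures` oldest blocks. A minimal usage sketch, assuming the repository root is on PYTHONPATH so the `zoo` package resolves:

# Minimal AoI bookkeeping sketch using the InformationQueue added above.
from zoo.CrowdSim.envs.Crowdsim.env.model.agent import InformationQueue

queue = InformationQueue()
queue.update(arrivals=5, departures=0)   # 5 fresh blocks enter with aoi = 0
queue.update(arrivals=3, departures=2)   # existing blocks age by 1, 3 arrive, 2 oldest leave

# After two steps: 3 surviving old blocks have aoi 1, the 3 new ones have aoi 0.
print(queue.total_blocks())  # 6
print(queue.total_aoi())     # 3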
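
Finally, a note on the spaces exposed by crowdsim_lightzero_env.py: in 'combine' action mode one Discrete index selects a joint move for all UAVs via `itertools.product`, and in '1-dim-array' obs mode the `(robot_num + human_num, 4)` state is flattened, which is where the `(244,)` shape asserted in the tests comes from. The sketch below only reproduces that arithmetic with the purdue test numbers (robot_num=2, human_num=59, five per-UAV actions); it does not instantiate the environment.

# Sketch of the 'combine' action encoding and the flat observation size used in the tests.
from itertools import product

one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]]
robot_num, human_num = 2, 59

# 'combine' mode: one Discrete action indexes a joint move for all UAVs.
real_action_space = list(product(one_uav_action_space, repeat=robot_num))
assert len(real_action_space) == len(one_uav_action_space) ** robot_num  # Discrete(25)

action_index = 10
print(real_action_space[action_index])   # per-UAV (dx, dy) pair selected by this index

# '1-dim-array' obs mode: (robot_num + human_num) agents x 4 features, flattened.
print((robot_num + human_num) * 4)       # 244, matching the shape asserted in the tests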