From c27ae921c1d9d4c8a51dddf8573d27c7f580b1d7 Mon Sep 17 00:00:00 2001 From: nighood Date: Thu, 8 Jun 2023 21:48:12 +0800 Subject: [PATCH 01/16] env(rjy): add crowdsim env --- lzero/worker/__init__.py | 2 +- lzero/worker/muzero_collector.py | 88 +++++++++---- lzero/worker/muzero_evaluator.py | 5 +- zoo/CrowdSim/__init__.py | 0 .../config/CrowdSim_efficientzero_config.py | 99 ++++++++++++++ zoo/CrowdSim/config/__init__.py | 0 zoo/CrowdSim/envs/CrowdSim_env.py | 122 ++++++++++++++++++ zoo/CrowdSim/envs/__init__.py | 0 zoo/CrowdSim/envs/test_CrowdSim_env.py | 28 ++++ 9 files changed, 319 insertions(+), 25 deletions(-) create mode 100644 zoo/CrowdSim/__init__.py create mode 100644 zoo/CrowdSim/config/CrowdSim_efficientzero_config.py create mode 100644 zoo/CrowdSim/config/__init__.py create mode 100644 zoo/CrowdSim/envs/CrowdSim_env.py create mode 100644 zoo/CrowdSim/envs/__init__.py create mode 100644 zoo/CrowdSim/envs/test_CrowdSim_env.py diff --git a/lzero/worker/__init__.py b/lzero/worker/__init__.py index ef4ab2c65..aafaf5410 100644 --- a/lzero/worker/__init__.py +++ b/lzero/worker/__init__.py @@ -2,4 +2,4 @@ from .alphazero_evaluator import AlphaZeroEvaluator from .muzero_collector import MuZeroCollector from .muzero_evaluator import MuZeroEvaluator -from .gumbel_muzero_collector import GumbelMuZeroCollector +# from .gumbel_muzero_collector import GumbelMuZeroCollector diff --git a/lzero/worker/muzero_collector.py b/lzero/worker/muzero_collector.py index fffe6e84e..ec8ad7ebd 100644 --- a/lzero/worker/muzero_collector.py +++ b/lzero/worker/muzero_collector.py @@ -520,12 +520,28 @@ def collect(self, if timestep.done: self._total_episode_count += 1 reward = timestep.info['eval_episode_return'] - info = { - 'reward': reward, - 'time': self._env_info[env_id]['time'], - 'step': self._env_info[env_id]['step'], - 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], - } + if timestep.info.get('performance_info') is not None: + mean_aoi = timestep.info['performance_info']['mean_aoi'] + mean_energy_consumption = timestep.info['performance_info']['mean_energy_consumption'] + collected_data_amount = timestep.info['performance_info']['collected_data_amount'] + human_coverage = timestep.info['performance_info']['human_coverage'] + info = { + 'reward': reward, + 'time': self._env_info[env_id]['time'], + 'step': self._env_info[env_id]['step'], + 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], + 'mean_aoi': mean_aoi, + 'mean_energy_consumption': mean_energy_consumption, + 'collected_data_amount': collected_data_amount, + 'human_coverage': human_coverage, + } + else: + info = { + 'reward': reward, + 'time': self._env_info[env_id]['time'], + 'step': self._env_info[env_id]['step'], + 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], + } if self.policy_config.gumbel_algo: info['completed_value'] = completed_value_lst[env_id] / eps_steps_lst[env_id] collected_episode += 1 @@ -650,23 +666,49 @@ def _output_log(self, train_iter: int) -> None: if self.policy_config.gumbel_algo: completed_value = [d['completed_value'] for d in self._episode_info] self._total_duration += duration - info = { - 'episode_count': episode_count, - 'envstep_count': envstep_count, - 'avg_envstep_per_episode': envstep_count / episode_count, - 'avg_envstep_per_sec': envstep_count / duration, - 'avg_episode_per_sec': episode_count / duration, - 'collect_time': duration, - 'reward_mean': np.mean(episode_reward), - 'reward_std': np.std(episode_reward), - 'reward_max': 
np.max(episode_reward), - 'reward_min': np.min(episode_reward), - 'total_envstep_count': self._total_envstep_count, - 'total_episode_count': self._total_episode_count, - 'total_duration': self._total_duration, - 'visit_entropy': np.mean(visit_entropy), - # 'each_reward': episode_reward, - } + if self._episode_info[0].get('mean_aoi') is not None: + episode_aoi = [d['mean_aoi'] for d in self._episode_info] + episode_energy_consumption = [d['mean_energy_consumption'] for d in self._episode_info] + episode_collected_data_amount = [d['collected_data_amount'] for d in self._episode_info] + episode_human_coverage = [d['human_coverage'] for d in self._episode_info] + info = { + 'episode_count': episode_count, + 'envstep_count': envstep_count, + 'avg_envstep_per_episode': envstep_count / episode_count, + 'avg_envstep_per_sec': envstep_count / duration, + 'avg_episode_per_sec': episode_count / duration, + 'collect_time': duration, + 'reward_mean': np.mean(episode_reward), + 'reward_std': np.std(episode_reward), + 'reward_max': np.max(episode_reward), + 'reward_min': np.min(episode_reward), + 'total_envstep_count': self._total_envstep_count, + 'total_episode_count': self._total_episode_count, + 'total_duration': self._total_duration, + 'visit_entropy': np.mean(visit_entropy), + 'episode_mean_aoi': np.mean(episode_aoi), + 'episode_mean_energy_consumption': np.mean(episode_energy_consumption), + 'episode_mean_collected_data_amount': np.mean(episode_collected_data_amount), + 'episode_mean_human_coverage': np.mean(episode_human_coverage), + } + else: + info = { + 'episode_count': episode_count, + 'envstep_count': envstep_count, + 'avg_envstep_per_episode': envstep_count / episode_count, + 'avg_envstep_per_sec': envstep_count / duration, + 'avg_episode_per_sec': episode_count / duration, + 'collect_time': duration, + 'reward_mean': np.mean(episode_reward), + 'reward_std': np.std(episode_reward), + 'reward_max': np.max(episode_reward), + 'reward_min': np.min(episode_reward), + 'total_envstep_count': self._total_envstep_count, + 'total_episode_count': self._total_episode_count, + 'total_duration': self._total_duration, + 'visit_entropy': np.mean(visit_entropy), + # 'each_reward': episode_reward, + } if self.policy_config.gumbel_algo: info['completed_value'] = np.mean(completed_value) self._episode_info.clear() diff --git a/lzero/worker/muzero_evaluator.py b/lzero/worker/muzero_evaluator.py index 04d6fece9..eb2832a97 100644 --- a/lzero/worker/muzero_evaluator.py +++ b/lzero/worker/muzero_evaluator.py @@ -331,7 +331,10 @@ def eval( # Env reset is done by env_manager automatically. 
self._policy.reset([env_id]) reward = t.info['eval_episode_return'] - if 'episode_info' in t.info: + # 'performance_info' and 'episode_info' only choose one + if 'performance_info' in t.info: + eval_monitor.update_info(env_id, t.info['performance_info']) + elif 'episode_info' in t.info: eval_monitor.update_info(env_id, t.info['episode_info']) eval_monitor.update_reward(env_id, reward) self._logger.info( diff --git a/zoo/CrowdSim/__init__.py b/zoo/CrowdSim/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py new file mode 100644 index 000000000..acfbff5dc --- /dev/null +++ b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py @@ -0,0 +1,99 @@ +from easydict import EasyDict + +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(1e5) +reanalyze_ratio = 0. +robot_num = 2 +human_num = 59 # purdue +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_efficientzero_config = dict( + exp_name= + f'/root/result/crowd_result/CrowdSim_efficientzero_ns{num_simulations}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', + env=dict( + env_name='CrowdSim-v0', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + observation_shape=(robot_num+human_num)*4, + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='mlp', + lstm_hidden_size=256, + latent_state_dim=256, + discrete_action_encoding_type='one_hot', + # res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. 
+ collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_efficientzero_config = EasyDict(CrowdSim_efficientzero_config) +main_config = CrowdSim_efficientzero_config + +CrowdSim_efficientzero_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.CrowdSim_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='efficientzero', + import_names=['lzero.policy.efficientzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_efficientzero_create_config = EasyDict(CrowdSim_efficientzero_create_config) +create_config = CrowdSim_efficientzero_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. + """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/__init__.py b/zoo/CrowdSim/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/zoo/CrowdSim/envs/CrowdSim_env.py b/zoo/CrowdSim/envs/CrowdSim_env.py new file mode 100644 index 000000000..c88ae21b5 --- /dev/null +++ b/zoo/CrowdSim/envs/CrowdSim_env.py @@ -0,0 +1,122 @@ +from typing import Union, Optional + +import gym +import numpy as np +from itertools import product +import logging + +from ding.envs import BaseEnv, BaseEnvTimestep +from ding.envs import ObsPlusPrevActRewWrapper +from ding.torch_utils import to_ndarray +from ding.utils import ENV_REGISTRY + +import CrowdSim.envs as envs + + +@ENV_REGISTRY.register('crowdsim_lightzero') +class CrowdSimEnv(BaseEnv): + + def __init__(self, cfg: dict = {}) -> None: + self._cfg = cfg + self._init_flag = False + self._replay_path = None + self._robot_num = self._cfg.robot_num + self._human_num = self._cfg.human_num + self._observation_space = gym.spaces.Box( + low=float("-inf"), + high=float("inf"), + shape=((self._robot_num+self._human_num)*4,), + dtype=np.float32) + # action space + # one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] + self.real_action_space = list(product(self._cfg.one_uav_action_space, repeat=self._robot_num)) + one_uav_action_n = len(self._cfg.one_uav_action_space) + self._action_space = gym.spaces.Discrete(one_uav_action_n**self._robot_num) + self._action_space.seed(0) # default seed + self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, ), dtype=np.float32) + self._continuous = False + + def reset(self) -> np.ndarray: + if not self._init_flag: + self._env = gym.make('CrowdSim-v0') + # if self._replay_path is not None: + # self._env = gym.wrappers.RecordVideo( + # self._env, + # video_folder=self._replay_path, + # episode_trigger=lambda episode_id: True, + # name_prefix='rl-video-{}'.format(id(self)) + # ) + # if hasattr(self._cfg, 'obs_plus_prev_action_reward') and self._cfg.obs_plus_prev_action_reward: + # self._env = ObsPlusPrevActRewWrapper(self._env) + self._init_flag = True + if hasattr(self, '_seed') and hasattr(self, 
'_dynamic_seed') and self._dynamic_seed: + np_seed = 100 * np.random.randint(1, 1000) + self._env.seed(self._seed + np_seed) + self._action_space.seed(self._seed + np_seed) + elif hasattr(self, '_seed'): + self._env.seed(self._seed) + self._action_space.seed(self._seed) + self._eval_episode_return = 0 + # process obs + raw_obs = self._env.reset() + obs_list = to_ndarray(raw_obs.to_tensor()) + # human_obs, robot_obs = obs_list + obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) + action_mask = np.ones(self.action_space.n, 'int8') + obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} + + return obs + + def close(self) -> None: + if self._init_flag: + self._env.close() + self._init_flag = False + + def seed(self, seed: int, dynamic_seed: bool = True) -> None: + self._seed = seed + self._dynamic_seed = dynamic_seed + np.random.seed(self._seed) + + def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: + if isinstance(action, np.ndarray) and action.shape == (1, ): + action = action.squeeze() # 0-dim array + real_action = self.real_action_space[action] + assert isinstance(real_action, tuple) and len(real_action) == self._robot_num, "illegal action!" + raw_obs, rew, done, info = self._env.step(real_action) + obs_list = to_ndarray(raw_obs.to_tensor()) + obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) + + self._eval_episode_return += rew + if done: + info['eval_episode_return'] = self._eval_episode_return + # logging.INFO('one game finish!') + + action_mask = np.ones(self.action_space.n, 'int8') + obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} + rew = to_ndarray([rew]).astype(np.float32) + return BaseEnvTimestep(obs, rew, done, info) + + def enable_save_replay(self, replay_path: Optional[str] = None) -> None: + if replay_path is None: + replay_path = './video' + self._replay_path = replay_path + + def random_action(self) -> np.ndarray: + random_action = self.action_space.sample() + random_action = to_ndarray([random_action], dtype=np.int64) + return random_action + + @property + def observation_space(self) -> gym.spaces.Space: + return self._observation_space + + @property + def action_space(self) -> gym.spaces.Space: + return self._action_space + + @property + def reward_space(self) -> gym.spaces.Space: + return self._reward_space + + def __repr__(self) -> str: + return "LightZero CrowdSim Env" diff --git a/zoo/CrowdSim/envs/__init__.py b/zoo/CrowdSim/envs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/zoo/CrowdSim/envs/test_CrowdSim_env.py b/zoo/CrowdSim/envs/test_CrowdSim_env.py new file mode 100644 index 000000000..4f95511dc --- /dev/null +++ b/zoo/CrowdSim/envs/test_CrowdSim_env.py @@ -0,0 +1,28 @@ +import numpy as np +from easydict import EasyDict +from zoo.CrowdSim.envs.CrowdSim_env import CrowdSimEnv + +mcfg=EasyDict( + env_name='CrowdSim-v0', + robot_num = 2, + human_num = 59, # purdue + one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]]) + +def test_naive(cfg): + env = CrowdSimEnv(cfg) + env.seed(314) + assert env._seed == 314 + obs = env.reset() + assert obs['observation'].shape == (244,) + for i in range(10): + random_action = env.random_action() + timestep = env.step(random_action) + print(timestep) + assert isinstance(timestep.obs['observation'], np.ndarray) + assert isinstance(timestep.done, bool) + assert timestep.obs['observation'].shape == (244,) + assert timestep.reward.shape == (1, ) + print(env.observation_space, env.action_space, 
env.reward_space) + env.close() + +test_naive(mcfg) \ No newline at end of file From c6acd7dc519e1af9d770954c72a816e9ca57d3c9 Mon Sep 17 00:00:00 2001 From: nighood Date: Mon, 13 Nov 2023 11:49:35 +0800 Subject: [PATCH 02/16] config(rjy): add mz/ez config for crowdsim --- .../config/CrowdSim_efficientzero_config.py | 10 +- zoo/CrowdSim/config/CrowdSim_muzero_config.py | 105 ++++++++++++++++++ zoo/CrowdSim/envs/CrowdSim_env.py | 2 + 3 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 zoo/CrowdSim/config/CrowdSim_muzero_config.py diff --git a/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py index acfbff5dc..0252c64ee 100644 --- a/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py +++ b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py @@ -1,4 +1,6 @@ from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '2' # ============================================================== # begin of the most frequently changed config specified by the user @@ -9,10 +11,12 @@ num_simulations = 25 update_per_collect = 100 batch_size = 256 -max_env_step = int(1e5) +max_env_step = int(3e5) reanalyze_ratio = 0. robot_num = 2 -human_num = 59 # purdue +human_num = 10 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] # ============================================================== # end of the most frequently changed config specified by the user @@ -20,7 +24,7 @@ CrowdSim_efficientzero_config = dict( exp_name= - f'/root/result/crowd_result/CrowdSim_efficientzero_ns{num_simulations}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', + f'result/crowd_num_human/CrowdSim_efficientzero_step{max_env_step}_uav{robot_num}_human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', env=dict( env_name='CrowdSim-v0', robot_num = robot_num, diff --git a/zoo/CrowdSim/config/CrowdSim_muzero_config.py b/zoo/CrowdSim/config/CrowdSim_muzero_config.py new file mode 100644 index 000000000..8def49e3e --- /dev/null +++ b/zoo/CrowdSim/config/CrowdSim_muzero_config.py @@ -0,0 +1,105 @@ +from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '2' +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(3e5) +reanalyze_ratio = 0. 
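+# NOTE: the joint action space of the UAV team is the Cartesian product of ``one_uav_action_space``
+# over all UAVs, so the policy is configured with action_space_size = len(one_uav_action_space) ** robot_num
+# (5 ** 2 = 25 for the default setting below).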
+robot_num = 2 +human_num = 10 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_muzero_config = dict( + exp_name= + f'result/crowd_num_human/CrowdSim_muzero_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', + env=dict( + env_name='CrowdSim-v0', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + observation_shape=(robot_num+human_num)*4, + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='mlp', + lstm_hidden_size=256, + latent_state_dim=256, + self_supervised_learning_loss=True, # NOTE: default is False. + discrete_action_encoding_type='one_hot', + res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + ssl_loss_weight=2, # NOTE: default is 0. + grad_clip_value=0.5, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_muzero_config = EasyDict(CrowdSim_muzero_config) +main_config = CrowdSim_muzero_config + +CrowdSim_muzero_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.CrowdSim_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='muzero', + import_names=['lzero.policy.muzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_muzero_create_config = EasyDict(CrowdSim_muzero_create_config) +create_config = CrowdSim_muzero_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. 
+ """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/envs/CrowdSim_env.py b/zoo/CrowdSim/envs/CrowdSim_env.py index c88ae21b5..8af0729aa 100644 --- a/zoo/CrowdSim/envs/CrowdSim_env.py +++ b/zoo/CrowdSim/envs/CrowdSim_env.py @@ -62,6 +62,7 @@ def reset(self) -> np.ndarray: obs_list = to_ndarray(raw_obs.to_tensor()) # human_obs, robot_obs = obs_list obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) + assert len(obs)==(self._robot_num+self._human_num)*4 action_mask = np.ones(self.action_space.n, 'int8') obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} @@ -85,6 +86,7 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: raw_obs, rew, done, info = self._env.step(real_action) obs_list = to_ndarray(raw_obs.to_tensor()) obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) + assert len(obs)==(self._robot_num+self._human_num)*4 self._eval_episode_return += rew if done: From ad0cd02da2202a6ee1654ba40fc7310cb91cf0de Mon Sep 17 00:00:00 2001 From: nighood Date: Sun, 7 Apr 2024 23:17:25 +0800 Subject: [PATCH 03/16] env(rjy): add crowdsim env --- zoo/CrowdSim/envs/CrowdSim_env.py | 15 +- zoo/CrowdSim/envs/Crowdsim/__init__.py | 7 + zoo/CrowdSim/envs/Crowdsim/env/__init__.py | 1 + .../envs/Crowdsim/env/base_env_config.py | 137 ++++++++++ zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 252 ++++++++++++++++++ .../envs/Crowdsim/env/model/__init__.py | 0 zoo/CrowdSim/envs/Crowdsim/env/model/agent.py | 72 +++++ zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py | 89 +++++++ zoo/CrowdSim/envs/Crowdsim/env/model/utils.py | 248 +++++++++++++++++ zoo/CrowdSim/envs/test_CrowdSim_env.py | 1 + 10 files changed, 810 insertions(+), 12 deletions(-) create mode 100644 zoo/CrowdSim/envs/Crowdsim/__init__.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/__init__.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/model/__init__.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/model/agent.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/model/utils.py diff --git a/zoo/CrowdSim/envs/CrowdSim_env.py b/zoo/CrowdSim/envs/CrowdSim_env.py index 8af0729aa..21d31863e 100644 --- a/zoo/CrowdSim/envs/CrowdSim_env.py +++ b/zoo/CrowdSim/envs/CrowdSim_env.py @@ -10,7 +10,7 @@ from ding.torch_utils import to_ndarray from ding.utils import ENV_REGISTRY -import CrowdSim.envs as envs +import zoo.CrowdSim.envs.Crowdsim.env @ENV_REGISTRY.register('crowdsim_lightzero') @@ -38,16 +38,7 @@ def __init__(self, cfg: dict = {}) -> None: def reset(self) -> np.ndarray: if not self._init_flag: - self._env = gym.make('CrowdSim-v0') - # if self._replay_path is not None: - # self._env = gym.wrappers.RecordVideo( - # self._env, - # video_folder=self._replay_path, - # episode_trigger=lambda episode_id: True, - # name_prefix='rl-video-{}'.format(id(self)) - # ) - # if hasattr(self._cfg, 'obs_plus_prev_action_reward') and self._cfg.obs_plus_prev_action_reward: - # self._env = ObsPlusPrevActRewWrapper(self._env) + self._env = gym.make('CrowdSim-v0', dataset = self._cfg.dataset, custom_config = self._cfg) self._init_flag = True if hasattr(self, '_seed') and hasattr(self, '_dynamic_seed') and self._dynamic_seed: np_seed = 100 * np.random.randint(1, 1000) @@ -59,7 
+50,7 @@ def reset(self) -> np.ndarray: self._eval_episode_return = 0 # process obs raw_obs = self._env.reset() - obs_list = to_ndarray(raw_obs.to_tensor()) + obs_list = raw_obs.to_array() # human_obs, robot_obs = obs_list obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) assert len(obs)==(self._robot_num+self._human_num)*4 diff --git a/zoo/CrowdSim/envs/Crowdsim/__init__.py b/zoo/CrowdSim/envs/Crowdsim/__init__.py new file mode 100644 index 000000000..5fb297682 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/__init__.py @@ -0,0 +1,7 @@ +import logging +from gym.envs.registration import register +logger = logging.getLogger(__name__) +register( + id='CrowdSim-v0', + entry_point='zoo.CrowdSim.envs.Crowdsim.env.crowd_sim:CrowdSim', +) diff --git a/zoo/CrowdSim/envs/Crowdsim/env/__init__.py b/zoo/CrowdSim/envs/Crowdsim/env/__init__.py new file mode 100644 index 000000000..eddfe3037 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/__init__.py @@ -0,0 +1 @@ +from .crowd_sim import CrowdSim diff --git a/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py b/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py new file mode 100644 index 000000000..865ab56fb --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py @@ -0,0 +1,137 @@ +from easydict import EasyDict + +# define base config +base_config = EasyDict({ + "num_timestep": 120, # 120x15=1800s=30min + "step_time": 15, # second per step + "max_uav_energy": 359640, # 359640 J <-- 359.64 kJ (4500mah, 22.2v) 大疆经纬 + "rotation_limit": 360, + "diameter_of_human_blockers": 0.5, # m + "h_rx": 1.3, # m, height of RX + "h_b": 1.7, # m, height of a human blocker + "velocity": 18, + "frequence_band": 28, # GHz + "h_d": 120, # m, height of drone-BS + "alpha_nlos": 113.63, + "beta_nlos": 1.16, + "zeta_nlos": 2.58, # Frequency 28GHz, sub-urban. 
channel model + "alpha_los": 84.64, + "beta_los": 1.55, + "zeta_los": 0.12, + "g_tx": 0, # dB + "g_rx": 5, # dB + "tallest_locs": None, # obstacle + "no_fly_zone": None, # obstacle + "start_timestamp": 1519894800, + "end_timestamp": 1519896600, + "energy_factor": 3, # TODO: energy factor in reward function + "robot_num": 2, # TODO: 多了要用多进程 + "rollout_num": 1, # 1 2 6 12 15, calculated based on robot_num +}) + +# define all dataset configs +dataset_configs = { + 'purdue': EasyDict({ + "lower_left": [-86.93, 40.4203], # 经纬度 + "upper_right": [-86.9103, 40.4313], + "nlon": 200, + "nlat": 120, + "human_num": 59, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/purdue/59 users.csv', + "sensing_range": 23.2, # unit 23.2 + "one_uav_action_space": [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30], [21, 21], [21, -21], [-21, 21], [-21, -21]], + "max_x_distance": 1667, # m + "max_y_distance": 1222, # m + "density_of_human_blockers": 30000 / 1667 / 1222, # block/m2 + }), + 'ncsu': EasyDict({ + "lower_left": [-78.6988, 35.7651], # 经纬度 + "upper_right": [-78.6628, 35.7896], + "nlon": 3600, + "nlat": 2450, + "human_num": 33, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/NCSU/33 users.csv', + "sensing_range": 220, # unit 220 + "one_uav_action_space": [[0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210]], + "max_x_distance": 3255.4913305859623, + "max_y_distance": 2718.3945272795013, + "density_of_human_blockers": 30000 / 3255.4913305859623 / 2718.3945272795013, # block/m2 + }), + 'kaist': EasyDict({ + "lower_left": [127.3475, 36.3597], + "upper_right": [127.3709, 36.3793], + "nlon": 2340, + "nlat": 1960, + "human_num": 92, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/KAIST/92 users.csv', + "sensing_range": 220, # unit 220 + "one_uav_action_space": [[0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210]], + "max_x_distance": 2100.207579392558, + "max_y_distance": 2174.930950809533, + "density_of_human_blockers": 30000 / 2100.207579392558 / 2174.930950809533, # block/m2 + }), + # ... 
could add more dataset configs here +} + +# get config according to data set name +def get_selected_config(data_set_name): + if data_set_name in dataset_configs: + dataset_config = dataset_configs[data_set_name] + return EasyDict({**base_config, **dataset_config}) + else: + raise ValueError(f"Data set '{data_set_name}' not found.") + +# r:meters, 2d distance +# threshold: dB +def try_sensing_range(r, data_set_name): + import math + config = get_selected_config(data_set_name) + p_los = math.exp( + -config.density_of_human_blockers * config.diameter_of_human_blockers * r * (config.h_b - config.h_rx) / ( + config.h_d - config.h_rx)) + p_nlos = 1 - p_los + PL_los = config.alpha_los + config.beta_los * 10 * math.log10( + math.sqrt(r * r + config.h_d * config.h_d)) + config.zeta_los + PL_nlos = config.alpha_nlos + config.beta_nlos * 10 * math.log10( + math.sqrt(r * r + config.h_d * config.h_d)) + config.zeta_nlos + PL = p_los * PL_los + p_nlos * PL_nlos + CL = PL - config.g_tx - config.g_rx + print(p_los, p_nlos) + print(CL) + + +# Maximum Coupling Loss (110dB is recommended) +# purdue: + +# 123dB -> 560m -> 60.5 range +# 121dB -> 420m -> 45.4 range +# 119dB -> 300m -> 32.4 range +# 117dB -> 215m -> 23.2 range √ +# 115dB -> 140m -> 15 range + +# ncsu: +# 123dB -> 600m -> 600 range +# 121dB -> 435m -> 435 range +# 119dB -> 315m -> 315 range +# 117dB -> 220m -> 220 range √ +# 115dB -> 145m -> 145 range + +# kaist: +# 123dB -> 600m -> 600 range +# 121dB -> 435m -> 435 range +# 119dB -> 315m -> 315 range +# 117dB -> 220m -> 220 range √ +# 115dB -> 145m -> 145 range + +# san: +# 123dB -> 600m -> 600 range +# 121dB -> 450m -> 450 range +# 119dB -> 330m -> 330 range +# 117dB -> 240m -> 240 range √ +# 115dB -> 165m -> 165 range + +if __name__ == '__main__': + # example usage + data_set_name = 'purdue' + selected_config = get_selected_config(data_set_name) + print(selected_config) \ No newline at end of file diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py new file mode 100644 index 000000000..c27a888d5 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -0,0 +1,252 @@ +import pandas as pd + +import logging +import random +import gym +from shapely.geometry import Point +import numpy as np +import folium +from folium.plugins import TimestampedGeoJson, AntPath + +from zoo.CrowdSim.envs.Crowdsim.env.model.utils import * +from zoo.CrowdSim.envs.Crowdsim.env.model.mdp import HumanState, RobotState, JointState +from zoo.CrowdSim.envs.Crowdsim.env.base_env_config import get_selected_config + + + +class CrowdSim(gym.Env): + metadata = {'render.modes': ['human']} + + def __init__(self, dataset, custom_config=None): + # mcfg should include: + self.time_limit = None + self.robots = None + self.humans = None + self.agent = None + self.current_timestep = None + self.phase = None + + self.config = get_selected_config(dataset) + self.config.update(custom_config) + + self.human_num = self.config.human_num + self.robot_num = self.config.robot_num + self.num_timestep = self.config.num_timestep # max timestep + self.step_time = self.config.step_time # second per step + self.start_timestamp = self.config.start_timestamp # fit timpestamp to datetime + self.max_uav_energy = self.config.max_uav_energy + # self.action_space = gym.spaces.Discrete(4**self.robot_num) # for each robot have 4 actions(up, down, left, right), then product + self.action_space = gym.spaces.Discrete(len(self.config.one_uav_action_space)) + # human obs: [px, py, theta, aoi] + # robot 
obs: [px, py, theta, energy] + # self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(4), dtype=np.float32) + self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(self.robot_num+self.human_num, 4), dtype=np.float32) + + # load_dataset + self.nlon = self.config.nlon + self.nlat = self.config.nlat + self.lower_left = self.config.lower_left + self.upper_right = self.config.upper_right + self.human_df = pd.read_csv(self.config.dataset_dir) + logging.info("Finished reading {} rows".format(len(self.human_df))) + # # for temporarily processing data + # sample_list=np.random.choice(self.human_num, size=[50,], replace=False) + # sample_list=sample_list[np.argsort(sample_list)] + # print(sample_list) + # self.human_df= self.human_df[self.human_df["id"].isin(sample_list)] + # for i,human_id in enumerate(sample_list): + # mask=(self.human_df["id"]==human_id) + # self.human_df.loc[mask,"id"]=i + # self.human_df=self.human_df.sort_values(by=["id","timestamp"],ascending=[True,True]) + # print(self.human_df.head()) + # self.human_df.to_csv("50 users-5.csv",index=False) + # exit(0) + + self.human_df['t'] = pd.to_datetime(self.human_df['timestamp'], unit='s') # 's' stands for second + self.human_df['aoi'] = -1 # 加入aoi记录aoi + self.human_df['energy'] = -1 # 加入energy记录energy + logging.info('human number: {}'.format(self.human_num)) + logging.info('Robot number: {}'.format(self.robot_num)) + + # for debug + self.current_human_aoi_list = np.ones([self.human_num, ]) + self.mean_aoi_timelist = np.ones([self.config.num_timestep + 1, ]) + self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.robot_x_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.update_human_timelist = np.zeros([self.config.num_timestep, ]) + self.data_collection = 0 + + def set_agent(self, agent): + self.agent = agent + + def generate_human(self, human_id, selected_data, selected_next_data): + human = Human(human_id, self.config) + px, py, theta = get_human_position_from_list(self.current_timestep, human_id, selected_data, selected_next_data, self.config) + # human obs: [px, py, theta, aoi] + human.set(px, py, theta, 1) # initial aoi of human is 1 + return human + + def generate_robot(self, robot_id): + robot = Robot(robot_id, self.config) + # robot obs: [px, py, theta, energy] + robot.set(self.nlon / 2, self.nlat / 2, 0, self.max_uav_energy) # robot有energy + return robot + + def sync_human_df(self, human_id, current_timestep, aoi): + """ + Overview: + Sync the human_df with the current timestep and aoi. + Args: + - human_id (:obj:`int`): The id of the human. + - current_timestep (:obj:`int`): The current timestep. + - aoi (:obj:`int`): The aoi of the human. 
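+        Note:
+            The single-cell update is written with ``DataFrame.iat`` on a pre-computed row index,
+            which is faster than the equivalent ``DataFrame.loc`` assignment (the slower variant is
+            kept as a comment in the body).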
+ """ + current_timestamp = self.start_timestamp + current_timestep * self.step_time + current_index = self.human_df[ + (self.human_df.id == human_id) & (self.human_df.timestamp == current_timestamp)].index + # self.human_df.loc[current_index, "aoi"] = aoi # slower + self.human_df.iat[current_index.values[0], 9] = aoi # faster + + def reset(self, phase='test', test_case=None): + self.current_timestep = 0 + + # generate human + self.humans = [] + selected_data, selected_next_data = get_human_position_list(self.current_timestep, self.human_df, self.config) + for human_id in range(self.human_num): + self.humans.append(self.generate_human(human_id, selected_data, selected_next_data)) + self.sync_human_df(human_id, self.current_timestep, 1) + + # generate robot + self.robots = [] + for robot_id in range(self.robot_num): + self.robots.append(self.generate_robot(robot_id)) + + self.current_human_aoi_list = np.ones([self.human_num, ]) + self.mean_aoi_timelist = np.ones([self.config.num_timestep + 1, ]) + self.mean_aoi_timelist[self.current_timestep] = np.mean(self.current_human_aoi_list) + self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.robot_energy_timelist[self.current_timestep, :] = self.max_uav_energy + self.robot_x_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.robot_x_timelist[self.current_timestep, :] = self.nlon / 2 + self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) + self.robot_y_timelist[self.current_timestep, :] = self.nlat / 2 + self.update_human_timelist = np.zeros([self.config.num_timestep, ]) + self.data_collection = 0 + + # for visualization + self.plot_states = [] + self.robot_actions = [] + self.rewards = [] + self.action_values = [] + self.plot_states.append([[robot.get_obs() for robot in self.robots], + [human.get_obs() for human in self.humans]]) + + state = JointState([robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]) + return state + + def step(self, action): + new_robot_position = np.zeros([self.robot_num, 2]) + current_enenrgy_consume = np.zeros([self.robot_num, ]) + + num_updated_human = 0 # number of humans whose AoI is updated + + for robot_id, robot in enumerate(self.robots): + new_robot_px = robot.px + action[robot_id][0] + new_robot_py = robot.py + action[robot_id][1] + robot_theta = get_theta(0, 0, action[robot_id][0], action[robot_id][1]) + # print(action[robot_id], robot_theta) + is_stopping = True if (action[robot_id][0] == 0 and action[robot_id][1] == 0) else False + is_collide = True if judge_collision(new_robot_px, new_robot_py, robot.px, robot.py, self.config) else False + + if is_stopping is True: + consume_energy = consume_uav_energy(0, self.step_time, self.config) + else: + consume_energy = consume_uav_energy(self.step_time, 0, self.config) + current_enenrgy_consume[robot_id] = consume_energy / self.config.max_uav_energy + new_energy = robot.energy - consume_energy + self.robot_energy_timelist[self.current_timestep + 1][robot_id] = new_energy + + if is_collide is True: + new_robot_position[robot_id][0] = robot.px + new_robot_position[robot_id][1] = robot.py + self.robot_x_timelist[self.current_timestep + 1][robot_id] = robot.px + self.robot_y_timelist[self.current_timestep + 1][robot_id] = robot.py + robot.set(robot.px, robot.py, robot_theta, energy=new_energy) + else: + new_robot_position[robot_id][0] = new_robot_px + new_robot_position[robot_id][1] = new_robot_py + self.robot_x_timelist[self.current_timestep + 
1][robot_id] = new_robot_px + self.robot_y_timelist[self.current_timestep + 1][robot_id] = new_robot_py + robot.set(new_robot_px, new_robot_py, robot_theta, energy=new_energy) + + selected_data, selected_next_data = get_human_position_list(self.current_timestep + 1, self.human_df, self.config) + delta_human_aoi_list = np.zeros_like(self.current_human_aoi_list) # 0 means no update + for human_id, human in enumerate(self.humans): + next_px, next_py, next_theta = get_human_position_from_list(self.current_timestep + 1, human_id, + selected_data, selected_next_data, self.config) + should_reset = judge_aoi_update([next_px, next_py], new_robot_position, self.config) + if should_reset: + # if the human is in the range of the robot, then reset the aoi of the human + if human.aoi > 1: + delta_human_aoi_list[human_id] = human.aoi + else: + delta_human_aoi_list[human_id] = 1 + + human.set(next_px, next_py, next_theta, aoi=1) + num_updated_human += 1 + else: + # if the human is not in the range of the robot, then update the aoi of the human + delta_human_aoi_list[human_id] = 0 + new_aoi = human.aoi + 1 + human.set(next_px, next_py, next_theta, aoi=new_aoi) + + self.current_human_aoi_list[human_id] = human.aoi + self.sync_human_df(human_id, self.current_timestep + 1, human.aoi) + + self.mean_aoi_timelist[self.current_timestep + 1] = np.mean(self.current_human_aoi_list) + self.update_human_timelist[self.current_timestep] = num_updated_human + delta_sum_aoi = np.sum(delta_human_aoi_list) + self.data_collection += (delta_sum_aoi * 0.3) # Mb, 0.02M/s per person + + # TODO: need to be well-defined + reward = self.mean_aoi_timelist[self.current_timestep] - self.mean_aoi_timelist[self.current_timestep + 1] \ + - self.config.energy_factor * np.sum(current_enenrgy_consume) + + # if hasattr(self.agent.policy, 'action_values'): + # self.action_values.append(self.agent.policy.action_values) + self.robot_actions.append(action) + self.rewards.append(reward) + self.plot_states.append([[robot.get_obs() for robot in self.robots], + [human.get_obs() for human in self.humans]]) + + next_state = JointState([robot.get_obs() for robot in self.robots], + [human.get_obs() for human in self.humans]) + + self.current_timestep += 1 + # print('This game is on',self.current_timestep,' step\n') + if self.current_timestep >= self.num_timestep: + done = True + else: + done = False + info = { + "performance_info": { + "mean_aoi": self.mean_aoi_timelist[self.current_timestep], + "mean_energy_consumption": 1.0 - ( + np.mean(self.robot_energy_timelist[self.current_timestep]) / self.max_uav_energy), + "collected_data_amount": self.data_collection/(self.num_timestep*self.human_num*0.3), + "human_coverage": np.mean(self.update_human_timelist) / self.human_num + }, + } + + return next_state, reward, done, info + + def render(self, mode='traj', output_file=None, plot_loop=False, moving_line=False): + # ------------------------------------------------------------------- + if mode == 'html': + pass + elif mode == 'traj': + pass + else: + raise NotImplementedError \ No newline at end of file diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/__init__.py b/zoo/CrowdSim/envs/Crowdsim/env/model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py new file mode 100644 index 000000000..5201c5b61 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py @@ -0,0 +1,72 @@ +import abc +import logging +from 
zoo.CrowdSim.envs.Crowdsim.env.model.mdp import * + + +class Agent(): + def __init__(self): + """ + Base class for robot and human. Have the physical attributes of an agent. + + """ + self.policy = None + + def print_info(self): + logging.info('Agent is visible and has "holonomic" kinematic constraint') + + def set_policy(self, policy): + self.policy = policy + + def act(self, state, current_timestep): + if self.policy is None: + raise AttributeError('Policy attribute has to be set!') + action = self.policy.predict(state, current_timestep) + return action + + +class Human(): + def __init__(self, id, config): + self.id = id + self.config = config + self.px = None + self.py = None + self.theta = None + self.aoi = None + + def set(self, px, py, theta, aoi): + self.px = px # position + self.py = py + self.theta = theta + self.aoi = aoi + + # TODO: change state,可能需要归一化 + def get_obs(self): + return HumanState(self.px / self.config.nlon, + self.py / self.config.nlat, + self.theta / self.config.rotation_limit, + self.aoi / self.config.num_timestep) + + +class Robot(): + def __init__(self, id, config): + self.id = id + self.config = config + self.px = None # position + self.py = None + self.theta = None + self.energy = None + + def set(self, px, py, theta, energy): + self.px = px # position + self.py = py + self.theta = theta + self.energy = energy + + # TODO: change state,可能需要归一化 + def get_obs(self): + return RobotState(self.px / self.config.nlon, + self.py / self.config.nlat, + self.theta / self.config.rotation_limit, + self.energy / self.config.max_uav_energy) + + diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py b/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py new file mode 100644 index 000000000..72f1b9c4a --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py @@ -0,0 +1,89 @@ +from collections import namedtuple +from itertools import product +import torch +import numpy as np + + +# State +class HumanState(object): + def __init__(self, px, py, theta, aoi): + self.px = px + self.py = py + self.theta = theta + self.aoi = aoi + self.position = (self.px, self.py) + + def __add__(self, other): + return other + (self.px, self.py, self.theta, self.aoi) + + def __str__(self): + return ' '.join([str(x) for x in [self.px, self.py, self.theta, self.aoi]]) + + def to_tuple(self): + return self.px, self.py, self.theta, self.aoi + + +class RobotState(object): + def __init__(self, px, py, theta, energy): + self.px = px + self.py = py + self.theta = theta + self.energy = energy + + self.position = (self.px, self.py) + + def __add__(self, other): + return other + (self.px, self.py, self.theta, self.energy) + + def __str__(self): + return ' '.join([str(x) for x in [self.px, self.py, self.theta, self.energy]]) + + def to_tuple(self): + return self.px, self.py, self.theta, self.energy + + +class JointState(object): + def __init__(self, robot_states, human_states): + for robot_state in robot_states: + assert isinstance(robot_state, RobotState) + for human_state in human_states: + assert isinstance(human_state, HumanState) + + self.robot_states = robot_states + self.human_states = human_states + + def to_tensor(self, add_batch_size=False, device=None): + robot_states_tensor = torch.tensor([robot_state.to_tuple() for robot_state in self.robot_states], + dtype=torch.float32) + human_states_tensor = torch.tensor([human_state.to_tuple() for human_state in self.human_states], + dtype=torch.float32) + + if add_batch_size: # True + robot_states_tensor = robot_states_tensor.unsqueeze(0) + human_states_tensor 
= human_states_tensor.unsqueeze(0) + + if device is not None: + robot_states_tensor = robot_states_tensor.to(device) + human_states_tensor = human_states_tensor.to(device) + + return robot_states_tensor, human_states_tensor + + def to_array(self): + robot_states_array = np.array([robot_state.to_tuple() for robot_state in self.robot_states]) + human_states_array = np.array([human_state.to_tuple() for human_state in self.human_states]) + + return robot_states_array, human_states_array + + +def build_action_space(config): + robot_num = config.robot_num + + # dx, dy + one_uav_action_space = config.one_uav_action_space + action_space = list(product(one_uav_action_space, repeat=robot_num)) + + return np.array(action_space) + + +if __name__ == "__main__": + print(build_action_space()) diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py new file mode 100644 index 000000000..ce903d9f8 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py @@ -0,0 +1,248 @@ +import numpy as np + +np.seterr(invalid='ignore') + +from zoo.CrowdSim.envs.Crowdsim.env.model.agent import * +from zoo.CrowdSim.envs.Crowdsim.env.model.mdp import JointState +from shapely.geometry import * + + +def tensor_to_joint_state(state, config): # 恢复原先尺度 + robot_states, human_states = state + + robot_states = robot_states.cpu().squeeze(0).data.numpy() + robot_states = [RobotState(robot_state[0] * config.nlon, + robot_state[1] * config.nlat, + robot_state[2] * config.rotation_limit, + robot_state[3] * config.max_uav_energy) for robot_state in robot_states] + + human_states = human_states.cpu().squeeze(0).data.numpy() + human_states = [HumanState(human_state[0] * config.nlon, + human_state[1] * config.nlat, + human_state[2] * config.rotation_limit, + human_state[3] * config.num_timestep) for human_state in human_states] + + return JointState(robot_states, human_states) + + +def tensor_to_robot_states(robot_state_tensor, config): + robot_states = robot_state_tensor.cpu().squeeze(0).data.numpy() + robot_states = [RobotState(robot_state[0] * config.nlon, + robot_state[1] * config.nlat, + robot_state[2] * config.rotation_limit, + robot_state[3] * config.max_uav_energy) for robot_state in robot_states] + return robot_states + + +def get_human_position_list(selected_timestep, human_df, config): + # config.step_time means the time interval between two timesteps + selected_timestamp = config.start_timestamp + selected_timestep * config.step_time + selected_data = human_df[human_df.timestamp == selected_timestamp] + selected_data = selected_data.set_index("id") + + if selected_timestep < config.num_timestep: + selected_next_data = human_df[human_df.timestamp == (selected_timestamp + config.step_time)] + selected_next_data = selected_next_data.set_index("id") + else: + selected_next_data = None + + return selected_data, selected_next_data + + +def get_human_position_from_list(selected_timestep, human_id, selected_data, selected_next_data, config): + px, py = selected_data.loc[human_id, ["x", "y"]] + + if selected_timestep < config.num_timestep: + npx, npy = selected_next_data.loc[human_id, ["x", "y"]] + theta = get_theta(0, 0, npx - px, npy - py) + # print(px, py, npx, npy, theta) + else: + theta = 0 + + return px, py, theta + + +def judge_aoi_update(human_position, robot_position, config): + """ + Overview: + Judge whether the AoI should be updated + Args: + - human_position (:obj:`list`): The position of the human. + - robot_position (:obj:`list`): The position of the robot. 
+ - config (:obj:`dict`): The configuration of the environment. + """ + should_reset = False + for robot_id in range(config.robot_num): + unit_distance = np.sqrt(np.power(robot_position[robot_id][0] - human_position[0], 2) + + np.power(robot_position[robot_id][1] - human_position[1], 2)) + if unit_distance <= config.sensing_range: + should_reset = True + break + + return should_reset + + +def inPoly(polygon, x, y): + pt = (x, y) + line = LineString(polygon) + point = Point(pt) + polygon = Polygon(line) + return polygon.contains(point) + + +def iscrosses(line1, line2): + if LineString(line1).crosses(LineString(line2)): + return True + return False + + +def crossPoly(square, x1, y1, x2, y2): + our_line = LineString([[x1, y1], [x2, y2]]) + line1 = LineString([square[0], square[2]]) + line2 = LineString([square[1], square[3]]) + if our_line.crosses(line1) or our_line.crosses(line2): + return True + else: + return False + + +def judge_collision(new_robot_px, new_robot_py, old_robot_px, old_robot_py, config): + if config.no_fly_zone is None: + return False + + for square in config.no_fly_zone: + if inPoly(square, new_robot_px, new_robot_py): + return True + if crossPoly(square, new_robot_px, new_robot_py, old_robot_px, old_robot_py): + return True + return False + + +def get_theta(x1, y1, x2, y2): + ang1 = np.arctan2(y1, x1) + ang2 = np.arctan2(y2, x2) + theta = np.rad2deg((ang1 - ang2) % (2 * np.pi)) + return theta + + +def consume_uav_energy(fly_time, hover_time, config): + # configs + Pu = 0.5 # the average transmitted power of each user, W, e.g. mobile phone + P0 = 79.8563 # blade profile power, W + P1 = 88.6279 # derived power, W + U_tips = 120 # tip speed of the rotor blade of the UAV,m/s + v0 = 4.03 # the mean rotor induced velocity in the hovering state,m/s + d0 = 0.6 # fuselage drag ratio + rho = 1.225 # density of air,kg/m^3 + s0 = 0.05 # the rotor solidity + A = 0.503 # the area of the rotor disk, m^2 + Vt = config.velocity # velocity of the UAV,m/s + + Power_flying = P0 * (1 + 3 * Vt ** 2 / U_tips ** 2) + \ + P1 * np.sqrt((np.sqrt(1 + Vt ** 4 / (4 * v0 ** 4)) - Vt ** 2 / (2 * v0 ** 2))) + \ + 0.5 * d0 * rho * s0 * A * Vt ** 3 + + Power_hovering = P0 + P1 + + return fly_time * Power_flying + hover_time * Power_hovering + + +def get_border(ur, lf): + upper_left = [lf[0], ur[1]] + upper_right = [ur[0], ur[1]] + lower_right = [ur[0], lf[1]] + lower_left = [lf[0], lf[1]] + + coordinates = [ + upper_left, + upper_right, + lower_right, + lower_left, + upper_left + ] + + geo_json = {"type": "FeatureCollection", + "properties": { + "lower_left": lower_left, + "upper_right": upper_right + }, + "features": []} + + grid_feature = { + "type": "Feature", + "geometry": { + "type": "Polygon", + "coordinates": [coordinates], + } + } + + geo_json["features"].append(grid_feature) + + return geo_json + + +def traj_to_timestamped_geojson(index, trajectory, robot_num, color): + point_gdf = trajectory.df.copy() + point_gdf["previous_geometry"] = point_gdf["geometry"].shift() + point_gdf["time"] = point_gdf.index + point_gdf["previous_time"] = point_gdf["time"].shift() + + features = [] + + # for Point in GeoJSON type + for _, row in point_gdf.iterrows(): + corrent_point_coordinates = [ + row["geometry"].xy[0][0], + row["geometry"].xy[1][0] + ] + current_time = [row["time"].isoformat()] + + if index < robot_num: + radius = 8 # 125(5 units) + opacity = 0.05 + popup_html = f'

UAV {int(row["id"])}' + f'raw coord: {corrent_point_coordinates}' \
+                + f'grid coord: ({row["x"]},{row["y"]})' \
+                + f'dist coord: ({row["x_distance"]}m, {row["y_distance"]}m)' \
+                + f'energy: {row["energy"]}J'
+        else:
+            radius = 2
+            opacity = 1
+            popup_html = f'Human {int(row["id"])}' + f'raw coord: {corrent_point_coordinates}' \
+                + f'grid coord: ({row["x"]},{row["y"]})' \
+                + f'dist coord: ({row["x_distance"]}m, {row["y_distance"]}m)' \
+                + f'aoi: {int(row["aoi"])}
' + + # for Point in GeoJSON type (Temporally Deprecated) + features.append( + { + "type": "Feature", + "geometry": { + "type": "Point", + "coordinates": corrent_point_coordinates, + }, + "properties": { + "times": current_time, + 'popup': popup_html, + "icon": 'circle', # point + "iconstyle": { + 'fillColor': color, + 'fillOpacity': opacity, # 透明度 + 'stroke': 'true', + 'radius': radius, + 'weight': 1, + }, + + "style": { # line + "color": color, + }, + "code": 11, + + }, + } + ) + return features + + +if __name__ == "__main__": + print(judge_collision(new_robot_px=6505, new_robot_py=5130, + old_robot_px=6925, old_robot_py=5130)) diff --git a/zoo/CrowdSim/envs/test_CrowdSim_env.py b/zoo/CrowdSim/envs/test_CrowdSim_env.py index 4f95511dc..25a8bc411 100644 --- a/zoo/CrowdSim/envs/test_CrowdSim_env.py +++ b/zoo/CrowdSim/envs/test_CrowdSim_env.py @@ -4,6 +4,7 @@ mcfg=EasyDict( env_name='CrowdSim-v0', + dataset = 'purdue', robot_num = 2, human_num = 59, # purdue one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]]) From 14542a116ac5e4ca409cff6543348feb0d729469 Mon Sep 17 00:00:00 2001 From: nighood Date: Mon, 8 Apr 2024 17:58:13 +0800 Subject: [PATCH 04/16] feature(rjy): add RGCN for represent net --- lzero/model/common.py | 214 ++++++++++++++++++ .../config/crowdsim_muzero_rgcn_config.py | 106 +++++++++ zoo/CrowdSim/envs/crowdsim_lightzero_env.py | 121 ++++++++++ .../envs/test_crowdsim_lightzero_env.py | 35 +++ 4 files changed, 476 insertions(+) create mode 100644 zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py create mode 100644 zoo/CrowdSim/envs/crowdsim_lightzero_env.py create mode 100644 zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py diff --git a/lzero/model/common.py b/lzero/model/common.py index 363f7f779..0b97d2b7a 100644 --- a/lzero/model/common.py +++ b/lzero/model/common.py @@ -8,6 +8,8 @@ import math from typing import Optional, Tuple from dataclasses import dataclass +import logging +import itertools import numpy as np import torch import torch.nn as nn @@ -271,6 +273,218 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: """ return self.fc_representation(x) +class RGCNLayer(nn.Module): + """ + Overview: + Relational graph convolutional network layer. 
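+        The layer first embeds the robot states and the human states with two separate MLPs
+        (``w_r`` and ``w_h``), concatenates the embeddings into a node feature matrix ``X``,
+        builds a pairwise similarity (adjacency) matrix ``A`` according to ``similarity_function``,
+        and then propagates features for ``num_layer`` rounds as ``H <- ReLU(A @ H @ W_i)``,
+        optionally with skip connections between rounds.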
+ """ + def __init__( + self, + robot_state_dim, + human_state_dim, + similarity_function, + num_layer = 2, + X_dim = 32, + layerwise_graph = False, + skip_connection = True, + wr_dims = [64, 32], # the last dim should equal to X_dim + wh_dims = [64, 32], # the last dim should equal to X_dim + final_state_dim = 32, # should equal to X_dim + norm_type= 'BN', + last_linear_layer_init_zero=True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + ): + super().__init__() + + # design choice + # 'gaussian', 'embedded_gaussian', 'cosine', 'cosine_softmax', 'concatenation' + self.similarity_function = similarity_function + self.robot_state_dim = robot_state_dim + self.human_state_dim = human_state_dim + self.num_layer = num_layer + self.X_dim = X_dim + self.layerwise_graph = layerwise_graph + self.skip_connection = skip_connection + + logging.info('Similarity_func: {}'.format(self.similarity_function)) + logging.info('Layerwise_graph: {}'.format(self.layerwise_graph)) + logging.info('Skip_connection: {}'.format(self.skip_connection)) + logging.info('Number of layers: {}'.format(self.num_layer)) + + self.w_r = MLP( + in_channels=robot_state_dim, + hidden_channels=wr_dims[0], + out_channels=wr_dims[1], + layer_num=num_layer, + activation=activation, + norm_type=norm_type, + last_linear_layer_init_zero=last_linear_layer_init_zero, + ) # inputs,64,32 + self.w_h = MLP( + in_channels=human_state_dim, + hidden_channels=wh_dims[0], + out_channels=wh_dims[1], + layer_num=num_layer, + activation=activation, + norm_type=norm_type, + last_linear_layer_init_zero=last_linear_layer_init_zero, + ) # inputs,64,32 + + if self.similarity_function == 'embedded_gaussian': + self.w_a = nn.Parameter(torch.randn(self.X_dim, self.X_dim)) + elif self.similarity_function == 'concatenation': + # TODO: fix the dim size + self.w_a = MLP( + in_channels=2 * X_dim, + hidden_channels=2 * X_dim, + out_channels=1, + layer_num=1, + ) + + embedding_dim = self.X_dim + self.Ws = torch.nn.ParameterList() + for i in range(self.num_layer): + if i == 0: + self.Ws.append(nn.Parameter(torch.randn(self.X_dim, embedding_dim))) + elif i == self.num_layer - 1: + self.Ws.append(nn.Parameter(torch.randn(embedding_dim, final_state_dim))) + else: + self.Ws.append(nn.Parameter(torch.randn(embedding_dim, embedding_dim))) + + # TODO: for visualization + self.A = None + + def compute_similarity_matrix(self, X): + if self.similarity_function == 'embedded_gaussian': + A = torch.matmul(torch.matmul(X, self.w_a), X.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(A, dim=2) + elif self.similarity_function == 'gaussian': + A = torch.matmul(X, X.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(A, dim=2) + elif self.similarity_function == 'cosine': + A = torch.matmul(X, X.permute(0, 2, 1)) + magnitudes = torch.norm(A, dim=2, keepdim=True) + norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) + normalized_A = torch.div(A, norm_matrix) + elif self.similarity_function == 'cosine_softmax': + A = torch.matmul(X, X.permute(0, 2, 1)) + magnitudes = torch.norm(A, dim=2, keepdim=True) + norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(torch.div(A, norm_matrix), dim=2) + elif self.similarity_function == 'concatenation': + indices = [pair for pair in itertools.product(list(range(X.size(1))), repeat=2)] + selected_features = torch.index_select(X, dim=1, index=torch.LongTensor(indices).reshape(-1)) + pairwise_features = selected_features.reshape((-1, X.size(1) * X.size(1), 
X.size(2) * 2)) + A = self.w_a(pairwise_features).reshape(-1, X.size(1), X.size(1)) + normalized_A = A + elif self.similarity_function == 'squared': + A = torch.matmul(X, X.permute(0, 2, 1)) + squared_A = A * A + normalized_A = squared_A / torch.sum(squared_A, dim=2, keepdim=True) + elif self.similarity_function == 'equal_attention': + normalized_A = (torch.ones(X.size(1), X.size(1)) / X.size(1)).expand(X.size(0), X.size(1), X.size(1)) + elif self.similarity_function == 'diagonal': + normalized_A = (torch.eye(X.size(1), X.size(1))).expand(X.size(0), X.size(1), X.size(1)) + else: + raise NotImplementedError + + return normalized_A + + def forward(self, state): + robot_states = state['robot_state'] + human_states = state['human_state'] + + # compute feature matrix X + robot_state_embedings = self.w_r(robot_states) # batch x num x embedding_dim + human_state_embedings = self.w_h(human_states) + X = torch.cat([robot_state_embedings, human_state_embedings], dim=1) + + # compute matrix A + if not self.layerwise_graph: + normalized_A = self.compute_similarity_matrix(X) + self.A = normalized_A[0, :, :].data.cpu().numpy() # total_num x total_num + + # next_H = H = X + + H = X.contiguous().clone() + next_H = H.contiguous().clone() # batch x total_num x embedding_dim + for i in range(self.num_layer): # 2 + if self.layerwise_graph: # False + A = self.compute_similarity_matrix(H) + next_H = nn.functional.relu(torch.matmul(torch.matmul(A, H), self.Ws[i])) + else: # (A x H) x W_i + next_H = nn.functional.relu(torch.matmul(torch.matmul(normalized_A, H), self.Ws[i])) + + if self.skip_connection: + # next_H += H + next_H = next_H + H + H = next_H.contiguous().clone() + + return next_H + +class RepresentationNetworkGCN(nn.Module): + + def __init__( + self, + robot_observation_shape: tuple, + human_observation_shape: tuple, + hidden_channels: int = 64, + layer_num: int = 2, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + norm_type: Optional[str] = 'BN', + ) -> torch.Tensor: + """ + Overview: + + Arguments: + - robot_observation_shape (:obj:`tuple`): The shape of robot observation space, e.g. (2, 4). + - human_observation_shape (:obj:`tuple`): The shape of human observation space, e.g. (59, 4). + - hidden_channels (:obj:`int`): The channel of output hidden state. + - activation (:obj:`nn.Module`): The activation function used in network, defaults to nn.ReLU(). \ + Use the inplace operation to speed up. + - last_linear_layer_init_zero (:obj:`bool`): Whether to initialize the last linear layer with zeros, \ + which can provide stable zero outputs in the beginning, defaults to True. + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + """ + super().__init__() + # self.fc_representation = MLP( + # in_channels=observation_shape, + # hidden_channels=hidden_channels, + # out_channels=hidden_channels, + # layer_num=layer_num, + # activation=activation, + # norm_type=norm_type, + # # don't use activation and norm in the last layer of representation network is important for convergence. + # output_activation=False, + # output_norm=False, + # # last_linear_layer_init_zero=True is beneficial for convergence speed. 
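+ # (the commented-out MLP here is the original vector-obs representation; it is replaced by the RGCN-based rgl_representation below)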
+ # last_linear_layer_init_zero=True, + # ) + self.rgl_representation = RGCNLayer( + robot_state_dim=robot_observation_shape[-1], + human_state_dim=human_observation_shape[-1], + similarity_function='embedded_gaussian', + num_layer=layer_num, + X_dim=hidden_channels, + layerwise_graph=False, + skip_connection=True, + wr_dims=[64, hidden_channels], + wh_dims=[64, hidden_channels], + final_state_dim=hidden_channels, + # for mlp + norm_type=norm_type, + last_linear_layer_init_zero=last_linear_layer_init_zero, + activation=activation, + ) + + def forward(self, x: dict) -> torch.Tensor: + """ + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size, N is the length of vector observation. + - output (:obj:`torch.Tensor`): :math:`(B, hidden_channels)`, where B is batch size. + """ + return self.rgl_representation(x) class PredictionNetwork(nn.Module): diff --git a/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py new file mode 100644 index 000000000..e611aa6b1 --- /dev/null +++ b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py @@ -0,0 +1,106 @@ +from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '2' +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(3e5) +reanalyze_ratio = 0. +robot_num = 2 +human_num = 10 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_muzero_config = dict( + exp_name= + f'result/crowd_num_human/CrowdSim_muzero_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', + env=dict( + env_name='CrowdSim-v0', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + robot_observation_shape=(robot_num, 4), + human_observation_shape=(human_num, 4), + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='rgcn', + lstm_hidden_size=256, + latent_state_dim=256, + self_supervised_learning_loss=True, # NOTE: default is False. + discrete_action_encoding_type='one_hot', + res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + ssl_loss_weight=2, # NOTE: default is 0. + grad_clip_value=0.5, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. 
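+ # NOTE: `action_space_size` in the model config above is the joint UAV action space,
+ # len(one_uav_action_space) ** robot_num = 5 ** 2 = 25 discrete joint actions,
+ # which matches the itertools.product enumeration used in the CrowdSim env wrapper.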
+ collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_muzero_config = EasyDict(CrowdSim_muzero_config) +main_config = CrowdSim_muzero_config + +CrowdSim_muzero_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.CrowdSim_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='muzero', + import_names=['lzero.policy.muzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_muzero_create_config = EasyDict(CrowdSim_muzero_create_config) +create_config = CrowdSim_muzero_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. + """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py new file mode 100644 index 000000000..0bdedd202 --- /dev/null +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -0,0 +1,121 @@ +from typing import Union, Optional + +import gym +import numpy as np +from itertools import product +import logging + +from ding.envs import BaseEnv, BaseEnvTimestep +from ding.envs import ObsPlusPrevActRewWrapper +from ding.torch_utils import to_ndarray +from ding.utils import ENV_REGISTRY + +import zoo.CrowdSim.envs.Crowdsim.env + + +@ENV_REGISTRY.register('crowdsim_lightzero') +class CrowdSimEnv(BaseEnv): + + def __init__(self, cfg: dict = {}) -> None: + self._cfg = cfg + self._init_flag = False + self._replay_path = None + self._robot_num = self._cfg.robot_num + self._human_num = self._cfg.human_num + self._observation_space = gym.spaces.Dict({ + 'robot_state': gym.spaces.Box( + low=float("-inf"), + high=float("inf"), + shape=(self._robot_num, 4), + dtype=np.float32 + ), + 'human_state': gym.spaces.Box( + low=float("-inf"), + high=float("inf"), + shape=(self._human_num, 4), + dtype=np.float32 + ) + }) + # action space + # one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] + self.real_action_space = list(product(self._cfg.one_uav_action_space, repeat=self._robot_num)) + one_uav_action_n = len(self._cfg.one_uav_action_space) + self._action_space = gym.spaces.Discrete(one_uav_action_n**self._robot_num) + self._action_space.seed(0) # default seed + self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, ), dtype=np.float32) + self._continuous = False + + def reset(self) -> np.ndarray: + if not self._init_flag: + self._env = gym.make('CrowdSim-v0', dataset = self._cfg.dataset, custom_config = self._cfg) + self._init_flag = True + if hasattr(self, '_seed') and hasattr(self, '_dynamic_seed') and self._dynamic_seed: + np_seed = 100 * np.random.randint(1, 1000) + self._env.seed(self._seed + np_seed) + self._action_space.seed(self._seed + np_seed) + elif hasattr(self, '_seed'): + self._env.seed(self._seed) + self._action_space.seed(self._seed) + self._eval_episode_return = 0 + # process obs + 
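+ # raw_obs.to_tensor() is unpacked into (robot_state, human_state); with this config their shapes are
+ # (robot_num, 4) and (human_num, 4). Every joint action is always legal in the flat Discrete space,
+ # so the action_mask is all ones, and to_play=-1 marks the single-player setting.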
raw_obs = self._env.reset() + obs_list = list(raw_obs.to_tensor()) + obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + action_mask = np.ones(self.action_space.n, 'int8') + obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} + + return obs + + def close(self) -> None: + if self._init_flag: + self._env.close() + self._init_flag = False + + def seed(self, seed: int, dynamic_seed: bool = True) -> None: + self._seed = seed + self._dynamic_seed = dynamic_seed + np.random.seed(self._seed) + + def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: + if isinstance(action, np.ndarray) and action.shape == (1, ): + action = action.squeeze() # 0-dim array + real_action = self.real_action_space[action] + assert isinstance(real_action, tuple) and len(real_action) == self._robot_num, "illegal action!" + raw_obs, rew, done, info = self._env.step(real_action) + obs_list = list(raw_obs.to_array()) + obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + + self._eval_episode_return += rew + if done: + info['eval_episode_return'] = self._eval_episode_return + # logging.INFO('one game finish!') + + action_mask = np.ones(self.action_space.n, 'int8') + obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} + rew = to_ndarray([rew]).astype(np.float32) + return BaseEnvTimestep(obs, rew, done, info) + + def enable_save_replay(self, replay_path: Optional[str] = None) -> None: + if replay_path is None: + replay_path = './video' + self._replay_path = replay_path + + def random_action(self) -> np.ndarray: + random_action = self.action_space.sample() + random_action = to_ndarray([random_action], dtype=np.int64) + return random_action + + @property + def observation_space(self) -> gym.spaces.Space: + return self._observation_space + + @property + def action_space(self) -> gym.spaces.Space: + return self._action_space + + @property + def reward_space(self) -> gym.spaces.Space: + return self._reward_space + + def __repr__(self) -> str: + return "LightZero CrowdSim Env" diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py new file mode 100644 index 000000000..47f483717 --- /dev/null +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -0,0 +1,35 @@ +import numpy as np +import pytest +from easydict import EasyDict +from zoo.CrowdSim.envs.crowdsim_lightzero_env import CrowdSimEnv + +mcfg=EasyDict( + env_name='CrowdSim-v0', + dataset = 'purdue', + robot_num = 2, + human_num = 59, # purdue + one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] + ) + +@ pytest.mark.envtest + +class TestCrowdSimEnv: + def test_naive(self): + env = CrowdSimEnv(mcfg) + env.seed(314) + assert env._seed == 314 + obs = env.reset() + assert isinstance(obs['observation'], dict) + assert obs['observation']['robot_state'].shape == (2, 4) + assert obs['observation']['human_state'].shape == (59, 4) + for i in range(10): + random_action = env.random_action() + timestep = env.step(random_action) + print(timestep) + assert isinstance(timestep.obs['observation'], dict) + assert timestep.obs['observation']['robot_state'].shape == (2, 4) + assert timestep.obs['observation']['human_state'].shape == (59, 4) + assert isinstance(timestep.done, bool) + assert timestep.reward.shape == (1, ) + print(env.observation_space, env.action_space, env.reward_space) + env.close() From dc4a7747b59d4e0c2f3c266dc126e8d2349aa732 Mon Sep 17 00:00:00 2001 From: nighood Date: Wed, 1 May 2024 18:00:31 +0800 Subject: [PATCH 05/16] 
feature(rjy): add obs/action env mode. fix rgcn pipeline. --- lzero/agent/muzero.py | 3 + lzero/mcts/utils.py | 14 +- lzero/model/common.py | 212 ------- lzero/model/common_gcn.py | 261 +++++++++ lzero/model/muzero_model_gcn.py | 457 +++++++++++++++ .../model/sampled_efficientzero_model_gcn.py | 534 ++++++++++++++++++ lzero/model/tests/test_common_gcn.py | 102 ++++ lzero/model/tests/test_rgcn.py | 50 ++ lzero/policy/muzero.py | 10 +- lzero/worker/muzero_collector.py | 2 +- .../config/crowdsim_muzero_rgcn_config.py | 20 +- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 6 +- zoo/CrowdSim/envs/crowdsim_lightzero_env.py | 36 +- .../envs/test_crowdsim_lightzero_env.py | 40 +- 14 files changed, 1516 insertions(+), 231 deletions(-) create mode 100644 lzero/model/common_gcn.py create mode 100644 lzero/model/muzero_model_gcn.py create mode 100644 lzero/model/sampled_efficientzero_model_gcn.py create mode 100644 lzero/model/tests/test_common_gcn.py create mode 100644 lzero/model/tests/test_rgcn.py diff --git a/lzero/agent/muzero.py b/lzero/agent/muzero.py index 55dda5d00..b087377b5 100644 --- a/lzero/agent/muzero.py +++ b/lzero/agent/muzero.py @@ -110,6 +110,9 @@ def __init__( elif self.cfg.policy.model.model_type == 'conv': from lzero.model.muzero_model import MuZeroModel model = MuZeroModel(**self.cfg.policy.model) + elif self.cfg.policy.model.model_type == 'rgcn': + from lzero.model.muzero_model_gcn import MuZeroModelGCN + model = MuZeroModelGCN(**self.cfg.policy.model) else: raise NotImplementedError if self.cfg.policy.cuda and torch.cuda.is_available(): diff --git a/lzero/mcts/utils.py b/lzero/mcts/utils.py index c40052e62..1861cb2a9 100644 --- a/lzero/mcts/utils.py +++ b/lzero/mcts/utils.py @@ -97,7 +97,7 @@ def prepare_observation(observation_list, model_type='conv'): Returns: - np.ndarray: Reshaped array of observations. """ - assert model_type in ['conv', 'mlp'], "model_type must be either 'conv' or 'mlp'" + assert model_type in ['conv', 'mlp', 'rgcn'], "model_type must be either 'conv' or 'mlp'" observation_array = np.array(observation_list) batch_size = observation_array.shape[0] @@ -116,6 +116,18 @@ def prepare_observation(observation_list, model_type='conv'): observation_array = observation_array.reshape(batch_size, -1) else: raise ValueError("For 'mlp' model_type, the observation must have 3 dimensions [B, S, O]") + + elif model_type == 'rgcn': + if observation_array.ndim == 4: + # TODO(rjy): strage process + # observation_array should be reshaped to [B, S*M, O], where M is the agent number + # now observation_array.shape = [B, S, M, O] + observation_array = observation_array.reshape(batch_size, -1, observation_array.shape[-1]) + elif observation_array.ndim == 3: + # Flatten the last two dimensions + observation_array = observation_array.reshape(batch_size, -1) + else: + raise ValueError("For 'rgcn' model_type, the observation must have 3 dimensions [B, S, O] or 4 dimensions [B, S, M, O]") return observation_array diff --git a/lzero/model/common.py b/lzero/model/common.py index 0b97d2b7a..ddf4a5d59 100644 --- a/lzero/model/common.py +++ b/lzero/model/common.py @@ -273,218 +273,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: """ return self.fc_representation(x) -class RGCNLayer(nn.Module): - """ - Overview: - Relational graph convolutional network layer. 
- """ - def __init__( - self, - robot_state_dim, - human_state_dim, - similarity_function, - num_layer = 2, - X_dim = 32, - layerwise_graph = False, - skip_connection = True, - wr_dims = [64, 32], # the last dim should equal to X_dim - wh_dims = [64, 32], # the last dim should equal to X_dim - final_state_dim = 32, # should equal to X_dim - norm_type= 'BN', - last_linear_layer_init_zero=True, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - ): - super().__init__() - - # design choice - # 'gaussian', 'embedded_gaussian', 'cosine', 'cosine_softmax', 'concatenation' - self.similarity_function = similarity_function - self.robot_state_dim = robot_state_dim - self.human_state_dim = human_state_dim - self.num_layer = num_layer - self.X_dim = X_dim - self.layerwise_graph = layerwise_graph - self.skip_connection = skip_connection - - logging.info('Similarity_func: {}'.format(self.similarity_function)) - logging.info('Layerwise_graph: {}'.format(self.layerwise_graph)) - logging.info('Skip_connection: {}'.format(self.skip_connection)) - logging.info('Number of layers: {}'.format(self.num_layer)) - - self.w_r = MLP( - in_channels=robot_state_dim, - hidden_channels=wr_dims[0], - out_channels=wr_dims[1], - layer_num=num_layer, - activation=activation, - norm_type=norm_type, - last_linear_layer_init_zero=last_linear_layer_init_zero, - ) # inputs,64,32 - self.w_h = MLP( - in_channels=human_state_dim, - hidden_channels=wh_dims[0], - out_channels=wh_dims[1], - layer_num=num_layer, - activation=activation, - norm_type=norm_type, - last_linear_layer_init_zero=last_linear_layer_init_zero, - ) # inputs,64,32 - - if self.similarity_function == 'embedded_gaussian': - self.w_a = nn.Parameter(torch.randn(self.X_dim, self.X_dim)) - elif self.similarity_function == 'concatenation': - # TODO: fix the dim size - self.w_a = MLP( - in_channels=2 * X_dim, - hidden_channels=2 * X_dim, - out_channels=1, - layer_num=1, - ) - - embedding_dim = self.X_dim - self.Ws = torch.nn.ParameterList() - for i in range(self.num_layer): - if i == 0: - self.Ws.append(nn.Parameter(torch.randn(self.X_dim, embedding_dim))) - elif i == self.num_layer - 1: - self.Ws.append(nn.Parameter(torch.randn(embedding_dim, final_state_dim))) - else: - self.Ws.append(nn.Parameter(torch.randn(embedding_dim, embedding_dim))) - - # TODO: for visualization - self.A = None - - def compute_similarity_matrix(self, X): - if self.similarity_function == 'embedded_gaussian': - A = torch.matmul(torch.matmul(X, self.w_a), X.permute(0, 2, 1)) - normalized_A = nn.functional.softmax(A, dim=2) - elif self.similarity_function == 'gaussian': - A = torch.matmul(X, X.permute(0, 2, 1)) - normalized_A = nn.functional.softmax(A, dim=2) - elif self.similarity_function == 'cosine': - A = torch.matmul(X, X.permute(0, 2, 1)) - magnitudes = torch.norm(A, dim=2, keepdim=True) - norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) - normalized_A = torch.div(A, norm_matrix) - elif self.similarity_function == 'cosine_softmax': - A = torch.matmul(X, X.permute(0, 2, 1)) - magnitudes = torch.norm(A, dim=2, keepdim=True) - norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) - normalized_A = nn.functional.softmax(torch.div(A, norm_matrix), dim=2) - elif self.similarity_function == 'concatenation': - indices = [pair for pair in itertools.product(list(range(X.size(1))), repeat=2)] - selected_features = torch.index_select(X, dim=1, index=torch.LongTensor(indices).reshape(-1)) - pairwise_features = selected_features.reshape((-1, X.size(1) * X.size(1), 
X.size(2) * 2)) - A = self.w_a(pairwise_features).reshape(-1, X.size(1), X.size(1)) - normalized_A = A - elif self.similarity_function == 'squared': - A = torch.matmul(X, X.permute(0, 2, 1)) - squared_A = A * A - normalized_A = squared_A / torch.sum(squared_A, dim=2, keepdim=True) - elif self.similarity_function == 'equal_attention': - normalized_A = (torch.ones(X.size(1), X.size(1)) / X.size(1)).expand(X.size(0), X.size(1), X.size(1)) - elif self.similarity_function == 'diagonal': - normalized_A = (torch.eye(X.size(1), X.size(1))).expand(X.size(0), X.size(1), X.size(1)) - else: - raise NotImplementedError - - return normalized_A - - def forward(self, state): - robot_states = state['robot_state'] - human_states = state['human_state'] - - # compute feature matrix X - robot_state_embedings = self.w_r(robot_states) # batch x num x embedding_dim - human_state_embedings = self.w_h(human_states) - X = torch.cat([robot_state_embedings, human_state_embedings], dim=1) - - # compute matrix A - if not self.layerwise_graph: - normalized_A = self.compute_similarity_matrix(X) - self.A = normalized_A[0, :, :].data.cpu().numpy() # total_num x total_num - - # next_H = H = X - - H = X.contiguous().clone() - next_H = H.contiguous().clone() # batch x total_num x embedding_dim - for i in range(self.num_layer): # 2 - if self.layerwise_graph: # False - A = self.compute_similarity_matrix(H) - next_H = nn.functional.relu(torch.matmul(torch.matmul(A, H), self.Ws[i])) - else: # (A x H) x W_i - next_H = nn.functional.relu(torch.matmul(torch.matmul(normalized_A, H), self.Ws[i])) - - if self.skip_connection: - # next_H += H - next_H = next_H + H - H = next_H.contiguous().clone() - - return next_H - -class RepresentationNetworkGCN(nn.Module): - - def __init__( - self, - robot_observation_shape: tuple, - human_observation_shape: tuple, - hidden_channels: int = 64, - layer_num: int = 2, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - last_linear_layer_init_zero: bool = True, - norm_type: Optional[str] = 'BN', - ) -> torch.Tensor: - """ - Overview: - - Arguments: - - robot_observation_shape (:obj:`tuple`): The shape of robot observation space, e.g. (2, 4). - - human_observation_shape (:obj:`tuple`): The shape of human observation space, e.g. (59, 4). - - hidden_channels (:obj:`int`): The channel of output hidden state. - - activation (:obj:`nn.Module`): The activation function used in network, defaults to nn.ReLU(). \ - Use the inplace operation to speed up. - - last_linear_layer_init_zero (:obj:`bool`): Whether to initialize the last linear layer with zeros, \ - which can provide stable zero outputs in the beginning, defaults to True. - - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. - """ - super().__init__() - # self.fc_representation = MLP( - # in_channels=observation_shape, - # hidden_channels=hidden_channels, - # out_channels=hidden_channels, - # layer_num=layer_num, - # activation=activation, - # norm_type=norm_type, - # # don't use activation and norm in the last layer of representation network is important for convergence. - # output_activation=False, - # output_norm=False, - # # last_linear_layer_init_zero=True is beneficial for convergence speed. 
- # last_linear_layer_init_zero=True, - # ) - self.rgl_representation = RGCNLayer( - robot_state_dim=robot_observation_shape[-1], - human_state_dim=human_observation_shape[-1], - similarity_function='embedded_gaussian', - num_layer=layer_num, - X_dim=hidden_channels, - layerwise_graph=False, - skip_connection=True, - wr_dims=[64, hidden_channels], - wh_dims=[64, hidden_channels], - final_state_dim=hidden_channels, - # for mlp - norm_type=norm_type, - last_linear_layer_init_zero=last_linear_layer_init_zero, - activation=activation, - ) - - def forward(self, x: dict) -> torch.Tensor: - """ - Shapes: - - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size, N is the length of vector observation. - - output (:obj:`torch.Tensor`): :math:`(B, hidden_channels)`, where B is batch size. - """ - return self.rgl_representation(x) class PredictionNetwork(nn.Module): diff --git a/lzero/model/common_gcn.py b/lzero/model/common_gcn.py new file mode 100644 index 000000000..857890c2f --- /dev/null +++ b/lzero/model/common_gcn.py @@ -0,0 +1,261 @@ +from typing import Optional, Tuple, Dict +import logging +import itertools + +import torch +import torch.nn as nn +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType + +from .utils import renormalize, get_params_mean, get_dynamic_mean, get_reward_mean + +class RGCNLayer(nn.Module): + """ + Overview: + Relational graph convolutional network layer. + """ + def __init__( + self, + robot_num: int, + human_num: int, + robot_state_dim, + human_state_dim, + similarity_function, + num_layer = 2, + X_dim = 32, + layerwise_graph = False, + skip_connection = True, + wr_dims = [64, 32], # the last dim should equal to X_dim + wh_dims = [64, 32], # the last dim should equal to X_dim + final_state_dim = 32, # should equal to X_dim + norm_type= None, + last_linear_layer_init_zero=True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + ): + super().__init__() + + # design choice + # 'gaussian', 'embedded_gaussian', 'cosine', 'cosine_softmax', 'concatenation' + self.similarity_function = similarity_function + self.robot_num = robot_num + self.human_num = human_num + self.robot_state_dim = robot_state_dim + self.human_state_dim = human_state_dim + self.num_layer = num_layer + self.X_dim = X_dim + self.layerwise_graph = layerwise_graph + self.skip_connection = skip_connection + + logging.info('Similarity_func: {}'.format(self.similarity_function)) + logging.info('Layerwise_graph: {}'.format(self.layerwise_graph)) + logging.info('Skip_connection: {}'.format(self.skip_connection)) + logging.info('Number of layers: {}'.format(self.num_layer)) + + self.w_r = MLP( + in_channels=robot_state_dim, + hidden_channels=wr_dims[0], + out_channels=wr_dims[1], + layer_num=num_layer, + activation=activation, + norm_type=norm_type, + last_linear_layer_init_zero=last_linear_layer_init_zero, + ) # inputs,64,32 + self.w_h = MLP( + in_channels=human_state_dim, + hidden_channels=wh_dims[0], + out_channels=wh_dims[1], + layer_num=num_layer, + activation=activation, + norm_type=norm_type, + last_linear_layer_init_zero=last_linear_layer_init_zero, + ) # inputs,64,32 + + if self.similarity_function == 'embedded_gaussian': + self.w_a = nn.Parameter(torch.randn(self.X_dim, self.X_dim)) + elif self.similarity_function == 'concatenation': + # TODO: fix the dim size + self.w_a = MLP( + in_channels=2 * X_dim, + hidden_channels=2 * X_dim, + out_channels=1, + layer_num=1, + ) + + embedding_dim = self.X_dim + self.Ws = torch.nn.ParameterList() + for i in 
range(self.num_layer): + if i == 0: + self.Ws.append(nn.Parameter(torch.randn(self.X_dim, embedding_dim))) + elif i == self.num_layer - 1: + self.Ws.append(nn.Parameter(torch.randn(embedding_dim, final_state_dim))) + else: + self.Ws.append(nn.Parameter(torch.randn(embedding_dim, embedding_dim))) + + # TODO: for visualization + self.A = None + + def compute_similarity_matrix(self, X): + if self.similarity_function == 'embedded_gaussian': + A = torch.matmul(torch.matmul(X, self.w_a), X.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(A, dim=2) + elif self.similarity_function == 'gaussian': + A = torch.matmul(X, X.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(A, dim=2) + elif self.similarity_function == 'cosine': + A = torch.matmul(X, X.permute(0, 2, 1)) + magnitudes = torch.norm(A, dim=2, keepdim=True) + norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) + normalized_A = torch.div(A, norm_matrix) + elif self.similarity_function == 'cosine_softmax': + A = torch.matmul(X, X.permute(0, 2, 1)) + magnitudes = torch.norm(A, dim=2, keepdim=True) + norm_matrix = torch.matmul(magnitudes, magnitudes.permute(0, 2, 1)) + normalized_A = nn.functional.softmax(torch.div(A, norm_matrix), dim=2) + elif self.similarity_function == 'concatenation': + indices = [pair for pair in itertools.product(list(range(X.size(1))), repeat=2)] + selected_features = torch.index_select(X, dim=1, index=torch.LongTensor(indices).reshape(-1)) + pairwise_features = selected_features.reshape((-1, X.size(1) * X.size(1), X.size(2) * 2)) + A = self.w_a(pairwise_features).reshape(-1, X.size(1), X.size(1)) + normalized_A = A + elif self.similarity_function == 'squared': + A = torch.matmul(X, X.permute(0, 2, 1)) + squared_A = A * A + normalized_A = squared_A / torch.sum(squared_A, dim=2, keepdim=True) + elif self.similarity_function == 'equal_attention': + normalized_A = (torch.ones(X.size(1), X.size(1)) / X.size(1)).expand(X.size(0), X.size(1), X.size(1)) + elif self.similarity_function == 'diagonal': + normalized_A = (torch.eye(X.size(1), X.size(1))).expand(X.size(0), X.size(1), X.size(1)) + else: + raise NotImplementedError + + return normalized_A + + def forward(self, state): + state = state.to(self.w_r[0].weight.dtype) + if isinstance(state, dict): + robot_states = state['robot_state'] + human_states = state['human_state'] + elif isinstance(state, torch.Tensor): + if state.dim() == 3: + # state shape:(B, stack_num*(robot_num+human_num), state_dim) + stack_num = state.size(1) // (self.robot_num + self.human_num) + # robot_states shape:(B, stack_num*robot_num, state_dim) + robot_states = state[:, :stack_num * self.robot_num, :] + # human_states shape:(B, stack_num*human_num, state_dim) + human_states = state[:, stack_num * self.robot_num:, :] + elif state.dim() == 2: + # state shape:(B, stack_num*(robot_num+human_num)*state_dim) + stack_num = state.size(1) // ((self.robot_num + self.human_num) * self.robot_state_dim) + assert stack_num == 1, "stack_num should be 1 for 1-dim-array obs" + # robot_states shape:(B, stack_num*robot_num, state_dim) + robot_states = state[:, :stack_num * self.robot_num * self.robot_state_dim].reshape(-1, self.robot_num, self.robot_state_dim) + # human_states shape:(B, stack_num*human_num, state_dim) + human_states = state[:, stack_num * self.robot_num * self.robot_state_dim:].reshape(-1, self.human_num, self.human_state_dim) + + # compute feature matrix X + robot_state_embedings = self.w_r(robot_states) # batch x num x embedding_dim + human_state_embedings = 
self.w_h(human_states) + X = torch.cat([robot_state_embedings, human_state_embedings], dim=1) + + # compute matrix A + if not self.layerwise_graph: + normalized_A = self.compute_similarity_matrix(X) + self.A = normalized_A[0, :, :].data.cpu().numpy() # total_num x total_num + + # next_H = H = X + + H = X.contiguous().clone() + next_H = H.contiguous().clone() # batch x total_num x embedding_dim + for i in range(self.num_layer): # 2 + if self.layerwise_graph: # False + A = self.compute_similarity_matrix(H) + next_H = nn.functional.relu(torch.matmul(torch.matmul(A, H), self.Ws[i])) + else: # (A x H) x W_i + next_H = nn.functional.relu(torch.matmul(torch.matmul(normalized_A, H), self.Ws[i])) + + if self.skip_connection: + # next_H += H + next_H = next_H + H + H = next_H.contiguous().clone() + + return next_H + +class RepresentationNetworkGCN(nn.Module): + + def __init__( + self, + robot_state_dim: int, + human_state_dim: int, + robot_num: int, + human_num: int, + hidden_channels: int = 64, + layer_num: int = 2, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + norm_type: Optional[str] = 'BN', + ) -> torch.Tensor: + """ + Overview: + Representation network used in MuZero and derived algorithms. + Arguments: + - robot_state_dim (:obj:`int`): The dimension of robot state. + - human_state_dim (:obj:`int`): The dimension of human state. + - robot_num (:obj:`int`): The number of robots. + - human_num (:obj:`int`): The number of humans. + - num_res_blocks (:obj:`int`): The number of residual blocks. + - hidden_channels (:obj:`int`): The channel of output hidden state. + - downsample (:obj:`bool`): Whether to do downsampling for observations in ``representation_network``, \ + defaults to True. This option is often used in video games like Atari. In board games like go, \ + we don't need this module. + - activation (:obj:`nn.Module`): The activation function used in network, defaults to nn.ReLU(). \ + Use the inplace operation to speed up. + - last_linear_layer_init_zero (:obj:`bool`): Whether to initialize the last linear layer with zeros, \ + which can provide stable zero outputs in the beginning, defaults to True. + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + """ + super().__init__() + self.robot_state_dim = robot_state_dim + self.human_state_dim = human_state_dim + self.hidden_channels = hidden_channels + self.similarity_function = 'embedded_gaussian' + self.robot_num = robot_num + self.human_num = human_num + self.rgcn = RGCNLayer( + robot_num=self.robot_num, + human_num=self.human_num, + robot_state_dim=self.robot_state_dim, + human_state_dim=self.human_state_dim, + similarity_function=self.similarity_function, + num_layer=2, + X_dim=hidden_channels, + final_state_dim=hidden_channels, + wr_dims=[hidden_channels, hidden_channels], # TODO: check dim + wh_dims=[hidden_channels, hidden_channels], + layerwise_graph=False, + skip_connection=True, + norm_type=None, + ) + mlp_input_shape = (robot_num + human_num) * hidden_channels + self.fc_representation = MLP( + in_channels=mlp_input_shape, + hidden_channels=hidden_channels, + out_channels=hidden_channels, + layer_num=layer_num, + activation=activation, + norm_type=norm_type, + # don't use activation and norm in the last layer of representation network is important for convergence. + output_activation=False, + output_norm=False, + # last_linear_layer_init_zero=True is beneficial for convergence speed. 
+ last_linear_layer_init_zero=True, + ) + + def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor: + """ + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size, N is the length of vector observation. + - output (:obj:`torch.Tensor`): :math:`(B, hidden_channels)`, where B is batch size. + """ + gcn_embedding = self.rgcn(x) + gcn_embedding = gcn_embedding.view(gcn_embedding.shape[0], -1) # (B,M,N) -> (B,M*N) + return self.fc_representation(gcn_embedding) \ No newline at end of file diff --git a/lzero/model/muzero_model_gcn.py b/lzero/model/muzero_model_gcn.py new file mode 100644 index 000000000..6e2c808a6 --- /dev/null +++ b/lzero/model/muzero_model_gcn.py @@ -0,0 +1,457 @@ +from typing import Optional, Tuple, Dict + +import torch +import torch.nn as nn +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType + +from .common import MZNetworkOutput, RepresentationNetworkMLP, PredictionNetworkMLP +from .common_gcn import RepresentationNetworkGCN +from .utils import renormalize, get_params_mean, get_dynamic_mean, get_reward_mean + + +@MODEL_REGISTRY.register('MuZeroModelGCN') +class MuZeroModelGCN(nn.Module): + + def __init__( + self, + robot_state_dim: int, + human_state_dim: int, + robot_num: int, + human_num: int, + action_space_size: int, + latent_state_dim: int = 64, + fc_reward_layers: SequenceType = [32], + fc_value_layers: SequenceType = [32], + fc_policy_layers: SequenceType = [32], + reward_support_size: int = 601, + value_support_size: int = 601, + proj_hid: int = 1024, + proj_out: int = 1024, + pred_hid: int = 512, + pred_out: int = 1024, + self_supervised_learning_loss: bool = False, + categorical_distribution: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + state_norm: bool = False, + discrete_action_encoding_type: str = 'one_hot', + norm_type: Optional[str] = 'BN', + res_connection_in_dynamics: bool = False, + *args, + **kwargs + ): + """ + Overview: + The definition of the network model of MuZero, which is a generalization version for 1D vector obs. + The networks are mainly built on fully connected layers. + The representation network is an MLP network which maps the raw observation to a latent state. + The dynamics network is an MLP network which predicts the next latent state, and reward given the current latent state and action. + The prediction network is an MLP network which predicts the value and policy given the current latent state. + Arguments: + - observation_shape (:obj:`int`): Observation space shape, e.g. 8 for Lunarlander. + - action_space_size: (:obj:`int`): Action space size, e.g. 4 for Lunarlander. + - latent_state_dim (:obj:`int`): The dimension of latent state, such as 256. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head). + - fc_policy_layers (:obj:`SequenceType`): The number of hidden layers used in policy head (MLP head). + - reward_support_size (:obj:`int`): The size of categorical reward output + - value_support_size (:obj:`int`): The size of categorical value output. + - proj_hid (:obj:`int`): The size of projection hidden layer. + - proj_out (:obj:`int`): The size of projection output layer. + - pred_hid (:obj:`int`): The size of prediction hidden layer. + - pred_out (:obj:`int`): The size of prediction output layer. 
+ - self_supervised_learning_loss (:obj:`bool`): Whether to use self_supervised_learning related networks in MuZero model, default set it to False. + - categorical_distribution (:obj:`bool`): Whether to use discrete support to represent categorical distribution for value, reward/value_prefix. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - state_norm (:obj:`bool`): Whether to use normalization for latent states, default sets it to True. + - discrete_action_encoding_type (:obj:`str`): The encoding type of discrete action, which can be 'one_hot' or 'not_one_hot'. + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection for dynamics network, default set it to False. + """ + super(MuZeroModelGCN, self).__init__() + self.categorical_distribution = categorical_distribution + if not self.categorical_distribution: + self.reward_support_size = 1 + self.value_support_size = 1 + else: + self.reward_support_size = reward_support_size + self.value_support_size = value_support_size + + self.action_space_size = action_space_size + self.continuous_action_space = False + # The dim of action space. For discrete action space, it is 1. + # For continuous action space, it is the dimension of continuous action. + self.action_space_dim = action_space_size if self.continuous_action_space else 1 + assert discrete_action_encoding_type in ['one_hot', 'not_one_hot'], discrete_action_encoding_type + self.discrete_action_encoding_type = discrete_action_encoding_type + if self.continuous_action_space: + self.action_encoding_dim = action_space_size + else: + if self.discrete_action_encoding_type == 'one_hot': + self.action_encoding_dim = action_space_size + elif self.discrete_action_encoding_type == 'not_one_hot': + self.action_encoding_dim = 1 + + self.latent_state_dim = latent_state_dim + self.proj_hid = proj_hid + self.proj_out = proj_out + self.pred_hid = pred_hid + self.pred_out = pred_out + self.self_supervised_learning_loss = self_supervised_learning_loss + self.last_linear_layer_init_zero = last_linear_layer_init_zero + self.state_norm = state_norm + self.res_connection_in_dynamics = res_connection_in_dynamics + + self.representation_network = RepresentationNetworkGCN( + robot_state_dim = robot_state_dim, + human_state_dim = human_state_dim, + robot_num = robot_num, + human_num = human_num, + hidden_channels=self.latent_state_dim, + layer_num=2, + norm_type=norm_type + ) + + self.dynamics_network = DynamicsNetwork( + action_encoding_dim=self.action_encoding_dim, + num_channels=self.latent_state_dim + self.action_encoding_dim, + common_layer_num=2, + fc_reward_layers=fc_reward_layers, + output_support_size=self.reward_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + res_connection_in_dynamics=self.res_connection_in_dynamics, + ) + + self.prediction_network = PredictionNetworkMLP( + action_space_size=action_space_size, + num_channels=latent_state_dim, + fc_value_layers=fc_value_layers, + fc_policy_layers=fc_policy_layers, + output_support_size=self.value_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type + ) + + if self.self_supervised_learning_loss: 
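+ # The projector is a three-layer MLP with BatchNorm (proj_hid -> proj_hid -> proj_out) and the
+ # prediction head is a two-layer MLP (pred_hid -> pred_out), following the SimSiam-style consistency
+ # loss used in EfficientZero; project() below applies the prediction head when with_grad=True and
+ # returns the detached projection otherwise (the target branch).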
+ # self_supervised_learning_loss related network proposed in EfficientZero + self.projection_input_dim = latent_state_dim + + self.projection = nn.Sequential( + nn.Linear(self.projection_input_dim, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_out), nn.BatchNorm1d(self.proj_out) + ) + self.prediction_head = nn.Sequential( + nn.Linear(self.proj_out, self.pred_hid), + nn.BatchNorm1d(self.pred_hid), + activation, + nn.Linear(self.pred_hid, self.pred_out), + ) + + def initial_inference(self, obs: torch.Tensor) -> MZNetworkOutput: + """ + Overview: + Initial inference of MuZero model, which is the first step of the MuZero model. + To perform the initial inference, we first use the representation network to obtain the "latent_state" of the observation. + Then we use the prediction network to predict the "value" and "policy_logits" of the "latent_state", and + also prepare the zeros-like ``reward`` for the next step of the MuZero model. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns (MZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. \ + In initial inference, we set it to zero vector. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The hidden state of LSTM about reward. In initial inference, \ + we set it to the zeros-like hidden state (H and C). + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + batch_size = obs.size(0) + latent_state = self._representation(obs) + policy_logits, value = self._prediction(latent_state) + return MZNetworkOutput( + value, + [0. for _ in range(batch_size)], + policy_logits, + latent_state, + ) + + def recurrent_inference(self, latent_state: torch.Tensor, action: torch.Tensor) -> MZNetworkOutput: + """ + Overview: + Recurrent inference of MuZero model, which is the rollout step of the MuZero model. + To perform the recurrent inference, we first use the dynamics network to predict ``next_latent_state``, + ``reward`` by the given current ``latent_state`` and ``action``. + We then use the prediction network to predict the ``value`` and ``policy_logits``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input obs. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns (MZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - next_latent_state (:obj:`torch.Tensor`): The predicted next latent state. 
+ Shapes: + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + next_latent_state, reward = self._dynamics(latent_state, action) + policy_logits, value = self._prediction(next_latent_state) + return MZNetworkOutput(value, reward, policy_logits, next_latent_state) + + def _representation(self, observation: torch.Tensor) -> Tuple[torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + latent_state = self.representation_network(observation) + if self.state_norm: + latent_state = renormalize(latent_state) + return latent_state + + def _prediction(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + """ + policy_logits, value = self.prediction_network(latent_state) + return policy_logits, value + + def _dynamics(self, latent_state: torch.Tensor, action: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Concatenate ``latent_state`` and ``action`` and use the dynamics network to predict ``next_latent_state`` + ``reward`` and ``next_reward_hidden_state``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns: + - next_latent_state (:obj:`torch.Tensor`): The predicted latent state of the next timestep. + - next_reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. 
+ - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + """ + # NOTE: the discrete action encoding type is important for some environments + + # discrete action space + if self.discrete_action_encoding_type == 'one_hot': + # Stack latent_state with the one hot encoded action + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + + # transform action to one-hot encoding. + # action_one_hot shape: (batch_size, action_space_size), e.g., (8, 4) + action_one_hot = torch.zeros(action.shape[0], self.action_space_size, device=action.device) + # transform action to torch.int64 + action = action.long() + action_one_hot.scatter_(1, action, 1) + action_encoding = action_one_hot + elif self.discrete_action_encoding_type == 'not_one_hot': + action_encoding = action / self.action_space_size + if len(action_encoding.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action_encoding = action_encoding.unsqueeze(-1) + + action_encoding = action_encoding.to(latent_state.device).float() + # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or + # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type. + state_action_encoding = torch.cat((latent_state, action_encoding), dim=1) + + next_latent_state, reward = self.dynamics_network(state_action_encoding) + + if not self.state_norm: + return next_latent_state, reward + else: + next_latent_state_normalized = renormalize(next_latent_state) + return next_latent_state_normalized, reward + + def project(self, latent_state: torch.Tensor, with_grad=True) -> torch.Tensor: + """ + Overview: + Project the latent state to a lower dimension to calculate the self-supervised loss, which is proposed in EfficientZero. + For more details, please refer to the paper ``Exploring Simple Siamese Representation Learning``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - with_grad (:obj:`bool`): Whether to calculate gradient for the projection result. + Returns: + - proj (:obj:`torch.Tensor`): The result embedding vector of projection operation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - proj (:obj:`torch.Tensor`): :math:`(B, projection_output_dim)`, where B is batch_size. + + Examples: + >>> latent_state = torch.randn(256, 64) + >>> output = self.project(latent_state) + >>> output.shape # (256, 1024) + """ + proj = self.projection(latent_state) + + if with_grad: + # with grad, use prediction_head + return self.prediction_head(proj) + else: + return proj.detach() + + def get_params_mean(self) -> float: + return get_params_mean(self) + + +class DynamicsNetwork(nn.Module): + + def __init__( + self, + action_encoding_dim: int = 2, + num_channels: int = 64, + common_layer_num: int = 2, + fc_reward_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + res_connection_in_dynamics: bool = False, + ): + """ + Overview: + The definition of dynamics network in MuZero algorithm, which is used to predict next latent state + reward by the given current latent state and action. + The networks are mainly built on fully connected layers. 
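+ When ``res_connection_in_dynamics`` is True, the next latent state is fc_dynamics_1(state_action_encoding)
+ plus a residual link to the input latent state, and the reward head reads from fc_dynamics_2 applied to
+ that sum; otherwise a single fc_dynamics MLP produces the next latent state directly.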
+ Arguments: + - action_encoding_dim (:obj:`int`): The dimension of action encoding. + - num_channels (:obj:`int`): The num of channels in latent states. + - common_layer_num (:obj:`int`): The number of common layers in dynamics network. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - output_support_size (:obj:`int`): The size of categorical reward output. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection in dynamics network. + """ + super().__init__() + self.num_channels = num_channels + self.action_encoding_dim = action_encoding_dim + self.latent_state_dim = self.num_channels - self.action_encoding_dim + + self.res_connection_in_dynamics = res_connection_in_dynamics + if self.res_connection_in_dynamics: + self.fc_dynamics_1 = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + self.fc_dynamics_2 = MLP( + in_channels=self.latent_state_dim, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + else: + self.fc_dynamics = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + self.fc_reward_head = MLP( + in_channels=self.latent_state_dim, + hidden_channels=fc_reward_layers[0], + layer_num=2, + out_channels=output_support_size, + activation=activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + def forward(self, state_action_encoding: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Forward computation of the dynamics network. Predict the next latent state given current latent state and action. + Arguments: + - state_action_encoding (:obj:`torch.Tensor`): The state-action encoding, which is the concatenation of \ + latent state and action encoding, with shape (batch_size, num_channels, height, width). + Returns: + - next_latent_state (:obj:`torch.Tensor`): The next latent state, with shape (batch_size, latent_state_dim). + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + """ + if self.res_connection_in_dynamics: + # take the state encoding (e.g. 
latent_state), + # state_action_encoding[:, -self.action_encoding_dim:] is action encoding + latent_state = state_action_encoding[:, :-self.action_encoding_dim] + x = self.fc_dynamics_1(state_action_encoding) + # the residual link: add the latent_state to the state_action encoding + next_latent_state = x + latent_state + next_latent_state_encoding = self.fc_dynamics_2(next_latent_state) + else: + next_latent_state = self.fc_dynamics(state_action_encoding) + next_latent_state_encoding = next_latent_state + + reward = self.fc_reward_head(next_latent_state_encoding) + + return next_latent_state, reward + + def get_dynamic_mean(self) -> float: + return get_dynamic_mean(self) + + def get_reward_mean(self) -> float: + return get_reward_mean(self) + diff --git a/lzero/model/sampled_efficientzero_model_gcn.py b/lzero/model/sampled_efficientzero_model_gcn.py new file mode 100644 index 000000000..5747736eb --- /dev/null +++ b/lzero/model/sampled_efficientzero_model_gcn.py @@ -0,0 +1,534 @@ +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from ding.model.common import ReparameterizationHead +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType + +from .common import EZNetworkOutput +from .common_gcn import RepresentationNetworkGCN +from .efficientzero_model_mlp import DynamicsNetworkMLP +from .utils import renormalize, get_params_mean + + +@MODEL_REGISTRY.register('SampledEfficientZeroModelMLP') +class SampledEfficientZeroModelMLP(nn.Module): + + def __init__( + self, + robot_state_dim: int = 10, + human_state_dim: int = 10, + robot_num: int = 5, + human_num: int = 5, + action_space_size: int = 6, + latent_state_dim: int = 256, + lstm_hidden_size: int = 512, + fc_reward_layers: SequenceType = [32], + fc_value_layers: SequenceType = [32], + fc_policy_layers: SequenceType = [32], + reward_support_size: int = 601, + value_support_size: int = 601, + proj_hid: int = 1024, + proj_out: int = 1024, + pred_hid: int = 512, + pred_out: int = 1024, + self_supervised_learning_loss: bool = True, + categorical_distribution: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + state_norm: bool = False, + # ============================================================== + # specific sampled related config + # ============================================================== + continuous_action_space: bool = False, + num_of_sampled_actions: int = 6, + sigma_type='conditioned', + fixed_sigma_value: float = 0.3, + bound_type: str = None, + norm_type: str = 'BN', + discrete_action_encoding_type: str = 'one_hot', + res_connection_in_dynamics: bool = False, + *args, + **kwargs, + ): + """ + Overview: + The definition of the network model of Sampled EfficientZero, which is a generalization version for 1D vector obs. + The networks are mainly built on fully connected layers. + Sampled EfficientZero model consists of a representation network, a dynamics network and a prediction network. + The representation network is an MLP network which maps the raw observation to a latent state. + The dynamics network is an MLP+LSTM network which predicts the next latent state, reward_hidden_state and value_prefix given the current latent state and action. + The prediction network is an MLP network which predicts the value and policy given the current latent state. + Arguments: + - observation_shape (:obj:`int`): Observation space shape, e.g. 8 for Lunarlander. 
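+ - robot_state_dim (:obj:`int`): The dimension of each robot state vector.
+ - human_state_dim (:obj:`int`): The dimension of each human state vector.
+ - robot_num (:obj:`int`): The number of robots (UAVs) in the scene.
+ - human_num (:obj:`int`): The number of humans in the scene.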
+            - robot_state_dim (:obj:`int`): The dimension of a single robot (UAV) state, e.g. 4 for CrowdSim.
+            - human_state_dim (:obj:`int`): The dimension of a single human state, e.g. 4 for CrowdSim.
+            - robot_num (:obj:`int`): The number of robots (UAVs) in the environment.
+            - human_num (:obj:`int`): The number of humans in the environment.
+            - action_space_size: (:obj:`int`): Action space size, which is an integer number. For discrete action space, it is the number of discrete actions, \
+                e.g. 4 for Lunarlander. For continuous action space, it is the dimension of the continuous action, e.g. 4 for bipedalwalker.
+            - latent_state_dim (:obj:`int`): The dimension of latent state, such as 256.
+            - lstm_hidden_size (:obj:`int`): The hidden size of LSTM in dynamics network to predict value_prefix.
+            - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head).
+            - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head).
+            - fc_policy_layers (:obj:`SequenceType`): The number of hidden layers used in policy head (MLP head).
+            - reward_support_size (:obj:`int`): The size of categorical reward output.
+            - value_support_size (:obj:`int`): The size of categorical value output.
+            - proj_hid (:obj:`int`): The size of projection hidden layer.
+            - proj_out (:obj:`int`): The size of projection output layer.
+            - pred_hid (:obj:`int`): The size of prediction hidden layer.
+            - pred_out (:obj:`int`): The size of prediction output layer.
+            - self_supervised_learning_loss (:obj:`bool`): Whether to use self_supervised_learning related networks in Sampled EfficientZero model, default set it to True.
+            - categorical_distribution (:obj:`bool`): Whether to use discrete support to represent categorical distribution for value, reward/value_prefix.
+            - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often uses in-place \
+                operation to speed up, e.g. ReLU(inplace=True).
+            - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True.
+            - state_norm (:obj:`bool`): Whether to use normalization for latent states, default sets it to False.
+            # ==============================================================
+            # specific sampled related config
+            # ==============================================================
+            - continuous_action_space (:obj:`bool`): The type of action space. Default sets it to False.
+            - num_of_sampled_actions (:obj:`int`): The number of sampled actions, i.e. the K in the original Sampled MuZero paper.
+            # see ``ReparameterizationHead`` in ``ding.model.common.head`` for more details about the following arguments.
+            - sigma_type (:obj:`str`): The type of sigma in policy head of prediction network, options={'conditioned', 'fixed'}.
+            - fixed_sigma_value (:obj:`float`): The fixed sigma value in policy head of prediction network.
+            - bound_type (:obj:`str`): The type of bound in networks. Default sets it to None.
+            - norm_type (:obj:`str`): The type of normalization in networks. Default sets it to 'BN'.
+            - discrete_action_encoding_type (:obj:`str`): The type of encoding for discrete action. Default sets it to 'one_hot'. options = {'one_hot', 'not_one_hot'}
+            - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection for dynamics network, default set it to False.
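+        Examples:
+            >>> # Illustrative usage only; the sizes below are placeholders (2 UAVs, 10 humans, 4-dim states)
+            >>> # and the observation follows the '2-dim-array' layout of shape (B, robot_num + human_num, state_dim).
+            >>> model = SampledEfficientZeroModelMLP(
+            ...     robot_state_dim=4, human_state_dim=4, robot_num=2, human_num=10,
+            ...     action_space_size=25, latent_state_dim=256)
+            >>> obs = torch.randn(8, 2 + 10, 4)
+            >>> output = model.initial_inference(obs)
+            >>> output.latent_state.shape  # torch.Size([8, 256])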
+ """ + super(SampledEfficientZeroModelMLP, self).__init__() + if not categorical_distribution: + self.reward_support_size = 1 + self.value_support_size = 1 + else: + self.reward_support_size = reward_support_size + self.value_support_size = value_support_size + + self.continuous_action_space = continuous_action_space + # self.observation_shape = observation_shape + self.robot_state_dim = robot_state_dim + self.human_state_dim = human_state_dim + self.robot_num = robot_num + self.human_num = human_num + self.action_space_size = action_space_size + # The dim of action space. For discrete action space, it is 1. + # For continuous action space, it is the dimension of continuous action. + self.action_space_dim = action_space_size if self.continuous_action_space else 1 + assert discrete_action_encoding_type in ['one_hot', 'not_one_hot'], discrete_action_encoding_type + self.discrete_action_encoding_type = discrete_action_encoding_type + if self.continuous_action_space: + self.action_encoding_dim = action_space_size + else: + if self.discrete_action_encoding_type == 'one_hot': + self.action_encoding_dim = action_space_size + elif self.discrete_action_encoding_type == 'not_one_hot': + self.action_encoding_dim = 1 + + self.lstm_hidden_size = lstm_hidden_size + self.latent_state_dim = latent_state_dim + self.fc_reward_layers = fc_reward_layers + self.fc_value_layers = fc_value_layers + self.fc_policy_layers = fc_policy_layers + self.proj_hid = proj_hid + self.proj_out = proj_out + self.pred_hid = pred_hid + self.pred_out = pred_out + + self.last_linear_layer_init_zero = last_linear_layer_init_zero + self.state_norm = state_norm + self.self_supervised_learning_loss = self_supervised_learning_loss + + self.sigma_type = sigma_type + self.fixed_sigma_value = fixed_sigma_value + self.bound_type = bound_type + self.norm_type = norm_type + self.num_of_sampled_actions = num_of_sampled_actions + self.res_connection_in_dynamics = res_connection_in_dynamics + + self.representation_network = RepresentationNetworkGCN( + robot_state_dim = robot_state_dim, + human_state_dim = human_state_dim, + robot_num = robot_num, + human_num = human_num, + hidden_channels=self.latent_state_dim, + norm_type=norm_type + ) + + self.dynamics_network = DynamicsNetworkMLP( + action_encoding_dim=self.action_encoding_dim, + num_channels=self.latent_state_dim + self.action_encoding_dim, + common_layer_num=2, + lstm_hidden_size=self.lstm_hidden_size, + fc_reward_layers=self.fc_reward_layers, + output_support_size=self.reward_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + res_connection_in_dynamics=self.res_connection_in_dynamics, + ) + + self.prediction_network = PredictionNetworkMLP( + continuous_action_space=self.continuous_action_space, + action_space_size=self.action_space_size, + num_channels=self.latent_state_dim, + fc_value_layers=self.fc_value_layers, + fc_policy_layers=self.fc_policy_layers, + output_support_size=self.value_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + sigma_type=self.sigma_type, + fixed_sigma_value=self.fixed_sigma_value, + bound_type=self.bound_type, + norm_type=self.norm_type, + ) + + if self.self_supervised_learning_loss: + # self_supervised_learning_loss related network proposed in EfficientZero + self.projection_input_dim = latent_state_dim + self.projection = nn.Sequential( + nn.Linear(self.projection_input_dim, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_hid), 
nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_out), nn.BatchNorm1d(self.proj_out) + ) + self.prediction_head = nn.Sequential( + nn.Linear(self.proj_out, self.pred_hid), + nn.BatchNorm1d(self.pred_hid), + activation, + nn.Linear(self.pred_hid, self.pred_out), + ) + + def initial_inference(self, obs: torch.Tensor) -> EZNetworkOutput: + """ + Overview: + Initial inference of SampledEfficientZero model, which is the first step of the SampledEfficientZero model. + To perform the initial inference, we first use the representation network to obtain the "latent_state" of the observation. + Then we use the prediction network to predict the "value" and "policy_logits" of the "latent_state", and + also prepare the zeros-like ``reward_hidden_state`` for the next step of the Sampled EfficientZero model. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns (EZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. \ + In initial inference, we set it to zero vector. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The hidden state of LSTM about reward. In initial inference, \ + we set it to the zeros-like hidden state (H and C). + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size. + """ + batch_size = obs.size(0) + latent_state = self._representation(obs) + policy_logits, value = self._prediction(latent_state) + # zero initialization for reward hidden states + # (hn, cn), each element shape is (layer_num=1, batch_size, lstm_hidden_size) + reward_hidden_state = ( + torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device), torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device) + ) + return EZNetworkOutput(value, [0. for _ in range(batch_size)], policy_logits, latent_state, reward_hidden_state) + + def recurrent_inference( + self, latent_state: torch.Tensor, reward_hidden_state: torch.Tensor, action: torch.Tensor + ) -> EZNetworkOutput: + """ + Overview: + Recurrent inference of Sampled EfficientZero model, which is the rollout step of the Sampled EfficientZero model. + To perform the recurrent inference, we first use the dynamics network to predict ``next_latent_state``, + ``reward_hidden_state``, ``value_prefix`` by the given current ``latent_state`` and ``action``. + We then use the prediction network to predict the ``value`` and ``policy_logits``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. 
+ Returns (EZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - next_latent_state (:obj:`torch.Tensor`): The predicted next latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + Shapes: + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size. + """ + next_latent_state, reward_hidden_state, value_prefix = self._dynamics(latent_state, reward_hidden_state, action) + policy_logits, value = self._prediction(next_latent_state) + return EZNetworkOutput(value, value_prefix, policy_logits, next_latent_state, reward_hidden_state) + + def _representation(self, observation: torch.Tensor) -> Tuple[torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + latent_state = self.representation_network(observation) + if self.state_norm: + latent_state = renormalize(latent_state) + return latent_state + + def _prediction(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + """ + policy, value = self.prediction_network(latent_state) + return policy, value + + def _dynamics(self, latent_state: torch.Tensor, reward_hidden_state: Tuple, + action: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor], torch.Tensor]: + """ + Overview: + Concatenate ``latent_state`` and ``action`` and use the dynamics network to predict ``next_latent_state`` + ``value_prefix`` and ``next_reward_hidden_state``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. 
+ - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns: + - next_latent_state (:obj:`torch.Tensor`): The predicted latent state of the next timestep. + - next_reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + """ + # NOTE: the discrete action encoding type is important for some environments + + if not self.continuous_action_space: + # discrete action space + if self.discrete_action_encoding_type == 'one_hot': + # Stack latent_state with the one hot encoded action + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + + # transform action to one-hot encoding. + # action_one_hot shape: (batch_size, action_space_size), e.g., (8, 4) + action_one_hot = torch.zeros(action.shape[0], self.action_space_size, device=action.device) + # transform action to torch.int64 + action = action.long() + action_one_hot.scatter_(1, action, 1) + action_encoding = action_one_hot + elif self.discrete_action_encoding_type == 'not_one_hot': + action_encoding = action / self.action_space_size + if len(action_encoding.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action_encoding = action_encoding.unsqueeze(-1) + else: + # continuous action space + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + elif len(action.shape) == 3: + # (batch_size, action_dim, 1) -> (batch_size, action_dim) + # e.g., torch.Size([8, 2, 1]) -> torch.Size([8, 2]) + action = action.squeeze(-1) + + action_encoding = action + + action_encoding = action_encoding.to(latent_state.device).float() + # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or + # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type. + state_action_encoding = torch.cat((latent_state, action_encoding), dim=1) + + next_latent_state, next_reward_hidden_state, value_prefix = self.dynamics_network( + state_action_encoding, reward_hidden_state + ) + + if not self.state_norm: + return next_latent_state, next_reward_hidden_state, value_prefix + else: + next_latent_state_normalized = renormalize(next_latent_state) + return next_latent_state_normalized, next_reward_hidden_state, value_prefix + + def project(self, latent_state: torch.Tensor, with_grad=True) -> torch.Tensor: + """ + Overview: + Project the latent state to a lower dimension to calculate the self-supervised loss, which is proposed in EfficientZero. + For more details, please refer to the paper ``Exploring Simple Siamese Representation Learning``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - with_grad (:obj:`bool`): Whether to calculate gradient for the projection result. 
+        Returns:
+            - proj (:obj:`torch.Tensor`): The result embedding vector of projection operation.
+        Shapes:
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - proj (:obj:`torch.Tensor`): :math:`(B, projection_output_dim)`, where B is batch_size.
+
+        Examples:
+            >>> latent_state = torch.randn(256, 64)
+            >>> output = self.project(latent_state)
+            >>> output.shape # (256, 1024)
+        """
+        proj = self.projection(latent_state)
+
+        if with_grad:
+            # with grad, use prediction_head
+            return self.prediction_head(proj)
+        else:
+            return proj.detach()
+
+    def get_params_mean(self):
+        return get_params_mean(self)
+
+
+class PredictionNetworkMLP(nn.Module):
+
+    def __init__(
+        self,
+        continuous_action_space,
+        action_space_size,
+        num_channels,
+        common_layer_num: int = 2,
+        fc_value_layers: SequenceType = [32],
+        fc_policy_layers: SequenceType = [32],
+        output_support_size: int = 601,
+        last_linear_layer_init_zero: bool = True,
+        activation: Optional[nn.Module] = nn.ReLU(inplace=True),
+        # ==============================================================
+        # specific sampled related config
+        # ==============================================================
+        sigma_type='conditioned',
+        fixed_sigma_value: float = 0.3,
+        bound_type: str = None,
+        norm_type: str = 'BN',
+    ):
+        """
+        Overview:
+            The definition of policy and value prediction network, which is used to predict value and policy by the
+            given latent state.
+            The networks are mainly built on fully connected layers.
+        Arguments:
+            - continuous_action_space (:obj:`bool`): The type of action space. Default sets it to False.
+            - action_space_size: (:obj:`int`): Action space size, usually an integer number. For discrete action \
+                space, it is the number of discrete actions. For continuous action space, it is the dimension of \
+                continuous action.
+            - num_channels (:obj:`int`): The number of channels in latent states.
+            - fc_value_layers (:obj:`SequenceType`): The number of hidden layers of the value prediction head (MLP head).
+            - fc_policy_layers (:obj:`SequenceType`): The number of hidden layers of the policy prediction head (MLP head).
+            - output_support_size (:obj:`int`): The size of categorical value output.
+            - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True.
+            # ==============================================================
+            # specific sampled related config
+            # ==============================================================
+            # see ``ReparameterizationHead`` in ``ding.model.common.head`` for more details about the following arguments.
+            - sigma_type (:obj:`str`): The type of sigma in policy head of prediction network, options={'conditioned', 'fixed'}.
+            - fixed_sigma_value (:obj:`float`): The fixed sigma value in policy head of prediction network.
+            - bound_type (:obj:`str`): The type of bound in networks. Default sets it to None.
+            - norm_type (:obj:`str`): The type of normalization in networks. Default sets it to 'BN'.
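+        Examples:
+            >>> # Illustrative usage with placeholder sizes; not taken from any specific config.
+            >>> net = PredictionNetworkMLP(continuous_action_space=False, action_space_size=25, num_channels=256)
+            >>> latent_state = torch.randn(8, 256)
+            >>> policy, value = net(latent_state)
+            >>> policy.shape, value.shape  # (torch.Size([8, 25]), torch.Size([8, 601]))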
+ """ + super().__init__() + self.num_channels = num_channels + self.continuous_action_space = continuous_action_space + self.norm_type = norm_type + self.sigma_type = sigma_type + self.fixed_sigma_value = fixed_sigma_value + self.bound_type = bound_type + self.action_space_size = action_space_size + if self.continuous_action_space: + self.action_encoding_dim = self.action_space_size + else: + self.action_encoding_dim = 1 + + # ******* common backbone ****** + self.fc_prediction_common = MLP( + in_channels=self.num_channels, + hidden_channels=self.num_channels, + out_channels=self.num_channels, + layer_num=common_layer_num, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + # ******* value and policy head ****** + self.fc_value_head = MLP( + in_channels=self.num_channels, + hidden_channels=fc_value_layers[0], + out_channels=output_support_size, + layer_num=2, + activation=activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + # last_linear_layer_init_zero=True is beneficial for convergence speed. + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + # sampled related core code + if self.continuous_action_space: + self.fc_policy_head = ReparameterizationHead( + input_size=self.num_channels, + output_size=action_space_size, + layer_num=2, + sigma_type=self.sigma_type, + fixed_sigma_value=self.fixed_sigma_value, + activation=nn.ReLU(), + norm_type=None, + bound_type=self.bound_type + ) + else: + self.fc_policy_head = MLP( + in_channels=self.num_channels, + hidden_channels=fc_policy_layers[0], + out_channels=action_space_size, + layer_num=2, + activation=activation, + norm_type=self.norm_type, + output_activation=False, + output_norm=False, + # last_linear_layer_init_zero=True is beneficial for convergence speed. + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + def forward(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Forward computation of the prediction network. + Arguments: + - latent_state (:obj:`torch.Tensor`): input tensor with shape (B, in_channels). + Returns: + - policy (:obj:`torch.Tensor`): policy tensor. If action space is discrete, shape is (B, action_space_size). + If action space is continuous, shape is (B, action_space_size * 2). + - value (:obj:`torch.Tensor`): value tensor with shape (B, output_support_size). + """ + x_prediction_common = self.fc_prediction_common(latent_state) + value = self.fc_value_head(x_prediction_common) + + # sampled related core code + policy = self.fc_policy_head(x_prediction_common) + if self.continuous_action_space: + policy = torch.cat([policy['mu'], policy['sigma']], dim=-1) + + return policy, value diff --git a/lzero/model/tests/test_common_gcn.py b/lzero/model/tests/test_common_gcn.py new file mode 100644 index 000000000..a7574b3b8 --- /dev/null +++ b/lzero/model/tests/test_common_gcn.py @@ -0,0 +1,102 @@ +import torch +import numpy as np +from torch import nn +from lzero.model.common_gcn import RepresentationNetworkGCN, RGCNLayer + +# ... + +class TestLightZeroEnvWrapper: + + # ... 
+ def test_representation_network_gcn_with_dict_obs(self): + robot_state_dim = 10 + human_state_dim = 5 + robot_num = 3 + human_num = 2 + hidden_channels = 64 + layer_num = 2 + activation = nn.ReLU(inplace=True) + last_linear_layer_init_zero = True + norm_type = 'BN' + + representation_network = RepresentationNetworkGCN( + robot_state_dim=robot_state_dim, + human_state_dim=human_state_dim, + robot_num=robot_num, + human_num=human_num, + hidden_channels=hidden_channels, + layer_num=layer_num, + activation=activation, + last_linear_layer_init_zero=last_linear_layer_init_zero, + norm_type=norm_type, + ) + + # Create dummy input + batch_size = 4 + x = { + 'robot_state': torch.randn(batch_size, robot_num, robot_state_dim), + 'human_state': torch.randn(batch_size, human_num, human_state_dim) + } + + # Forward pass + output = representation_network(x) + + # Check output shape + assert output.shape == (batch_size, hidden_channels) + + # Check output type + assert isinstance(output, torch.Tensor) + + # Check intermediate shape + assert representation_network.rgcn(x).shape == (batch_size, robot_num + human_num, hidden_channels) + + # Check intermediate type + assert isinstance(representation_network.rgcn(x), torch.Tensor) + + def test_representation_network_gcn_with_2d_array_obs(self): + robot_state_dim = 10 + human_state_dim = 10 # 2d_array_obs, so the dimensions must be the same + robot_num = 3 + human_num = 2 + hidden_channels = 64 + layer_num = 2 + activation = nn.ReLU(inplace=True) + last_linear_layer_init_zero = True + norm_type = 'BN' + + representation_network = RepresentationNetworkGCN( + robot_state_dim=robot_state_dim, + human_state_dim=human_state_dim, + robot_num=robot_num, + human_num=human_num, + hidden_channels=hidden_channels, + layer_num=layer_num, + activation=activation, + last_linear_layer_init_zero=last_linear_layer_init_zero, + norm_type=norm_type, + ) + + # Create dummy input + batch_size = 4 + x = torch.randn(batch_size, robot_num + human_num, robot_state_dim) + + # Forward pass + output = representation_network(x) + + # Check output shape + assert output.shape == (batch_size, hidden_channels) + + # Check output type + assert isinstance(output, torch.Tensor) + + # Check intermediate shape + assert representation_network.rgcn(x).shape == (batch_size, robot_num + human_num, hidden_channels) + + # Check intermediate type + assert isinstance(representation_network.rgcn(x), torch.Tensor) + +if __name__ == '__main__': + test = TestLightZeroEnvWrapper() + test.test_representation_network_gcn_with_dict_obs() + test.test_representation_network_gcn_with_2d_array_obs() + print("All tests passed.") \ No newline at end of file diff --git a/lzero/model/tests/test_rgcn.py b/lzero/model/tests/test_rgcn.py new file mode 100644 index 000000000..32b90b6bb --- /dev/null +++ b/lzero/model/tests/test_rgcn.py @@ -0,0 +1,50 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +from itertools import product +import unittest +from lzero.model.common_gcn import RGCNLayer + +class TestRGCNLayer(unittest.TestCase): + def setUp(self): + self.robot_state_dim = 10 + self.human_state_dim = 10 + self.similarity_function = 'embedded_gaussian' + self.batch_size = 4 + self.num_nodes = 5 # Suppose 5 robots and 5 humans + + # Create a RGCNLayer object + self.rgcn_layer = RGCNLayer( + robot_state_dim=self.robot_state_dim, + human_state_dim=self.human_state_dim, + similarity_function=self.similarity_function, + num_layer=2, + X_dim=32, + layerwise_graph=False, + skip_connection=True + ) + + 
# Creating dummy inputs + self.state = { + 'robot_state': torch.randn(self.batch_size, self.num_nodes, self.robot_state_dim), + 'human_state': torch.randn(self.batch_size, self.num_nodes, self.human_state_dim) + } + + def test_forward_shape(self): + # Forward pass + output = self.rgcn_layer(self.state) + expected_shape = (self.batch_size, self.num_nodes * 2, 32) # Since final_state_dim is set to X_dim + self.assertEqual(output.shape, expected_shape, "Output shape is incorrect.") + + def test_similarity_function(self): + # Check if the similarity matrix computation is working as expected + # This only checks for one similarity function due to space constraints + if self.similarity_function == 'embedded_gaussian': + X = torch.randn(self.batch_size, self.num_nodes * 2, 32) + A = self.rgcn_layer.compute_similarity_matrix(X) + self.assertEqual(A.shape, (self.batch_size, self.num_nodes * 2, self.num_nodes * 2), "Similarity matrix shape is incorrect.") + self.assertTrue(torch.all(A >= 0) and torch.all(A <= 1), "Similarity matrix values should be normalized.") + +# Running the tests +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/lzero/policy/muzero.py b/lzero/policy/muzero.py index 0acc66b07..6e12c3c3c 100644 --- a/lzero/policy/muzero.py +++ b/lzero/policy/muzero.py @@ -222,6 +222,8 @@ def default_model(self) -> Tuple[str, List[str]]: return 'MuZeroModel', ['lzero.model.muzero_model'] elif self._cfg.model.model_type == "mlp": return 'MuZeroModelMLP', ['lzero.model.muzero_model_mlp'] + elif self._cfg.model.model_type == "rgcn": + return 'MuZeroModelGCN', ['lzero.model.muzero_model_gcn'] else: raise ValueError("model type {} is not supported".format(self._cfg.model.model_type)) @@ -644,6 +646,9 @@ def _get_target_obs_index_in_step_k(self, step): elif self._cfg.model.model_type == 'mlp': beg_index = self._cfg.model.observation_shape * step end_index = self._cfg.model.observation_shape * (step + self._cfg.model.frame_stack_num) + elif self._cfg.model.model_type == 'rgcn': + beg_index = self._cfg.model.observation_shape * step + end_index = self._cfg.model.observation_shape * (step + self._cfg.model.frame_stack_num) return beg_index, end_index def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1, @@ -670,7 +675,10 @@ def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1 ``visit_count_distribution_entropy``, ``value``, ``pred_value``, ``policy_logits``. """ self._eval_model.eval() - active_eval_env_num = data.shape[0] + if type(data) is dict: + active_eval_env_num = data['robot_state'].shape[0] + else: + active_eval_env_num = data.shape[0] with torch.no_grad(): # data shape [B, S x C, W, H], e.g. 
{Tensor:(B, 12, 96, 96)} network_output = self._collect_model.initial_inference(data) diff --git a/lzero/worker/muzero_collector.py b/lzero/worker/muzero_collector.py index 55f0e9f85..ae516960c 100644 --- a/lzero/worker/muzero_collector.py +++ b/lzero/worker/muzero_collector.py @@ -414,7 +414,7 @@ def collect(self, stack_obs = prepare_observation(stack_obs, self.policy_config.model.model_type) # stack_obs = torch.from_numpy(stack_obs).to(self.policy_config.device).float() - stack_obs = torch.from_numpy(stack_obs).to(self.policy_config.device) + stack_obs = torch.from_numpy(stack_obs).to(self.policy_config.device).float() # ============================================================== # policy forward diff --git a/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py index e611aa6b1..bb6b9ced3 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py @@ -13,7 +13,7 @@ max_env_step = int(3e5) reanalyze_ratio = 0. robot_num = 2 -human_num = 10 # purdue +human_num = 59 # purdue # human_num = 33 # NCSU # human_num = 92 # KAIST one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] @@ -23,9 +23,11 @@ CrowdSim_muzero_config = dict( exp_name= - f'result/crowd_num_human/CrowdSim_muzero_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', + f'result/CrowdSim_muzerogcn_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( + obs_mode='1-dim-array', env_name='CrowdSim-v0', + dataset = 'purdue', robot_num = robot_num, human_num = human_num, one_uav_action_space = one_uav_action_space, @@ -38,8 +40,14 @@ ), policy=dict( model=dict( - robot_observation_shape=(robot_num, 4), - human_observation_shape=(human_num, 4), + # robot_observation_shape=(robot_num, 4), + # human_observation_shape=(human_num, 4), + observation_shape=(robot_num + human_num)*4, + obs_mode='1-dim-array', + robot_state_dim = 4, + human_state_dim = 4, + robot_num = robot_num, + human_num = human_num, action_space_size=(len(one_uav_action_space))**robot_num, model_type='rgcn', lstm_hidden_size=256, @@ -75,9 +83,9 @@ CrowdSim_muzero_create_config = dict( env=dict( type='crowdsim_lightzero', - import_names=['zoo.CrowdSim.envs.CrowdSim_env'], + import_names=['zoo.CrowdSim.envs.crowdsim_lightzero_env'], ), - env_manager=dict(type='subprocess'), + env_manager=dict(type='base'), policy=dict( type='muzero', import_names=['lzero.policy.muzero'], diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index c27a888d5..fe058d8fd 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -3,10 +3,10 @@ import logging import random import gym -from shapely.geometry import Point +# from shapely.geometry import Point import numpy as np -import folium -from folium.plugins import TimestampedGeoJson, AntPath +# import folium +# from folium.plugins import TimestampedGeoJson, AntPath from zoo.CrowdSim.envs.Crowdsim.env.model.utils import * from zoo.CrowdSim.envs.Crowdsim.env.model.mdp import HumanState, RobotState, JointState diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py index 0bdedd202..e28a1baac 100644 --- a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -44,6 +44,16 @@ def __init__(self, cfg: dict = {}) -> None: self._action_space.seed(0) # default seed 
self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, ), dtype=np.float32) self._continuous = False + # obs_mode 'dict': {'robot_state': robot_state, 'human_state': human_state} + # obs_mode '2-dim-array': np.concatenate((robot_state, human_state), axis=0) + # obs_mode '1-dim-array': np.concatenate((robot_state, human_state), axis=0).flatten() + self.obs_mode = self._cfg.get('obs_mode', '2-dim-array') + assert self.obs_mode in ['dict', '2-dim-array', '1-dim-array'], "obs_mode should be 'dict' or '2-dim-array' or '1-dim-array'!" + # action_mode 'combine': combine all robot actions into one action, action space size = one_uav_action_n**robot_num + # action_mode 'separate': separate robot actions, shape (robot_num,), for each robot action space size = one_uav_action_n + self.action_mode = self._cfg.get('action_mode', 'combine') + assert self.action_mode in ['combine', 'separate'], "action_mode should be 'combine' or 'separate'!" + def reset(self) -> np.ndarray: if not self._init_flag: @@ -60,7 +70,13 @@ def reset(self) -> np.ndarray: # process obs raw_obs = self._env.reset() obs_list = list(raw_obs.to_tensor()) - obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + if self.obs_mode == 'dict': + obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + elif self.obs_mode == '2-dim-array': + # robot_state: (robot_num, 4), human_state: (human_num, 4) + obs = np.concatenate((obs_list[0], obs_list[1]), axis=0) + elif self.obs_mode == '1-dim-array': + obs = np.concatenate((obs_list[0], obs_list[1]), axis=0).flatten() action_mask = np.ones(self.action_space.n, 'int8') obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} @@ -77,13 +93,23 @@ def seed(self, seed: int, dynamic_seed: bool = True) -> None: np.random.seed(self._seed) def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: - if isinstance(action, np.ndarray) and action.shape == (1, ): - action = action.squeeze() # 0-dim array - real_action = self.real_action_space[action] + if self.action_mode == 'combine': + if isinstance(action, np.ndarray) and action.shape == (1, ): + action = action.squeeze() + real_action = self.real_action_space[action] + elif self.action_mode == 'separate': + assert isinstance(action, np.ndarray) and action.shape == (self._robot_num, ), "illegal action!" + real_action = tuple([self._cfg.one_uav_action_space[action[i]] for i in range(self._robot_num)]) assert isinstance(real_action, tuple) and len(real_action) == self._robot_num, "illegal action!" 
raw_obs, rew, done, info = self._env.step(real_action) obs_list = list(raw_obs.to_array()) - obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + if self.obs_mode == 'dict': + obs = {'robot_state': obs_list[0], 'human_state': obs_list[1]} + elif self.obs_mode == '2-dim-array': + # robot_state: (robot_num, 4), human_state: (human_num, 4) + obs = np.concatenate((obs_list[0], obs_list[1]), axis=0) + elif self.obs_mode == '1-dim-array': + obs = np.concatenate((obs_list[0], obs_list[1]), axis=0).flatten() self._eval_episode_return += rew if done: diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py index 47f483717..b5721d0c3 100644 --- a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -8,13 +8,15 @@ dataset = 'purdue', robot_num = 2, human_num = 59, # purdue - one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] + one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]], + obs_mode = '2-dim-array', ) @ pytest.mark.envtest class TestCrowdSimEnv: - def test_naive(self): + def test_obs_dict(self): + mcfg['obs_mode'] = 'dict' env = CrowdSimEnv(mcfg) env.seed(314) assert env._seed == 314 @@ -33,3 +35,37 @@ def test_naive(self): assert timestep.reward.shape == (1, ) print(env.observation_space, env.action_space, env.reward_space) env.close() + + def test_obs_2_dim_array(self): + mcfg['obs_mode'] = '2-dim-array' + env = CrowdSimEnv(mcfg) + env.seed(314) + assert env._seed == 314 + obs = env.reset() + assert obs['observation'].shape == (61, 4) + for i in range(10): + random_action = env.random_action() + timestep = env.step(random_action) + print(timestep) + assert timestep.obs['observation'].shape == (61, 4) + assert isinstance(timestep.done, bool) + assert timestep.reward.shape == (1, ) + print(env.observation_space, env.action_space, env.reward_space) + env.close() + + def test_obs_1_dim_array(self): + mcfg['obs_mode'] = '1-dim-array' + env = CrowdSimEnv(mcfg) + env.seed(314) + assert env._seed == 314 + obs = env.reset() + assert obs['observation'].shape == (244, ) + for i in range(10): + random_action = env.random_action() + timestep = env.step(random_action) + print(timestep) + assert timestep.obs['observation'].shape == (244, ) + assert isinstance(timestep.done, bool) + assert timestep.reward.shape == (1, ) + print(env.observation_space, env.action_space, env.reward_space) + env.close() \ No newline at end of file From c99db40f0b134110b46434d6cd304bd95828202b Mon Sep 17 00:00:00 2001 From: nighood Date: Fri, 3 May 2024 14:57:46 +0800 Subject: [PATCH 06/16] feature(rjy): add multi-head policy(combine logits) --- lzero/agent/muzero.py | 3 + lzero/mcts/utils.py | 4 +- lzero/model/muzero_model_md.py | 558 ++++++++++++++++++ lzero/policy/muzero.py | 4 +- lzero/worker/muzero_evaluator.py | 30 +- .../config/crowdsim_muzero_md_config.py | 117 ++++ 6 files changed, 698 insertions(+), 18 deletions(-) create mode 100644 lzero/model/muzero_model_md.py create mode 100644 zoo/CrowdSim/config/crowdsim_muzero_md_config.py diff --git a/lzero/agent/muzero.py b/lzero/agent/muzero.py index b087377b5..26ffc6f04 100644 --- a/lzero/agent/muzero.py +++ b/lzero/agent/muzero.py @@ -113,6 +113,9 @@ def __init__( elif self.cfg.policy.model.model_type == 'rgcn': from lzero.model.muzero_model_gcn import MuZeroModelGCN model = MuZeroModelGCN(**self.cfg.policy.model) + elif self.cfg.policy.model.model_type == 'mlp_md': + from lzero.model.muzero_model_md import 
MuZeroModelMD + model = MuZeroModelMD(**self.cfg.policy.model) else: raise NotImplementedError if self.cfg.policy.cuda and torch.cuda.is_available(): diff --git a/lzero/mcts/utils.py b/lzero/mcts/utils.py index 1861cb2a9..80e3f588e 100644 --- a/lzero/mcts/utils.py +++ b/lzero/mcts/utils.py @@ -97,7 +97,7 @@ def prepare_observation(observation_list, model_type='conv'): Returns: - np.ndarray: Reshaped array of observations. """ - assert model_type in ['conv', 'mlp', 'rgcn'], "model_type must be either 'conv' or 'mlp'" + assert model_type in ['conv', 'mlp', 'rgcn', 'mlp_md'], "model_type must be either 'conv', 'mlp', 'rgcn' or 'mlp_md'" observation_array = np.array(observation_list) batch_size = observation_array.shape[0] @@ -110,7 +110,7 @@ def prepare_observation(observation_list, model_type='conv'): _, stack_num, channels, width, height = observation_array.shape observation_array = observation_array.reshape(batch_size, stack_num * channels, width, height) - elif model_type == 'mlp': + elif model_type == 'mlp' or model_type == 'mlp_md': if observation_array.ndim == 3: # Flatten the last two dimensions observation_array = observation_array.reshape(batch_size, -1) diff --git a/lzero/model/muzero_model_md.py b/lzero/model/muzero_model_md.py new file mode 100644 index 000000000..187509ac8 --- /dev/null +++ b/lzero/model/muzero_model_md.py @@ -0,0 +1,558 @@ +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType +from ding.model.common.head import MultiHead, DiscreteHead + +from .common import MZNetworkOutput, RepresentationNetworkMLP +from .utils import renormalize, get_params_mean, get_dynamic_mean, get_reward_mean + + +@MODEL_REGISTRY.register('MuZeroModelMD') +class MuZeroModelMD(nn.Module): + + def __init__( + self, + agent_num: int, + output_separate_logit: bool = False, + observation_shape: int = 2, + single_agent_action_size: int = 5, + action_space_size: int = 6, + latent_state_dim: int = 256, + fc_reward_layers: SequenceType = [32], + fc_value_layers: SequenceType = [32], + reward_support_size: int = 601, + value_support_size: int = 601, + proj_hid: int = 1024, + proj_out: int = 1024, + pred_hid: int = 512, + pred_out: int = 1024, + self_supervised_learning_loss: bool = False, + categorical_distribution: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + state_norm: bool = False, + discrete_action_encoding_type: str = 'one_hot', + norm_type: Optional[str] = 'BN', + res_connection_in_dynamics: bool = False, + *args, + **kwargs + ): + """ + Overview: + The definition of the network model of MuZero, which is a generalization version for 1D vector obs. + The networks are mainly built on fully connected layers. + The representation network is an MLP network which maps the raw observation to a latent state. + The dynamics network is an MLP network which predicts the next latent state, and reward given the current latent state and action. + The prediction network is an network with agent_num multihead which predicts the value and policy given the current latent state. + Arguments: + - agent_num (:obj:`int`): The number of agents in the environment. + - output_separate_logit (:obj:`bool`): Whether to output separate logit for each action. + - observation_shape (:obj:`int`): Observation space shape, e.g. 8 for Lunarlander. + - action_space_size: (:obj:`int`): Combinational action space size. 
+ - single_agent_action_size: (:obj:`int`): The size of action space for single agent. + - latent_state_dim (:obj:`int`): The dimension of latent state, such as 256. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head). + - reward_support_size (:obj:`int`): The size of categorical reward output + - value_support_size (:obj:`int`): The size of categorical value output. + - proj_hid (:obj:`int`): The size of projection hidden layer. + - proj_out (:obj:`int`): The size of projection output layer. + - pred_hid (:obj:`int`): The size of prediction hidden layer. + - pred_out (:obj:`int`): The size of prediction output layer. + - self_supervised_learning_loss (:obj:`bool`): Whether to use self_supervised_learning related networks in MuZero model, default set it to False. + - categorical_distribution (:obj:`bool`): Whether to use discrete support to represent categorical distribution for value, reward/value_prefix. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - state_norm (:obj:`bool`): Whether to use normalization for latent states, default sets it to True. + - discrete_action_encoding_type (:obj:`str`): The encoding type of discrete action, which can be 'one_hot' or 'not_one_hot'. + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection for dynamics network, default set it to False. + """ + super(MuZeroModelMD, self).__init__() + self.categorical_distribution = categorical_distribution + if not self.categorical_distribution: + self.reward_support_size = 1 + self.value_support_size = 1 + else: + self.reward_support_size = reward_support_size + self.value_support_size = value_support_size + + self.action_space_size = action_space_size + self.continuous_action_space = False + # The dim of action space. For discrete action space, it is 1. + # For continuous action space, it is the dimension of continuous action. 
+ self.action_space_dim = action_space_size if self.continuous_action_space else 1 + assert discrete_action_encoding_type in ['one_hot', 'not_one_hot'], discrete_action_encoding_type + self.discrete_action_encoding_type = discrete_action_encoding_type + if self.continuous_action_space: + self.action_encoding_dim = action_space_size + else: + if self.discrete_action_encoding_type == 'one_hot': + self.action_encoding_dim = action_space_size + elif self.discrete_action_encoding_type == 'not_one_hot': + self.action_encoding_dim = 1 + + self.latent_state_dim = latent_state_dim + self.proj_hid = proj_hid + self.proj_out = proj_out + self.pred_hid = pred_hid + self.pred_out = pred_out + self.self_supervised_learning_loss = self_supervised_learning_loss + self.last_linear_layer_init_zero = last_linear_layer_init_zero + self.state_norm = state_norm + self.res_connection_in_dynamics = res_connection_in_dynamics + + self.representation_network = RepresentationNetworkMLP( + observation_shape=observation_shape, hidden_channels=self.latent_state_dim, norm_type=norm_type + ) + + self.dynamics_network = DynamicsNetwork( + action_encoding_dim=self.action_encoding_dim, + num_channels=self.latent_state_dim + self.action_encoding_dim, + common_layer_num=2, + fc_reward_layers=fc_reward_layers, + output_support_size=self.reward_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + res_connection_in_dynamics=self.res_connection_in_dynamics, + ) + + self.prediction_network = PredictionNetworkMD( + agent_num=agent_num, + single_agent_action_size=single_agent_action_size, + num_channels=latent_state_dim, + fc_value_layers=fc_value_layers, + output_support_size=self.value_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + output_separate_logit=output_separate_logit, + ) + + if self.self_supervised_learning_loss: + # self_supervised_learning_loss related network proposed in EfficientZero + self.projection_input_dim = latent_state_dim + + self.projection = nn.Sequential( + nn.Linear(self.projection_input_dim, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_out), nn.BatchNorm1d(self.proj_out) + ) + self.prediction_head = nn.Sequential( + nn.Linear(self.proj_out, self.pred_hid), + nn.BatchNorm1d(self.pred_hid), + activation, + nn.Linear(self.pred_hid, self.pred_out), + ) + + def initial_inference(self, obs: torch.Tensor) -> MZNetworkOutput: + """ + Overview: + Initial inference of MuZero model, which is the first step of the MuZero model. + To perform the initial inference, we first use the representation network to obtain the "latent_state" of the observation. + Then we use the prediction network to predict the "value" and "policy_logits" of the "latent_state", and + also prepare the zeros-like ``reward`` for the next step of the MuZero model. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns (MZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. \ + In initial inference, we set it to zero vector. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. 
+ - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The hidden state of LSTM about reward. In initial inference, \ + we set it to the zeros-like hidden state (H and C). + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + batch_size = obs.size(0) + latent_state = self._representation(obs) + policy_logits, value = self._prediction(latent_state) + return MZNetworkOutput( + value, + [0. for _ in range(batch_size)], + policy_logits, + latent_state, + ) + + def recurrent_inference(self, latent_state: torch.Tensor, action: torch.Tensor) -> MZNetworkOutput: + """ + Overview: + Recurrent inference of MuZero model, which is the rollout step of the MuZero model. + To perform the recurrent inference, we first use the dynamics network to predict ``next_latent_state``, + ``reward`` by the given current ``latent_state`` and ``action``. + We then use the prediction network to predict the ``value`` and ``policy_logits``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input obs. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns (MZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - next_latent_state (:obj:`torch.Tensor`): The predicted next latent state. + Shapes: + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + """ + next_latent_state, reward = self._dynamics(latent_state, action) + policy_logits, value = self._prediction(next_latent_state) + return MZNetworkOutput(value, reward, policy_logits, next_latent_state) + + def _representation(self, observation: torch.Tensor) -> Tuple[torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. 
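+        Examples:
+            >>> # Illustrative only: 244 = (robot_num + human_num) * 4 under the CrowdSim '1-dim-array' obs_mode,
+            >>> # and the output shape assumes the default latent_state_dim=256.
+            >>> obs = torch.randn(8, 244)
+            >>> latent_state = self._representation(obs)
+            >>> latent_state.shape  # torch.Size([8, 256])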
+ """ + observation = observation.float() + latent_state = self.representation_network(observation) + if self.state_norm: + latent_state = renormalize(latent_state) + return latent_state + + def _prediction(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Use the representation network to encode the observations into latent state. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns: + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + """ + policy_logits, value = self.prediction_network(latent_state) + return policy_logits, value + + def _dynamics(self, latent_state: torch.Tensor, action: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Concatenate ``latent_state`` and ``action`` and use the dynamics network to predict ``next_latent_state`` + ``reward`` and ``next_reward_hidden_state``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns: + - next_latent_state (:obj:`torch.Tensor`): The predicted latent state of the next timestep. + - next_reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - reward (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + """ + # NOTE: the discrete action encoding type is important for some environments + + # discrete action space + if self.discrete_action_encoding_type == 'one_hot': + # Stack latent_state with the one hot encoded action + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + + # transform action to one-hot encoding. + # action_one_hot shape: (batch_size, action_space_size), e.g., (8, 4) + action_one_hot = torch.zeros(action.shape[0], self.action_space_size, device=action.device) + # transform action to torch.int64 + action = action.long() + action_one_hot.scatter_(1, action, 1) + action_encoding = action_one_hot + elif self.discrete_action_encoding_type == 'not_one_hot': + action_encoding = action / self.action_space_size + if len(action_encoding.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action_encoding = action_encoding.unsqueeze(-1) + + action_encoding = action_encoding.to(latent_state.device).float() + # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or + # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type. 
+ state_action_encoding = torch.cat((latent_state, action_encoding), dim=1) + + next_latent_state, reward = self.dynamics_network(state_action_encoding) + + if not self.state_norm: + return next_latent_state, reward + else: + next_latent_state_normalized = renormalize(next_latent_state) + return next_latent_state_normalized, reward + + def project(self, latent_state: torch.Tensor, with_grad=True) -> torch.Tensor: + """ + Overview: + Project the latent state to a lower dimension to calculate the self-supervised loss, which is proposed in EfficientZero. + For more details, please refer to the paper ``Exploring Simple Siamese Representation Learning``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - with_grad (:obj:`bool`): Whether to calculate gradient for the projection result. + Returns: + - proj (:obj:`torch.Tensor`): The result embedding vector of projection operation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - proj (:obj:`torch.Tensor`): :math:`(B, projection_output_dim)`, where B is batch_size. + + Examples: + >>> latent_state = torch.randn(256, 64) + >>> output = self.project(latent_state) + >>> output.shape # (256, 1024) + """ + proj = self.projection(latent_state) + + if with_grad: + # with grad, use prediction_head + return self.prediction_head(proj) + else: + return proj.detach() + + def get_params_mean(self) -> float: + return get_params_mean(self) + + +class DynamicsNetwork(nn.Module): + + def __init__( + self, + action_encoding_dim: int = 2, + num_channels: int = 64, + common_layer_num: int = 2, + fc_reward_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + res_connection_in_dynamics: bool = False, + ): + """ + Overview: + The definition of dynamics network in MuZero algorithm, which is used to predict next latent state + reward by the given current latent state and action. + The networks are mainly built on fully connected layers. + Arguments: + - action_encoding_dim (:obj:`int`): The dimension of action encoding. + - num_channels (:obj:`int`): The num of channels in latent states. + - common_layer_num (:obj:`int`): The number of common layers in dynamics network. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - output_support_size (:obj:`int`): The size of categorical reward output. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection in dynamics network. 
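+        Examples:
+            >>> # Illustrative sketch with placeholder sizes (latent_state_dim=256, action_encoding_dim=25).
+            >>> net = DynamicsNetwork(action_encoding_dim=25, num_channels=256 + 25,
+            ...     fc_reward_layers=[32], output_support_size=601)
+            >>> state_action_encoding = torch.randn(8, 256 + 25)
+            >>> next_latent_state, reward = net(state_action_encoding)
+            >>> next_latent_state.shape, reward.shape  # (torch.Size([8, 256]), torch.Size([8, 601]))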
+ """ + super().__init__() + self.num_channels = num_channels + self.action_encoding_dim = action_encoding_dim + self.latent_state_dim = self.num_channels - self.action_encoding_dim + + self.res_connection_in_dynamics = res_connection_in_dynamics + if self.res_connection_in_dynamics: + self.fc_dynamics_1 = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + self.fc_dynamics_2 = MLP( + in_channels=self.latent_state_dim, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + else: + self.fc_dynamics = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + self.fc_reward_head = MLP( + in_channels=self.latent_state_dim, + hidden_channels=fc_reward_layers[0], + layer_num=2, + out_channels=output_support_size, + activation=activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + def forward(self, state_action_encoding: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Forward computation of the dynamics network. Predict the next latent state given current latent state and action. + Arguments: + - state_action_encoding (:obj:`torch.Tensor`): The state-action encoding, which is the concatenation of \ + latent state and action encoding, with shape (batch_size, num_channels, height, width). + Returns: + - next_latent_state (:obj:`torch.Tensor`): The next latent state, with shape (batch_size, latent_state_dim). + - reward (:obj:`torch.Tensor`): The predicted reward for input state. + """ + if self.res_connection_in_dynamics: + # take the state encoding (e.g. 
latent_state), + # state_action_encoding[:, -self.action_encoding_dim:] is action encoding + latent_state = state_action_encoding[:, :-self.action_encoding_dim] + x = self.fc_dynamics_1(state_action_encoding) + # the residual link: add the latent_state to the state_action encoding + next_latent_state = x + latent_state + next_latent_state_encoding = self.fc_dynamics_2(next_latent_state) + else: + next_latent_state = self.fc_dynamics(state_action_encoding) + next_latent_state_encoding = next_latent_state + + reward = self.fc_reward_head(next_latent_state_encoding) + + return next_latent_state, reward + + def get_dynamic_mean(self) -> float: + return get_dynamic_mean(self) + + def get_reward_mean(self) -> float: + return get_reward_mean(self) + + + +class PredictionNetworkMD(nn.Module): + + def __init__( + self, + agent_num: int, + single_agent_action_size, + num_channels, + common_layer_num: int = 2, + fc_value_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + output_separate_logit: bool = False, + ): + """ + Overview: + The definition of policy and value prediction network with Multi-Layer Perceptron (MLP), + which is used to predict value and policy by the given latent state. Policy network is a multihead network, + which predicts the policy for each agent. + Arguments: + - agent_num (:obj:`int`): The number of agents in the environment. + - single_agent_action_size: (:obj:`int`): Action space size for single agent. + - num_channels (:obj:`int`): The channels of latent states. + - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head). + - output_support_size (:obj:`int`): The size of categorical value output. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of \ + dynamics/prediction mlp, default sets it to True. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + """ + super().__init__() + self.num_channels = num_channels + + # ******* common backbone ****** + self.fc_prediction_common = MLP( + in_channels=self.num_channels, + hidden_channels=self.num_channels, + out_channels=self.num_channels, + layer_num=common_layer_num, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + # ******* value and policy head ****** + self.fc_value_head = MLP( + in_channels=self.num_channels, + hidden_channels=fc_value_layers[0], + out_channels=output_support_size, + layer_num=len(fc_value_layers) + 1, + activation=activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + # last_linear_layer_init_zero=True is beneficial for convergence speed. + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + self.policy_multi_head = MultiHead( + head_cls=DiscreteHead, + hidden_size=self.num_channels, + output_size_list=[single_agent_action_size for _ in range(agent_num)], + ) + self.output_separate_logit = output_separate_logit + + def forward(self, latent_state: torch.Tensor): + """ + Overview: + Forward computation of the prediction network. 
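+            Each agent has its own discrete policy head. When ``output_separate_logit`` is False, the per-agent
+            logits are combined sample by sample (Cartesian product followed by a product over agents) into
+            joint-action logits over the product action space of size ``single_agent_action_size ** agent_num``.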
+ Arguments: + - latent_state (:obj:`torch.Tensor`): input tensor with shape (B, latent_state_dim). + Returns: + - policy (:obj:`torch.Tensor`): policy tensor with shape (B, action_space_size). + - value (:obj:`torch.Tensor`): value tensor with shape (B, output_support_size). + """ + latent_state = latent_state.to(torch.float32) + x_prediction_common = self.fc_prediction_common(latent_state) + + value = self.fc_value_head(x_prediction_common) + # policy_list: {'logit': [policy1, policy2, ...],} + # policyi shape: (B, action_space_size) + policy_list = self.policy_multi_head(x_prediction_common)['logit'] + if not self.output_separate_logit: + # The joint action space policy is the product of each agent policy + # policy shape: (B, action_space_size^^agent_num) + batch_size = latent_state.size(0) + joint_logits_batches = [] + for i in range(batch_size): + current_batch = [policy[i] for policy in policy_list] + cartesian_prod_result = torch.cartesian_prod(*current_batch) + joint_logits = cartesian_prod_result.prod(dim=1) + joint_logits_batches.append(joint_logits) + policy = torch.stack(joint_logits_batches) + else: + # policy_list: [policy1, policy2, ...] + # policy sahpe: (B, agent_num, action_space_size) + policy = torch.stack(policy_list, dim=1) + return policy, value diff --git a/lzero/policy/muzero.py b/lzero/policy/muzero.py index 6e12c3c3c..d4e49c30f 100644 --- a/lzero/policy/muzero.py +++ b/lzero/policy/muzero.py @@ -224,6 +224,8 @@ def default_model(self) -> Tuple[str, List[str]]: return 'MuZeroModelMLP', ['lzero.model.muzero_model_mlp'] elif self._cfg.model.model_type == "rgcn": return 'MuZeroModelGCN', ['lzero.model.muzero_model_gcn'] + elif self._cfg.model.model_type == "mlp_md": + return 'MuZeroModelMD', ['lzero.model.muzero_model_md'] else: raise ValueError("model type {} is not supported".format(self._cfg.model.model_type)) @@ -643,7 +645,7 @@ def _get_target_obs_index_in_step_k(self, step): if self._cfg.model.model_type == 'conv': beg_index = self._cfg.model.image_channel * step end_index = self._cfg.model.image_channel * (step + self._cfg.model.frame_stack_num) - elif self._cfg.model.model_type == 'mlp': + elif self._cfg.model.model_type == 'mlp' or self._cfg.model.model_type == 'mlp_md': beg_index = self._cfg.model.observation_shape * step end_index = self._cfg.model.observation_shape * (step + self._cfg.model.frame_stack_num) elif self._cfg.model.model_type == 'rgcn': diff --git a/lzero/worker/muzero_evaluator.py b/lzero/worker/muzero_evaluator.py index 6d871fc33..a718d33b7 100644 --- a/lzero/worker/muzero_evaluator.py +++ b/lzero/worker/muzero_evaluator.py @@ -337,22 +337,22 @@ def eval( action_mask_dict[env_id] = to_ndarray(obs['action_mask']) to_play_dict[env_id] = to_ndarray(obs['to_play']) - dones[env_id] = done - if t.done: - # Env reset is done by env_manager automatically. - self._policy.reset([env_id]) - reward = t.info['eval_episode_return'] - # 'performance_info' and 'episode_info' only choose one - if 'performance_info' in t.info: - eval_monitor.update_info(env_id, t.info['performance_info']) - elif 'episode_info' in t.info: - eval_monitor.update_info(env_id, t.info['episode_info']) - eval_monitor.update_reward(env_id, reward) - self._logger.info( - "[EVALUATOR]env {} finish episode, final reward: {}, current episode: {}".format( - env_id, eval_monitor.get_latest_reward(env_id), eval_monitor.get_current_episode() + dones[env_id] = done + if t.done: + # Env reset is done by env_manager automatically. 
+ self._policy.reset([env_id]) + reward = t.info['eval_episode_return'] + # 'performance_info' and 'episode_info' only choose one + if 'performance_info' in t.info: + eval_monitor.update_info(env_id, t.info['performance_info']) + elif 'episode_info' in t.info: + eval_monitor.update_info(env_id, t.info['episode_info']) + eval_monitor.update_reward(env_id, reward) + self._logger.info( + "[EVALUATOR]env {} finish episode, final reward: {}, current episode: {}".format( + env_id, eval_monitor.get_latest_reward(env_id), eval_monitor.get_current_episode() + ) ) - ) # reset the finished env and init game_segments if n_episode > self._env_num: diff --git a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py new file mode 100644 index 000000000..ec4453ff8 --- /dev/null +++ b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py @@ -0,0 +1,117 @@ +from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '2' +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(3e5) +reanalyze_ratio = 0. +robot_num = 2 +human_num = 59 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_muzero_config = dict( + exp_name= + f'result/CrowdSim_muzeromd_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + env=dict( + obs_mode='1-dim-array', + env_name='CrowdSim-v0', + dataset = 'purdue', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + # robot_observation_shape=(robot_num, 4), + # human_observation_shape=(human_num, 4), + agent_num = robot_num, + observation_shape=(robot_num + human_num)*4, + obs_mode='1-dim-array', + robot_state_dim = 4, + human_state_dim = 4, + robot_num = robot_num, + human_num = human_num, + single_agent_action_size=len(one_uav_action_space), + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. + lstm_hidden_size=256, + latent_state_dim=256, + self_supervised_learning_loss=True, # NOTE: default is False. + discrete_action_encoding_type='one_hot', + res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + ssl_loss_weight=2, # NOTE: default is 0. + grad_clip_value=0.5, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. 
+ collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_muzero_config = EasyDict(CrowdSim_muzero_config) +main_config = CrowdSim_muzero_config + +CrowdSim_muzero_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.crowdsim_lightzero_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='muzero', + import_names=['lzero.policy.muzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_muzero_create_config = EasyDict(CrowdSim_muzero_create_config) +create_config = CrowdSim_muzero_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. + """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) From 61831f1c3c5b374f5d418666497272ababc1e43a Mon Sep 17 00:00:00 2001 From: nighood Date: Sat, 4 May 2024 01:56:31 +0800 Subject: [PATCH 07/16] feature(rjy): modify new env with transmitted data --- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 59 +++++++++++-------- zoo/CrowdSim/envs/Crowdsim/env/model/agent.py | 53 +++++++++++++++-- zoo/CrowdSim/envs/Crowdsim/env/model/utils.py | 2 +- 3 files changed, 83 insertions(+), 31 deletions(-) diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index fe058d8fd..f05567891 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -37,12 +37,13 @@ def __init__(self, dataset, custom_config=None): self.max_uav_energy = self.config.max_uav_energy # self.action_space = gym.spaces.Discrete(4**self.robot_num) # for each robot have 4 actions(up, down, left, right), then product self.action_space = gym.spaces.Discrete(len(self.config.one_uav_action_space)) - # human obs: [px, py, theta, aoi] + # human obs: [px, py, remaining_data_amount, aoi] # robot obs: [px, py, theta, energy] # self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(4), dtype=np.float32) self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(self.robot_num+self.human_num, 4), dtype=np.float32) # load_dataset + self.transmit_v = 5 # 5*0.3Mb/s self.nlon = self.config.nlon self.nlat = self.config.nlat self.lower_left = self.config.lower_left @@ -64,18 +65,20 @@ def __init__(self, dataset, custom_config=None): self.human_df['t'] = pd.to_datetime(self.human_df['timestamp'], unit='s') # 's' stands for second self.human_df['aoi'] = -1 # 加入aoi记录aoi + self.human_df['data_amount'] = -1 # record the remaining data amount of each human self.human_df['energy'] = -1 # 加入energy记录energy logging.info('human number: {}'.format(self.human_num)) logging.info('Robot number: {}'.format(self.robot_num)) # for debug - self.current_human_aoi_list = np.ones([self.human_num, ]) + self.current_human_aoi_list = np.zeros([self.human_num, ]) self.mean_aoi_timelist = np.ones([self.config.num_timestep + 1, ]) + 
self.cur_data_amount_timelist = np.zeros([self.human_num, ]) self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_x_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.update_human_timelist = np.zeros([self.config.num_timestep, ]) - self.data_collection = 0 + self.data_transmission = 0 def set_agent(self, agent): self.agent = agent @@ -83,8 +86,8 @@ def set_agent(self, agent): def generate_human(self, human_id, selected_data, selected_next_data): human = Human(human_id, self.config) px, py, theta = get_human_position_from_list(self.current_timestep, human_id, selected_data, selected_next_data, self.config) - # human obs: [px, py, theta, aoi] - human.set(px, py, theta, 1) # initial aoi of human is 1 + # human obs: [px, py, data_amount, aoi] + human.set(px, py, theta, 0, 0) # initial aoi of human is 0 return human def generate_robot(self, robot_id): @@ -93,7 +96,7 @@ def generate_robot(self, robot_id): robot.set(self.nlon / 2, self.nlat / 2, 0, self.max_uav_energy) # robot有energy return robot - def sync_human_df(self, human_id, current_timestep, aoi): + def sync_human_df(self, human_id, current_timestep, aoi, data_amount): """ Overview: Sync the human_df with the current timestep and aoi. @@ -107,6 +110,7 @@ def sync_human_df(self, human_id, current_timestep, aoi): (self.human_df.id == human_id) & (self.human_df.timestamp == current_timestamp)].index # self.human_df.loc[current_index, "aoi"] = aoi # slower self.human_df.iat[current_index.values[0], 9] = aoi # faster + # self.human_df.iat[current_index.values[0], 10] = data_amount def reset(self, phase='test', test_case=None): self.current_timestep = 0 @@ -114,17 +118,21 @@ def reset(self, phase='test', test_case=None): # generate human self.humans = [] selected_data, selected_next_data = get_human_position_list(self.current_timestep, self.human_df, self.config) + self.generate_data_amount_per_step = 0 + self.total_generated_data_amount = 0 for human_id in range(self.human_num): self.humans.append(self.generate_human(human_id, selected_data, selected_next_data)) - self.sync_human_df(human_id, self.current_timestep, 1) + self.generate_data_amount_per_step += self.humans[human_id].collect_v + self.sync_human_df(human_id, self.current_timestep, aoi=0, data_amount=0) # generate robot self.robots = [] for robot_id in range(self.robot_num): self.robots.append(self.generate_robot(robot_id)) - self.current_human_aoi_list = np.ones([self.human_num, ]) - self.mean_aoi_timelist = np.ones([self.config.num_timestep + 1, ]) + self.cur_data_amount_timelist = np.zeros([self.human_num, ]) + self.current_human_aoi_list = np.zeros([self.human_num, ]) + self.mean_aoi_timelist = np.zeros([self.config.num_timestep + 1, ]) self.mean_aoi_timelist[self.current_timestep] = np.mean(self.current_human_aoi_list) self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_energy_timelist[self.current_timestep, :] = self.max_uav_energy @@ -133,7 +141,7 @@ def reset(self, phase='test', test_case=None): self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_y_timelist[self.current_timestep, :] = self.nlat / 2 self.update_human_timelist = np.zeros([self.config.num_timestep, ]) - self.data_collection = 0 + self.data_transmission = 0 # for visualization self.plot_states = [] @@ -182,33 +190,31 @@ def step(self, action): robot.set(new_robot_px, new_robot_py, 
robot_theta, energy=new_energy) selected_data, selected_next_data = get_human_position_list(self.current_timestep + 1, self.human_df, self.config) - delta_human_aoi_list = np.zeros_like(self.current_human_aoi_list) # 0 means no update + human_transmit_data_list = np.zeros_like(self.cur_data_amount_timelist) # 0 means no update for human_id, human in enumerate(self.humans): next_px, next_py, next_theta = get_human_position_from_list(self.current_timestep + 1, human_id, selected_data, selected_next_data, self.config) should_reset = judge_aoi_update([next_px, next_py], new_robot_position, self.config) if should_reset: - # if the human is in the range of the robot, then reset the aoi of the human - if human.aoi > 1: - delta_human_aoi_list[human_id] = human.aoi - else: - delta_human_aoi_list[human_id] = 1 - - human.set(next_px, next_py, next_theta, aoi=1) + # if the human is in the range of the robot, then part of human's data will be transmitted + last_data_amount = human.data_amount + human.update(next_px, next_py, next_theta, transmitted_data=self.transmit_v) + human_transmit_data_list[human_id] = min(last_data_amount, self.transmit_v) num_updated_human += 1 else: - # if the human is not in the range of the robot, then update the aoi of the human - delta_human_aoi_list[human_id] = 0 - new_aoi = human.aoi + 1 - human.set(next_px, next_py, next_theta, aoi=new_aoi) + # if the human is not in the range of the robot, then no data will be transmitted, \ + # and update aoi and caculate new collected data amount + human_transmit_data_list[human_id] = 0 + human.update(next_px, next_py, next_theta, transmitted_data=0) self.current_human_aoi_list[human_id] = human.aoi - self.sync_human_df(human_id, self.current_timestep + 1, human.aoi) + self.sync_human_df(human_id, self.current_timestep + 1, human.aoi, human.data_amount) self.mean_aoi_timelist[self.current_timestep + 1] = np.mean(self.current_human_aoi_list) self.update_human_timelist[self.current_timestep] = num_updated_human - delta_sum_aoi = np.sum(delta_human_aoi_list) - self.data_collection += (delta_sum_aoi * 0.3) # Mb, 0.02M/s per person + delta_sum_transmit_data = np.sum(human_transmit_data_list) + self.data_transmission += (delta_sum_transmit_data * 0.3) # Mb, 0.02M/s per person + self.total_generated_data_amount += self.generate_data_amount_per_step # TODO: need to be well-defined reward = self.mean_aoi_timelist[self.current_timestep] - self.mean_aoi_timelist[self.current_timestep + 1] \ @@ -233,9 +239,10 @@ def step(self, action): info = { "performance_info": { "mean_aoi": self.mean_aoi_timelist[self.current_timestep], + "mean_transmit_data": delta_sum_transmit_data / self.human_num, "mean_energy_consumption": 1.0 - ( np.mean(self.robot_energy_timelist[self.current_timestep]) / self.max_uav_energy), - "collected_data_amount": self.data_collection/(self.num_timestep*self.human_num*0.3), + "transmitted_data_ratio": self.data_transmission/(self.total_generated_data_amount*0.3), "human_coverage": np.mean(self.update_human_timelist) / self.human_num }, } diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py index 5201c5b61..aabc184d2 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py @@ -1,4 +1,5 @@ import abc +import random import logging from zoo.CrowdSim.envs.Crowdsim.env.model.mdp import * @@ -25,25 +26,39 @@ def act(self, state, current_timestep): class Human(): + collect_v_prob = {1: 0.3, 2: 0.6} def __init__(self, id, config): 
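+        # ``collect_v`` is drawn once per human via ``random.choices`` with the relative weights in
+        # ``collect_v_prob`` (1 or 2 new data blocks generated per environment step).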
self.id = id self.config = config self.px = None self.py = None self.theta = None - self.aoi = None + self.aoi = 0 + self.data_queue = InformationQueue() + self.data_amount = 0 + self.collect_v = random.choices(list(self.collect_v_prob.keys()), list(self.collect_v_prob.values()))[0] - def set(self, px, py, theta, aoi): - self.px = px # position + def set(self, px, py, theta, aoi, data_amount): + self.px = px self.py = py self.theta = theta self.aoi = aoi + self.data_amount = data_amount + + def update(self, px, py, theta, transmitted_data): + self.px = px # position + self.py = py + self.theta = theta + self.data_queue.update(self.collect_v, transmitted_data) + self.aoi = self.data_queue.total_aoi() + self.data_amount = self.data_queue.total_blocks() # TODO: change state,可能需要归一化 def get_obs(self): + # obs: (px, py, remaining_data, aoi) return HumanState(self.px / self.config.nlon, self.py / self.config.nlat, - self.theta / self.config.rotation_limit, + self.data_amount / self.config.num_timestep, self.aoi / self.config.num_timestep) @@ -70,3 +85,33 @@ def get_obs(self): self.energy / self.config.max_uav_energy) +class InformationQueue: + def __init__(self): + # Initialize the queue to hold the age of each information block + self.queue = [] + + def update(self, arrivals, departures): + # Increase the age of information (aoi) for each block in the queue + self.queue = [age + 1 for age in self.queue] + + # Add new information blocks with aoi of 0 + self.queue.extend([0] * arrivals) + + # Remove the specified number of oldest information blocks + self.queue = self.queue[departures:] if departures <= len(self.queue) else [] + + def total_aoi(self): + # Return the total age of information in the queue + return sum(self.queue) + + def total_blocks(self): + # Return the total number of information blocks in the queue + return len(self.queue) + +# # Example of using the InformationQueue class +# info_queue = InformationQueue() +# info_queue.update(arrivals=5, departures=0) # 5 blocks enter the queue, all with aoi of 0 +# info_queue.update(arrivals=3, departures=2) # 3 new blocks enter, 2 blocks leave +# total_age = info_queue.total_aoi() # Calculate the total age of information in the queue + +# total_age diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py index ce903d9f8..e81ec842f 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py @@ -65,7 +65,7 @@ def get_human_position_from_list(selected_timestep, human_id, selected_data, sel def judge_aoi_update(human_position, robot_position, config): """ Overview: - Judge whether the AoI should be updated + Judge whether the AoI should be updated, i.e., the human is in the sensing range of the robot. Args: - human_position (:obj:`list`): The position of the human. - robot_position (:obj:`list`): The position of the robot. 
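The `InformationQueue` added above is the core of the new per-human bookkeeping: every environment step a human pushes `collect_v` fresh blocks (age 0) into its queue, every queued block ages by one, and a UAV in sensing range drains up to `transmit_v` of the oldest blocks; `human.aoi` is then the total age of the blocks still queued and `human.data_amount` is the queue length. Below is a minimal sketch of this behaviour using only the queue interface defined in this patch (the rates are illustrative values, not the configured CrowdSim ones):

    from zoo.CrowdSim.envs.Crowdsim.env.model.agent import InformationQueue

    q = InformationQueue()
    collect_v = 2    # illustrative per-step generation rate (the env samples 1 or 2 per human)
    transmit_v = 5   # illustrative per-step transmission budget when a UAV is in range

    # Three steps with no UAV in range: data piles up and every queued block ages.
    for _ in range(3):
        q.update(arrivals=collect_v, departures=0)
    print(q.total_blocks(), q.total_aoi())   # -> 6 6

    # One step with a UAV in range: the oldest blocks are transmitted first.
    q.update(arrivals=collect_v, departures=transmit_v)
    print(q.total_blocks(), q.total_aoi())   # -> 3 1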
From 9599faa62df9f98a92e67b36b85b54ac251de7a2 Mon Sep 17 00:00:00 2001
From: nighood
Date: Mon, 6 May 2024 01:00:05 +0800
Subject: [PATCH 08/16] feature(rjy): add rough vis of crowdsim

---
 zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 61 ++++++++++++++++---
 zoo/CrowdSim/envs/crowdsim_lightzero_env.py |  7 +++
 .../envs/test_crowdsim_lightzero_env.py     | 14 ++++-
 3 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py
index f05567891..b84a92ba9 100644
--- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py
+++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py
@@ -43,7 +43,7 @@ def __init__(self, dataset, custom_config=None):
         self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(self.robot_num+self.human_num, 4), dtype=np.float32)
 
         # load_dataset
-        self.transmit_v = 5  # 5*0.3Mb/s
+        self.transmit_v = 20  # 20*0.3Mb/s
         self.nlon = self.config.nlon
         self.nlat = self.config.nlat
         self.lower_left = self.config.lower_left
@@ -110,7 +110,7 @@ def sync_human_df(self, human_id, current_timestep, aoi, data_amount):
             (self.human_df.id == human_id) & (self.human_df.timestamp == current_timestamp)].index
         # self.human_df.loc[current_index, "aoi"] = aoi  # slower
         self.human_df.iat[current_index.values[0], 9] = aoi  # faster
-        # self.human_df.iat[current_index.values[0], 10] = data_amount
+        self.human_df.iat[current_index.values[0], 10] = data_amount
 
     def reset(self, phase='test', test_case=None):
         self.current_timestep = 0
@@ -249,11 +249,52 @@ def step(self, action):
 
         return next_state, reward, done, info
 
-    def render(self, mode='traj', output_file=None, plot_loop=False, moving_line=False):
-        # -------------------------------------------------------------------
-        if mode == 'html':
-            pass
-        elif mode == 'traj':
-            pass
-        else:
-            raise NotImplementedError
\ No newline at end of file
+    def render(self):
+        import matplotlib.pyplot as plt
+        import matplotlib.patches as patches
+        import io
+        import imageio
+
+        map_max_x = self.config.nlon
+        map_max_y = self.config.nlat
+        # create a new figure
+        fig, ax = plt.subplots(figsize=(8, 6))
+
+        # draw the historical trajectories of the robots
+        for timestep in range(len(self.robot_x_timelist)):
+            for robot_id in range(len(self.robot_x_timelist[timestep])):
+                ax.plot(self.robot_x_timelist[timestep][robot_id], self.robot_y_timelist[timestep][robot_id], color='gray', alpha=0.5)
+
+        # draw the current robot positions
+        for robot in self.robots:
+            ax.plot(robot.px, robot.py, marker='o', markersize=5, color='blue')
+
+        # draw the sensing range of each robot
+        for robot in self.robots:
+            robot_x, robot_y = robot.px, robot.py
+            circle = patches.Circle((robot_x, robot_y), self.config.sensing_range, edgecolor='blue', facecolor='none')
+            ax.add_patch(circle)
+
+        # draw the human positions and annotate their current AoI
+        for human in self.humans:
+            human_x, human_y, aoi = human.px, human.py, human.aoi
+            ax.plot(human_x, human_y, marker='x', markersize=5, color='red')
+            ax.text(human_x, human_y, str(aoi), fontsize=8, color='black')
+
+        # set the axis labels
+        # ax.set_xlim(0, map_max_x)
+        # ax.set_ylim(0, map_max_y)
+        ax.set_xlabel('X')
+        ax.set_ylabel('Y')
+
+        # leave some blank margin around the map
+        ax.margins(x=0.1, y=0.1)
+        ax.set_title('Crowd Simulation Visualization')
+
+        # draw the figure and convert it to an RGB array
+        fig.canvas.draw()
+        image = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+        image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+        plt.close()
+
+        return image
\ No newline at end of file
diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py
index e28a1baac..4aca9dd1a 100644
--- 
a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -79,6 +79,8 @@ def reset(self) -> np.ndarray: obs = np.concatenate((obs_list[0], obs_list[1]), axis=0).flatten() action_mask = np.ones(self.action_space.n, 'int8') obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} + if self._replay_path is not None: + self._frame = [] return obs @@ -119,6 +121,11 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: action_mask = np.ones(self.action_space.n, 'int8') obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} rew = to_ndarray([rew]).astype(np.float32) + if self._replay_path is not None: + self._frame.append(self._env.render()) + if done: + import imageio + imageio.mimsave(self._replay_path + '/replay.gif', self._frame) return BaseEnvTimestep(obs, rew, done, info) def enable_save_replay(self, replay_path: Optional[str] = None) -> None: diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py index b5721d0c3..0a5266143 100644 --- a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -57,15 +57,25 @@ def test_obs_1_dim_array(self): mcfg['obs_mode'] = '1-dim-array' env = CrowdSimEnv(mcfg) env.seed(314) + env.enable_save_replay('/home/nighoodRen/LightZero/result/test_replay') assert env._seed == 314 obs = env.reset() assert obs['observation'].shape == (244, ) - for i in range(10): + while True: random_action = env.random_action() timestep = env.step(random_action) print(timestep) assert timestep.obs['observation'].shape == (244, ) assert isinstance(timestep.done, bool) assert timestep.reward.shape == (1, ) + if timestep.done: + break print(env.observation_space, env.action_space, env.reward_space) - env.close() \ No newline at end of file + env.close() + + +if __name__ == '__main__': + test = TestCrowdSimEnv() + # test.test_obs_dict() + # test.test_obs_2_dim_array() + test.test_obs_1_dim_array() \ No newline at end of file From 15d9a44cda18f6cd02697ab8eb970c6676a60e23 Mon Sep 17 00:00:00 2001 From: nighood Date: Mon, 6 May 2024 01:21:12 +0800 Subject: [PATCH 09/16] polish(rjy): fix new env info in collecter --- lzero/worker/muzero_collector.py | 12 ++++++++---- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/lzero/worker/muzero_collector.py b/lzero/worker/muzero_collector.py index ae516960c..3c1a630dd 100644 --- a/lzero/worker/muzero_collector.py +++ b/lzero/worker/muzero_collector.py @@ -573,8 +573,9 @@ def collect(self, reward = timestep.info['eval_episode_return'] if timestep.info.get('performance_info') is not None: mean_aoi = timestep.info['performance_info']['mean_aoi'] + mean_transmit_data = timestep.info['performance_info']['mean_transmit_data'] mean_energy_consumption = timestep.info['performance_info']['mean_energy_consumption'] - collected_data_amount = timestep.info['performance_info']['collected_data_amount'] + transmitted_data_ratio = timestep.info['performance_info']['transmitted_data_ratio'] human_coverage = timestep.info['performance_info']['human_coverage'] info = { 'reward': reward, @@ -582,8 +583,9 @@ def collect(self, 'step': self._env_info[env_id]['step'], 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], 'mean_aoi': mean_aoi, + 'mean_transmit_data': mean_transmit_data, 'mean_energy_consumption': mean_energy_consumption, - 'collected_data_amount': collected_data_amount, + 
'transmitted_data_ratio': transmitted_data_ratio, 'human_coverage': human_coverage, } else: @@ -730,8 +732,9 @@ def _output_log(self, train_iter: int) -> None: if self._episode_info[0].get('mean_aoi') is not None: episode_aoi = [d['mean_aoi'] for d in self._episode_info] episode_energy_consumption = [d['mean_energy_consumption'] for d in self._episode_info] - episode_collected_data_amount = [d['collected_data_amount'] for d in self._episode_info] + episode_transmitted_data_ratio = [d['transmitted_data_ratio'] for d in self._episode_info] episode_human_coverage = [d['human_coverage'] for d in self._episode_info] + mean_transmit_data = [d['mean_transmit_data'] for d in self._episode_info] info = { 'episode_count': episode_count, 'envstep_count': envstep_count, @@ -748,8 +751,9 @@ def _output_log(self, train_iter: int) -> None: 'total_duration': self._total_duration, 'visit_entropy': np.mean(visit_entropy), 'episode_mean_aoi': np.mean(episode_aoi), + 'episode_mean_transmit_data': np.mean(mean_transmit_data), 'episode_mean_energy_consumption': np.mean(episode_energy_consumption), - 'episode_mean_collected_data_amount': np.mean(episode_collected_data_amount), + 'episode_mean_transmitted_data_ratio': np.mean(episode_transmitted_data_ratio), 'episode_mean_human_coverage': np.mean(episode_human_coverage), } else: diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index b84a92ba9..858bbe1a0 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -207,6 +207,7 @@ def step(self, action): human_transmit_data_list[human_id] = 0 human.update(next_px, next_py, next_theta, transmitted_data=0) + self.cur_data_amount_timelist[human_id] = human.data_amount self.current_human_aoi_list[human_id] = human.aoi self.sync_human_df(human_id, self.current_timestep + 1, human.aoi, human.data_amount) From 3c8804d65bb3c6ac8f434ef9f39b7c084281c20b Mon Sep 17 00:00:00 2001 From: nighood Date: Mon, 6 May 2024 21:40:03 +0800 Subject: [PATCH 10/16] feature(rjy): add sez mlp_multi-head --- lzero/agent/sampled_efficientzero.py | 3 + lzero/model/sampled_efficientzero_model_md.py | 547 ++++++++++++++++++ lzero/policy/sampled_efficientzero.py | 2 + zoo/CrowdSim/config/crowdsim_sez_md_config.py | 118 ++++ 4 files changed, 670 insertions(+) create mode 100644 lzero/model/sampled_efficientzero_model_md.py create mode 100644 zoo/CrowdSim/config/crowdsim_sez_md_config.py diff --git a/lzero/agent/sampled_efficientzero.py b/lzero/agent/sampled_efficientzero.py index 079bdd11d..ece54b784 100644 --- a/lzero/agent/sampled_efficientzero.py +++ b/lzero/agent/sampled_efficientzero.py @@ -110,6 +110,9 @@ def __init__( elif self.cfg.policy.model.model_type == 'conv': from lzero.model.sampled_efficientzero_model import SampledEfficientZeroModel model = SampledEfficientZeroModel(**self.cfg.policy.model) + elif self.cfg.policy.model.model_type == 'mlp_md': + from lzero.model.sampled_efficientzero_model_md import SampledEfficientZeroModelMD + model = SampledEfficientZeroModelMD(**self.cfg.policy.model) else: raise NotImplementedError if self.cfg.policy.cuda and torch.cuda.is_available(): diff --git a/lzero/model/sampled_efficientzero_model_md.py b/lzero/model/sampled_efficientzero_model_md.py new file mode 100644 index 000000000..99092de4a --- /dev/null +++ b/lzero/model/sampled_efficientzero_model_md.py @@ -0,0 +1,547 @@ +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from ding.model.common import 
ReparameterizationHead +from ding.model.common.head import MultiHead, DiscreteHead +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType + +from .common import EZNetworkOutput, RepresentationNetworkMLP +from .efficientzero_model_mlp import DynamicsNetworkMLP +from .utils import renormalize, get_params_mean + + +@MODEL_REGISTRY.register('SampledEfficientZeroModelMD') +class SampledEfficientZeroModelMD(nn.Module): + + def __init__( + self, + agent_num: int, + single_agent_action_size: int, + output_separate_logit: bool = False, + observation_shape: int = 2, + action_space_size: int = 6, + latent_state_dim: int = 256, + lstm_hidden_size: int = 512, + fc_reward_layers: SequenceType = [32], + fc_value_layers: SequenceType = [32], + fc_policy_layers: SequenceType = [32], + reward_support_size: int = 601, + value_support_size: int = 601, + proj_hid: int = 1024, + proj_out: int = 1024, + pred_hid: int = 512, + pred_out: int = 1024, + self_supervised_learning_loss: bool = True, + categorical_distribution: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + state_norm: bool = False, + # ============================================================== + # specific sampled related config + # ============================================================== + continuous_action_space: bool = False, + num_of_sampled_actions: int = 6, + sigma_type='conditioned', + fixed_sigma_value: float = 0.3, + bound_type: str = None, + norm_type: str = 'BN', + discrete_action_encoding_type: str = 'one_hot', + res_connection_in_dynamics: bool = False, + *args, + **kwargs, + ): + """ + Overview: + The definition of the network model of Sampled EfficientZero, which is a generalization version for 1D vector obs. + The networks are mainly built on fully connected layers. + Sampled EfficientZero model consists of a representation network, a dynamics network and a prediction network. + The representation network is an MLP network which maps the raw observation to a latent state. + The dynamics network is an MLP+LSTM network which predicts the next latent state, reward_hidden_state and value_prefix given the current latent state and action. + The prediction network is an MLP network which predicts the value and policy given the current latent state. + Arguments: + - observation_shape (:obj:`int`): Observation space shape, e.g. 8 for Lunarlander. + - action_space_size: (:obj:`int`): Action space size, which is an integer number. For discrete action space, it is the num of discrete actions, \ + e.g. 4 for Lunarlander. For continuous action space, it is the dimension of the continuous action, e.g. 4 for bipedalwalker. + - latent_state_dim (:obj:`int`): The dimension of latent state, such as 256. + - lstm_hidden_size (:obj:`int`): The hidden size of LSTM in dynamics network to predict value_prefix. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head). + - fc_policy_layers (:obj:`SequenceType`): The number of hidden layers used in policy head (MLP head). + - reward_support_size (:obj:`int`): The size of categorical reward output + - value_support_size (:obj:`int`): The size of categorical value output. + - proj_hid (:obj:`int`): The size of projection hidden layer. + - proj_out (:obj:`int`): The size of projection output layer. + - pred_hid (:obj:`int`): The size of prediction hidden layer. 
+ - pred_out (:obj:`int`): The size of prediction output layer. + - self_supervised_learning_loss (:obj:`bool`): Whether to use self_supervised_learning related networks in Sampled EfficientZero model, default set it to False. + - categorical_distribution (:obj:`bool`): Whether to use discrete support to represent categorical distribution for value, reward/value_prefix. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - state_norm (:obj:`bool`): Whether to use normalization for latent states, default sets it to True. + # ============================================================== + # specific sampled related config + # ============================================================== + - continuous_action_space (:obj:`bool`): The type of action space. default set it to False. + - num_of_sampled_actions (:obj:`int`): the number of sampled actions, i.e. the K in original Sampled MuZero paper. + # see ``ReparameterizationHead`` in ``ding.model.common.head`` for more details about the following arguments. + - sigma_type (:obj:`str`): the type of sigma in policy head of prediction network, options={'conditioned', 'fixed'}. + - fixed_sigma_value (:obj:`float`): the fixed sigma value in policy head of prediction network, + - bound_type (:obj:`str`): The type of bound in networks. Default sets it to None. + - norm_type (:obj:`str`): The type of normalization in networks. default set it to 'BN'. + - discrete_action_encoding_type (:obj:`str`): The type of encoding for discrete action. Default sets it to 'one_hot'. options = {'one_hot', 'not_one_hot'} + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection for dynamics network, default set it to False. + """ + super(SampledEfficientZeroModelMD, self).__init__() + if not categorical_distribution: + self.reward_support_size = 1 + self.value_support_size = 1 + else: + self.reward_support_size = reward_support_size + self.value_support_size = value_support_size + + self.continuous_action_space = continuous_action_space + self.observation_shape = observation_shape + self.action_space_size = action_space_size + # The dim of action space. For discrete action space, it is 1. + # For continuous action space, it is the dimension of continuous action. 
+ self.action_space_dim = action_space_size if self.continuous_action_space else 1 + assert discrete_action_encoding_type in ['one_hot', 'not_one_hot'], discrete_action_encoding_type + self.discrete_action_encoding_type = discrete_action_encoding_type + if self.continuous_action_space: + self.action_encoding_dim = action_space_size + else: + if self.discrete_action_encoding_type == 'one_hot': + self.action_encoding_dim = action_space_size + elif self.discrete_action_encoding_type == 'not_one_hot': + self.action_encoding_dim = 1 + + self.lstm_hidden_size = lstm_hidden_size + self.latent_state_dim = latent_state_dim + self.fc_reward_layers = fc_reward_layers + self.fc_value_layers = fc_value_layers + self.fc_policy_layers = fc_policy_layers + self.proj_hid = proj_hid + self.proj_out = proj_out + self.pred_hid = pred_hid + self.pred_out = pred_out + + self.last_linear_layer_init_zero = last_linear_layer_init_zero + self.state_norm = state_norm + self.self_supervised_learning_loss = self_supervised_learning_loss + + self.sigma_type = sigma_type + self.fixed_sigma_value = fixed_sigma_value + self.bound_type = bound_type + self.norm_type = norm_type + self.num_of_sampled_actions = num_of_sampled_actions + self.res_connection_in_dynamics = res_connection_in_dynamics + + self.representation_network = RepresentationNetworkMLP( + observation_shape=self.observation_shape, hidden_channels=self.latent_state_dim, norm_type=norm_type + ) + + self.dynamics_network = DynamicsNetworkMLP( + action_encoding_dim=self.action_encoding_dim, + num_channels=self.latent_state_dim + self.action_encoding_dim, + common_layer_num=2, + lstm_hidden_size=self.lstm_hidden_size, + fc_reward_layers=self.fc_reward_layers, + output_support_size=self.reward_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + res_connection_in_dynamics=self.res_connection_in_dynamics, + ) + + self.prediction_network = PredictionNetworkMD( + agent_num=agent_num, + single_agent_action_size=single_agent_action_size, + continuous_action_space=self.continuous_action_space, + action_space_size=self.action_space_size, + num_channels=self.latent_state_dim, + fc_value_layers=self.fc_value_layers, + output_support_size=self.value_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + sigma_type=self.sigma_type, + fixed_sigma_value=self.fixed_sigma_value, + bound_type=self.bound_type, + norm_type=self.norm_type, + output_separate_logit=output_separate_logit, + ) + + if self.self_supervised_learning_loss: + # self_supervised_learning_loss related network proposed in EfficientZero + self.projection_input_dim = latent_state_dim + self.projection = nn.Sequential( + nn.Linear(self.projection_input_dim, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_out), nn.BatchNorm1d(self.proj_out) + ) + self.prediction_head = nn.Sequential( + nn.Linear(self.proj_out, self.pred_hid), + nn.BatchNorm1d(self.pred_hid), + activation, + nn.Linear(self.pred_hid, self.pred_out), + ) + + def initial_inference(self, obs: torch.Tensor) -> EZNetworkOutput: + """ + Overview: + Initial inference of SampledEfficientZero model, which is the first step of the SampledEfficientZero model. + To perform the initial inference, we first use the representation network to obtain the "latent_state" of the observation. 
+ Then we use the prediction network to predict the "value" and "policy_logits" of the "latent_state", and + also prepare the zeros-like ``reward_hidden_state`` for the next step of the Sampled EfficientZero model. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns (EZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. \ + In initial inference, we set it to zero vector. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The hidden state of LSTM about reward. In initial inference, \ + we set it to the zeros-like hidden state (H and C). + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size. + """ + batch_size = obs.size(0) + obs = obs.to(torch.float32) + latent_state = self._representation(obs) + policy_logits, value = self._prediction(latent_state) + # zero initialization for reward hidden states + # (hn, cn), each element shape is (layer_num=1, batch_size, lstm_hidden_size) + reward_hidden_state = ( + torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device), torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device) + ) + return EZNetworkOutput(value, [0. for _ in range(batch_size)], policy_logits, latent_state, reward_hidden_state) + + def recurrent_inference( + self, latent_state: torch.Tensor, reward_hidden_state: torch.Tensor, action: torch.Tensor + ) -> EZNetworkOutput: + """ + Overview: + Recurrent inference of Sampled EfficientZero model, which is the rollout step of the Sampled EfficientZero model. + To perform the recurrent inference, we first use the dynamics network to predict ``next_latent_state``, + ``reward_hidden_state``, ``value_prefix`` by the given current ``latent_state`` and ``action``. + We then use the prediction network to predict the ``value`` and ``policy_logits``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns (EZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - next_latent_state (:obj:`torch.Tensor`): The predicted next latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + Shapes: + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. 
+            - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size.
+            - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size.
+            - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size.
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size.
+        """
+        next_latent_state, reward_hidden_state, value_prefix = self._dynamics(latent_state, reward_hidden_state, action)
+        policy_logits, value = self._prediction(next_latent_state)
+        return EZNetworkOutput(value, value_prefix, policy_logits, next_latent_state, reward_hidden_state)
+
+    def _representation(self, observation: torch.Tensor) -> Tuple[torch.Tensor]:
+        """
+        Overview:
+            Use the representation network to encode the observations into latent state.
+        Arguments:
+            - obs (:obj:`torch.Tensor`): The 1D vector observation data.
+        Returns:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+        Shapes:
+            - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size.
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+        """
+        latent_state = self.representation_network(observation)
+        if self.state_norm:
+            latent_state = renormalize(latent_state)
+        return latent_state
+
+    def _prediction(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Overview:
+            Use the prediction network to predict ``policy`` and ``value`` for the given latent state.
+        Arguments:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+        Returns:
+            - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action.
+            - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation.
+        Shapes:
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size.
+            - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size.
+        """
+        policy, value = self.prediction_network(latent_state)
+        return policy, value
+
+    def _dynamics(self, latent_state: torch.Tensor, reward_hidden_state: Tuple,
+                  action: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor], torch.Tensor]:
+        """
+        Overview:
+            Concatenate ``latent_state`` and ``action`` and use the dynamics network to predict ``next_latent_state``,
+            ``value_prefix`` and ``next_reward_hidden_state``.
+        Arguments:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+            - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward.
+            - action (:obj:`torch.Tensor`): The predicted action to rollout.
+        Returns:
+            - next_latent_state (:obj:`torch.Tensor`): The predicted latent state of the next timestep.
+            - next_reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward.
+            - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state.
+        Shapes:
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+ - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + """ + # NOTE: the discrete action encoding type is important for some environments + + if not self.continuous_action_space: + # discrete action space + if self.discrete_action_encoding_type == 'one_hot': + # Stack latent_state with the one hot encoded action + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + + # transform action to one-hot encoding. + # action_one_hot shape: (batch_size, action_space_size), e.g., (8, 4) + action_one_hot = torch.zeros(action.shape[0], self.action_space_size, device=action.device) + # transform action to torch.int64 + action = action.long() + action_one_hot.scatter_(1, action, 1) + action_encoding = action_one_hot + elif self.discrete_action_encoding_type == 'not_one_hot': + action_encoding = action / self.action_space_size + if len(action_encoding.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action_encoding = action_encoding.unsqueeze(-1) + else: + # continuous action space + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + elif len(action.shape) == 3: + # (batch_size, action_dim, 1) -> (batch_size, action_dim) + # e.g., torch.Size([8, 2, 1]) -> torch.Size([8, 2]) + action = action.squeeze(-1) + + action_encoding = action + + action_encoding = action_encoding.to(latent_state.device).float() + # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or + # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type. + state_action_encoding = torch.cat((latent_state, action_encoding), dim=1) + + next_latent_state, next_reward_hidden_state, value_prefix = self.dynamics_network( + state_action_encoding, reward_hidden_state + ) + + if not self.state_norm: + return next_latent_state, next_reward_hidden_state, value_prefix + else: + next_latent_state_normalized = renormalize(next_latent_state) + return next_latent_state_normalized, next_reward_hidden_state, value_prefix + + def project(self, latent_state: torch.Tensor, with_grad=True) -> torch.Tensor: + """ + Overview: + Project the latent state to a lower dimension to calculate the self-supervised loss, which is proposed in EfficientZero. + For more details, please refer to the paper ``Exploring Simple Siamese Representation Learning``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - with_grad (:obj:`bool`): Whether to calculate gradient for the projection result. + Returns: + - proj (:obj:`torch.Tensor`): The result embedding vector of projection operation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - proj (:obj:`torch.Tensor`): :math:`(B, projection_output_dim)`, where B is batch_size. 
+ + Examples: + >>> latent_state = torch.randn(256, 64) + >>> output = self.project(latent_state) + >>> output.shape # (256, 1024) + """ + proj = self.projection(latent_state) + + if with_grad: + # with grad, use prediction_head + return self.prediction_head(proj) + else: + return proj.detach() + + def get_params_mean(self): + return get_params_mean(self) + + +class PredictionNetworkMD(nn.Module): + + def __init__( + self, + agent_num, + single_agent_action_size, + continuous_action_space, + action_space_size, + num_channels, + common_layer_num: int = 2, + fc_value_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + # ============================================================== + # specific sampled related config + # ============================================================== + sigma_type='conditioned', + fixed_sigma_value: float = 0.3, + bound_type: str = None, + norm_type: str = 'BN', + output_separate_logit: bool = False, + ): + """ + Overview: + The definition of policy and value prediction network, which is used to predict value and policy by the + given latent state. + The networks are mainly built on fully connected layers. + Arguments: + - agent_num (:obj:`int`): The number of agents in the environment. + - single_agent_action_size (:obj:`int`): The number of actions for each agent. + - continuous_action_space (:obj:`bool`): The type of action space. default set it to False. + - action_space_size: (:obj:`int`): Action space size, usually an integer number. For discrete action \ + space, it is the number of discrete actions. For continuous action space, it is the dimension of \ + continuous action. + - num_channels (:obj:`int`): The num of channels in latent states. + - num_res_blocks (:obj:`int`): The number of res blocks. + - fc_value_layers (:obj:`SequenceType`): hidden layers of the value prediction head (MLP head). + - output_support_size (:obj:`int`): dim of value output. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - output_separate_logit (:obj:`bool`): Whether to output separate logit for each action. + # ============================================================== + # specific sampled related config + # ============================================================== + # see ``ReparameterizationHead`` in ``ding.model.common.head`` for more details about thee following arguments. + - sigma_type (:obj:`str`): the type of sigma in policy head of prediction network, options={'conditioned', 'fixed'}. + - fixed_sigma_value (:obj:`float`): the fixed sigma value in policy head of prediction network, + - bound_type (:obj:`str`): The type of bound in networks. default set it to None. + - norm_type (:obj:`str`): The type of normalization in networks. default set it to 'BN'. 
+ """ + super().__init__() + self.num_channels = num_channels + self.continuous_action_space = continuous_action_space + self.norm_type = norm_type + self.sigma_type = sigma_type + self.fixed_sigma_value = fixed_sigma_value + self.bound_type = bound_type + self.action_space_size = action_space_size + if self.continuous_action_space: + self.action_encoding_dim = self.action_space_size + else: + self.action_encoding_dim = 1 + + # ******* common backbone ****** + self.fc_prediction_common = MLP( + in_channels=self.num_channels, + hidden_channels=self.num_channels, + out_channels=self.num_channels, + layer_num=common_layer_num, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + # ******* value and policy head ****** + self.fc_value_head = MLP( + in_channels=self.num_channels, + hidden_channels=fc_value_layers[0], + out_channels=output_support_size, + layer_num=2, + activation=activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + # last_linear_layer_init_zero=True is beneficial for convergence speed. + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + # sampled related core code + if self.continuous_action_space: + self.policy_multi_head = MultiHead( + head_cls=ReparameterizationHead, + hidden_size=self.num_channels, + output_size_list=[single_agent_action_size for _ in range(agent_num)], + layer_num=2, + sigma_type=self.sigma_type, + fixed_sigma_value=self.fixed_sigma_value, + activation=nn.ReLU(), + norm_type=None, + bound_type=self.bound_type + ) + else: + self.policy_multi_head = MultiHead( + head_cls=DiscreteHead, + hidden_size=self.num_channels, + output_size_list=[single_agent_action_size for _ in range(agent_num)], + ) + self.output_separate_logit = output_separate_logit + + def forward(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Forward computation of the prediction network. + Arguments: + - latent_state (:obj:`torch.Tensor`): input tensor with shape (B, in_channels). + Returns: + - policy (:obj:`torch.Tensor`): policy tensor. If action space is discrete, shape is (B, action_space_size). + If action space is continuous, shape is (B, action_space_size * 2). + - value (:obj:`torch.Tensor`): value tensor with shape (B, output_support_size). + """ + x_prediction_common = self.fc_prediction_common(latent_state) + value = self.fc_value_head(x_prediction_common) + + # sampled related core code + if not self.continuous_action_space: + # policy_list: {'logit': [policy1, policy2, ...],} + # policyi shape: (B, action_space_size) + policy_list = self.policy_multi_head(x_prediction_common)['logit'] + if not self.output_separate_logit: + # The joint action space policy is the product of each agent policy + # policy shape: (B, action_space_size^^agent_num) + batch_size = latent_state.size(0) + joint_logits_batches = [] + for i in range(batch_size): + current_batch = [policy[i] for policy in policy_list] + cartesian_prod_result = torch.cartesian_prod(*current_batch) + joint_logits = cartesian_prod_result.prod(dim=1) + joint_logits_batches.append(joint_logits) + policy = torch.stack(joint_logits_batches) + else: + # policy_list: [policy1, policy2, ...] 
+ # policy sahpe: (B, agent_num, action_space_size) + policy = torch.stack(policy_list, dim=1) + elif self.continuous_action_space: + # policy = torch.cat([policy['mu'], policy['sigma']], dim=-1) + # TODO(rjy): complete the continuous action space policy + pass + + return policy, value diff --git a/lzero/policy/sampled_efficientzero.py b/lzero/policy/sampled_efficientzero.py index 7003f6808..687fe631b 100644 --- a/lzero/policy/sampled_efficientzero.py +++ b/lzero/policy/sampled_efficientzero.py @@ -234,6 +234,8 @@ def default_model(self) -> Tuple[str, List[str]]: return 'SampledEfficientZeroModel', ['lzero.model.sampled_efficientzero_model'] elif self._cfg.model.model_type == "mlp": return 'SampledEfficientZeroModelMLP', ['lzero.model.sampled_efficientzero_model_mlp'] + elif self._cfg.model.model_type == "mlp_md": + return 'SampledEfficientZeroModelMD', ['lzero.model.sampled_efficientzero_model_md'] else: raise ValueError("model type {} is not supported".format(self._cfg.model.model_type)) diff --git a/zoo/CrowdSim/config/crowdsim_sez_md_config.py b/zoo/CrowdSim/config/crowdsim_sez_md_config.py new file mode 100644 index 000000000..08ace6a98 --- /dev/null +++ b/zoo/CrowdSim/config/crowdsim_sez_md_config.py @@ -0,0 +1,118 @@ +from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '0' +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(3e5) +reanalyze_ratio = 0. +robot_num = 2 +human_num = 59 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +K = 10 +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_sez_config = dict( + exp_name= + f'result/old_env/new_CrowdSim_sez_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + env=dict( + obs_mode='1-dim-array', + env_name='CrowdSim-v0', + dataset = 'purdue', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + agent_num = robot_num, + observation_shape=(robot_num + human_num)*4, + obs_mode='1-dim-array', + robot_state_dim = 4, + human_state_dim = 4, + robot_num = robot_num, + human_num = human_num, + single_agent_action_size=len(one_uav_action_space), + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. + continuous_action_space=False, + num_of_sampled_actions=K, + lstm_hidden_size=128, + latent_state_dim=128, + self_supervised_learning_loss=True, # NOTE: default is False. 
+ discrete_action_encoding_type='one_hot', + res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + ssl_loss_weight=2, # NOTE: default is 0. + grad_clip_value=0.5, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_sez_config = EasyDict(CrowdSim_sez_config) +main_config = CrowdSim_sez_config + +CrowdSim_sez_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.crowdsim_lightzero_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='sampled_efficientzero', + import_names=['lzero.policy.sampled_efficientzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_sez_create_config = EasyDict(CrowdSim_sez_create_config) +create_config = CrowdSim_sez_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. 
+ """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) From c6723a013a4b74938f2a294310fc06d1206c776d Mon Sep 17 00:00:00 2001 From: nighood Date: Tue, 7 May 2024 00:01:54 +0800 Subject: [PATCH 11/16] feature(rjy): set the environment to two modes --- .../config/crowdsim_muzero_md_config.py | 5 +- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 48 ++++++++++++++----- .../envs/test_crowdsim_lightzero_env.py | 1 + 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py index ec4453ff8..c0b3b1018 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py @@ -1,6 +1,6 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '2' +os.environ["CUDA_VISIBLE_DEVICES"] = '1' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -23,8 +23,9 @@ CrowdSim_muzero_config = dict( exp_name= - f'result/CrowdSim_muzeromd_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + f'result/new_env/new_CrowdSim_muzeromd_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( + env_mode = 'easy', obs_mode='1-dim-array', env_name='CrowdSim-v0', dataset = 'purdue', diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index 858bbe1a0..ba855b35d 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -29,6 +29,7 @@ def __init__(self, dataset, custom_config=None): self.config = get_selected_config(dataset) self.config.update(custom_config) + self.env_mode = self.config.env_mode # 'easy' or 'hard' self.human_num = self.config.human_num self.robot_num = self.config.robot_num self.num_timestep = self.config.num_timestep # max timestep @@ -67,6 +68,7 @@ def __init__(self, dataset, custom_config=None): self.human_df['aoi'] = -1 # 加入aoi记录aoi self.human_df['data_amount'] = -1 # record the remaining data amount of each human self.human_df['energy'] = -1 # 加入energy记录energy + logging.info('Env mode:', self.env_mode) logging.info('human number: {}'.format(self.human_num)) logging.info('Robot number: {}'.format(self.robot_num)) @@ -195,17 +197,36 @@ def step(self, action): next_px, next_py, next_theta = get_human_position_from_list(self.current_timestep + 1, human_id, selected_data, selected_next_data, self.config) should_reset = judge_aoi_update([next_px, next_py], new_robot_position, self.config) - if should_reset: - # if the human is in the range of the robot, then part of human's data will be transmitted - last_data_amount = human.data_amount - human.update(next_px, next_py, next_theta, transmitted_data=self.transmit_v) - human_transmit_data_list[human_id] = min(last_data_amount, self.transmit_v) - num_updated_human += 1 + if self.env_mode == 'easy': + if should_reset: + # if the human is in the range of the robot, then part of human's data will be transmitted + if human.aoi > 1: + human_transmit_data_list[human_id] = human.aoi + else: + human_transmit_data_list[human_id] = 1 + + human.set(next_px, next_py, next_theta, aoi=1, data_amount=human.aoi) + num_updated_human += 1 + else: + # if the human is not in the range of the robot, then update the aoi of the human + 
human_transmit_data_list[human_id] = 0 + new_aoi = human.aoi + 1 + human.set(next_px, next_py, next_theta, aoi=new_aoi, data_amount=human.aoi) + + elif self.env_mode == 'hard': + if should_reset: + # if the human is in the range of the robot, then part of human's data will be transmitted + last_data_amount = human.data_amount + human.update(next_px, next_py, next_theta, transmitted_data=self.transmit_v) + human_transmit_data_list[human_id] = min(last_data_amount, self.transmit_v) + num_updated_human += 1 + else: + # if the human is not in the range of the robot, then no data will be transmitted, \ + # and update aoi and caculate new collected data amount + human_transmit_data_list[human_id] = 0 + human.update(next_px, next_py, next_theta, transmitted_data=0) else: - # if the human is not in the range of the robot, then no data will be transmitted, \ - # and update aoi and caculate new collected data amount - human_transmit_data_list[human_id] = 0 - human.update(next_px, next_py, next_theta, transmitted_data=0) + raise ValueError("env_mode should be 'easy' or 'hard'") self.cur_data_amount_timelist[human_id] = human.data_amount self.current_human_aoi_list[human_id] = human.aoi @@ -215,7 +236,12 @@ def step(self, action): self.update_human_timelist[self.current_timestep] = num_updated_human delta_sum_transmit_data = np.sum(human_transmit_data_list) self.data_transmission += (delta_sum_transmit_data * 0.3) # Mb, 0.02M/s per person - self.total_generated_data_amount += self.generate_data_amount_per_step + if self.env_mode == 'easy': + # in easy mode, the data amount generated per step is equal to the number of humans + self.total_generated_data_amount = self.num_timestep*self.human_num + elif self.env_mode == 'hard': + # in hard mode, the data amount generated per step is equal to the sum of the data amount of all humans + self.total_generated_data_amount += self.generate_data_amount_per_step # TODO: need to be well-defined reward = self.mean_aoi_timelist[self.current_timestep] - self.mean_aoi_timelist[self.current_timestep + 1] \ diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py index 0a5266143..09f36d6b7 100644 --- a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -10,6 +10,7 @@ human_num = 59, # purdue one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]], obs_mode = '2-dim-array', + env_mode = 'easy', ) @ pytest.mark.envtest From c4e9d584ef8ac5f0adc11c7c7c66c2706eb9a442 Mon Sep 17 00:00:00 2001 From: nighood Date: Tue, 7 May 2024 16:46:37 +0800 Subject: [PATCH 12/16] feature(rjy): add ez multi-head model --- lzero/agent/efficientzero.py | 3 + lzero/model/efficientzero_model_md.py | 479 ++++++++++++++++++ lzero/policy/efficientzero.py | 2 + .../crowdsim_efficientzero_md_config.py | 114 +++++ 4 files changed, 598 insertions(+) create mode 100644 lzero/model/efficientzero_model_md.py create mode 100644 zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py diff --git a/lzero/agent/efficientzero.py b/lzero/agent/efficientzero.py index 421cea881..f06cd844b 100644 --- a/lzero/agent/efficientzero.py +++ b/lzero/agent/efficientzero.py @@ -110,6 +110,9 @@ def __init__( elif self.cfg.policy.model.model_type == 'conv': from lzero.model.efficientzero_model import EfficientZeroModel model = EfficientZeroModel(**self.cfg.policy.model) + elif self.cfg.policy.model.model_type == 'mlp_md': + from lzero.model.efficientzero_model_md import EfficientZeroModelMD + model = 
EfficientZeroModelMD(**self.cfg.policy.model) else: raise NotImplementedError if self.cfg.policy.cuda and torch.cuda.is_available(): diff --git a/lzero/model/efficientzero_model_md.py b/lzero/model/efficientzero_model_md.py new file mode 100644 index 000000000..d520995d8 --- /dev/null +++ b/lzero/model/efficientzero_model_md.py @@ -0,0 +1,479 @@ +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from ding.torch_utils import MLP +from ding.utils import MODEL_REGISTRY, SequenceType +from numpy import ndarray + +from .common import EZNetworkOutput, RepresentationNetworkMLP +from .muzero_model_md import PredictionNetworkMD +from .utils import renormalize, get_params_mean, get_dynamic_mean, get_reward_mean + + +@MODEL_REGISTRY.register('EfficientZeroModelMD') +class EfficientZeroModelMD(nn.Module): + + def __init__( + self, + agent_num: int, + output_separate_logit: bool = False, + observation_shape: int = 2, + single_agent_action_size: int = 5, + action_space_size: int = 6, + lstm_hidden_size: int = 512, + latent_state_dim: int = 256, + fc_reward_layers: SequenceType = [32], + fc_value_layers: SequenceType = [32], + reward_support_size: int = 601, + value_support_size: int = 601, + proj_hid: int = 1024, + proj_out: int = 1024, + pred_hid: int = 512, + pred_out: int = 1024, + self_supervised_learning_loss: bool = True, + categorical_distribution: bool = True, + last_linear_layer_init_zero: bool = True, + state_norm: bool = False, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + discrete_action_encoding_type: str = 'one_hot', + res_connection_in_dynamics: bool = False, + *args, + **kwargs, + ): + """ + Overview: + The definition of the network model of EfficientZero, which is a generalization version for 1D vector obs. + The networks are mainly built on fully connected layers. + Sampled EfficientZero model consists of a representation network, a dynamics network and a prediction network. + The representation network is an MLP network which maps the raw observation to a latent state. + The dynamics network is an MLP+LSTM network which predicts the next latent state, reward_hidden_state and value_prefix given the current latent state and action. + The prediction network is an MLP network which predicts the value and policy given the current latent state. + Arguments: + - observation_shape (:obj:`int`): Observation space shape, e.g. 8 for Lunarlander. + - action_space_size: (:obj:`int`): Action space size, e.g. 4 for Lunarlander. + - lstm_hidden_size (:obj:`int`): The hidden size of LSTM in dynamics network to predict value_prefix. + - latent_state_dim (:obj:`int`): The dimension of latent state, such as 256. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - fc_value_layers (:obj:`SequenceType`): The number of hidden layers used in value head (MLP head). + - fc_policy_layers (:obj:`SequenceType`): The number of hidden layers used in policy head (MLP head). + - reward_support_size (:obj:`int`): The size of categorical reward output + - value_support_size (:obj:`int`): The size of categorical value output. + - proj_hid (:obj:`int`): The size of projection hidden layer. + - proj_out (:obj:`int`): The size of projection output layer. + - pred_hid (:obj:`int`): The size of prediction hidden layer. + - pred_out (:obj:`int`): The size of prediction output layer. 
+ - self_supervised_learning_loss (:obj:`bool`): Whether to use self_supervised_learning related networks in Sampled EfficientZero model, default set it to False. + - categorical_distribution (:obj:`bool`): Whether to use discrete support to represent categorical distribution for value, reward/value_prefix. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializations for the last layer of value/policy mlp, default sets it to True. + - state_norm (:obj:`bool`): Whether to use normalization for latent states, default sets it to True. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - discrete_action_encoding_type (:obj:`str`): The type of encoding for discrete action. Default sets it to 'one_hot'. options = {'one_hot', 'not_one_hot'} + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection for dynamics network, default set it to False. + """ + super(EfficientZeroModelMD, self).__init__() + if not categorical_distribution: + self.reward_support_size = 1 + self.value_support_size = 1 + else: + self.reward_support_size = reward_support_size + self.value_support_size = value_support_size + + self.action_space_size = action_space_size + self.continuous_action_space = False + # The dim of action space. For discrete action space, it is 1. + # For continuous action space, it is the dimension of continuous action. + self.action_space_dim = action_space_size if self.continuous_action_space else 1 + assert discrete_action_encoding_type in ['one_hot', 'not_one_hot'], discrete_action_encoding_type + self.discrete_action_encoding_type = discrete_action_encoding_type + if self.continuous_action_space: + self.action_encoding_dim = action_space_size + else: + if self.discrete_action_encoding_type == 'one_hot': + self.action_encoding_dim = action_space_size + elif self.discrete_action_encoding_type == 'not_one_hot': + self.action_encoding_dim = 1 + + self.lstm_hidden_size = lstm_hidden_size + self.proj_hid = proj_hid + self.proj_out = proj_out + self.pred_hid = pred_hid + self.pred_out = pred_out + self.self_supervised_learning_loss = self_supervised_learning_loss + self.last_linear_layer_init_zero = last_linear_layer_init_zero + self.state_norm = state_norm + self.res_connection_in_dynamics = res_connection_in_dynamics + + self.representation_network = RepresentationNetworkMLP( + observation_shape=observation_shape, hidden_channels=latent_state_dim, norm_type=norm_type + ) + + self.dynamics_network = DynamicsNetworkMLP( + action_encoding_dim=self.action_encoding_dim, + num_channels=latent_state_dim + self.action_encoding_dim, + common_layer_num=2, + lstm_hidden_size=lstm_hidden_size, + fc_reward_layers=fc_reward_layers, + output_support_size=self.reward_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + res_connection_in_dynamics=self.res_connection_in_dynamics, + ) + + self.prediction_network = PredictionNetworkMD( + agent_num=agent_num, + single_agent_action_size=single_agent_action_size, + num_channels=latent_state_dim, + fc_value_layers=fc_value_layers, + output_support_size=self.value_support_size, + last_linear_layer_init_zero=self.last_linear_layer_init_zero, + norm_type=norm_type, + output_separate_logit=output_separate_logit, + ) + + if self.self_supervised_learning_loss: + # self_supervised_learning_loss 
related network proposed in EfficientZero + self.projection_input_dim = latent_state_dim + + self.projection = nn.Sequential( + nn.Linear(self.projection_input_dim, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_hid), nn.BatchNorm1d(self.proj_hid), activation, + nn.Linear(self.proj_hid, self.proj_out), nn.BatchNorm1d(self.proj_out) + ) + self.prediction_head = nn.Sequential( + nn.Linear(self.proj_out, self.pred_hid), + nn.BatchNorm1d(self.pred_hid), + activation, + nn.Linear(self.pred_hid, self.pred_out), + ) + + def initial_inference(self, obs: torch.Tensor) -> EZNetworkOutput: + """ + Overview: + Initial inference of EfficientZero model, which is the first step of the EfficientZero model. + To perform the initial inference, we first use the representation network to obtain the "latent_state" of the observation. + Then we use the prediction network to predict the "value" and "policy_logits" of the "latent_state", and + also prepare the zeros-like ``reward_hidden_state`` for the next step of the EfficientZero model. + Arguments: + - obs (:obj:`torch.Tensor`): The 1D vector observation data. + Returns (EZNetworkOutput): + - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. \ + In initial inference, we set it to zero vector. + - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action. + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The hidden state of LSTM about reward. In initial inference, \ + we set it to the zeros-like hidden state (H and C). + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size. + - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size. + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size. + """ + batch_size = obs.size(0) + latent_state = self._representation(obs) + policy_logits, value = self._prediction(latent_state) + # zero initialization for reward hidden states + # (hn, cn), each element shape is (layer_num=1, batch_size, lstm_hidden_size) + reward_hidden_state = ( + torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device), torch.zeros(1, batch_size, + self.lstm_hidden_size).to(obs.device) + ) + return EZNetworkOutput(value, [0. for _ in range(batch_size)], policy_logits, latent_state, reward_hidden_state) + + def recurrent_inference( + self, latent_state: torch.Tensor, reward_hidden_state: torch.Tensor, action: torch.Tensor + ) -> EZNetworkOutput: + """ + Overview: + Recurrent inference of EfficientZero model, which is the rollout step of the EfficientZero model. + To perform the recurrent inference, we first use the dynamics network to predict ``next_latent_state``, + ``reward_hidden_state``, ``value_prefix`` by the given current ``latent_state`` and ``action``. + We then use the prediction network to predict the ``value`` and ``policy_logits``. 
+        Arguments:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+            - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward.
+            - action (:obj:`torch.Tensor`): The predicted action to rollout.
+        Returns (EZNetworkOutput):
+            - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation.
+            - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state.
+            - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action.
+            - next_latent_state (:obj:`torch.Tensor`): The predicted next latent state.
+            - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward.
+        Shapes:
+            - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size.
+            - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size.
+            - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size.
+            - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size.
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The shape of each element is :math:`(1, B, lstm_hidden_size)`, where B is batch_size.
+        """
+        next_latent_state, reward_hidden_state, value_prefix = self._dynamics(latent_state, reward_hidden_state, action)
+        policy_logits, value = self._prediction(next_latent_state)
+        return EZNetworkOutput(value, value_prefix, policy_logits, next_latent_state, reward_hidden_state)
+
+    def _representation(self, observation: torch.Tensor) -> Tuple[torch.Tensor]:
+        """
+        Overview:
+            Use the representation network to encode the observations into latent state.
+        Arguments:
+            - obs (:obj:`torch.Tensor`): The 1D vector observation data.
+        Returns:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+        Shapes:
+            - obs (:obj:`torch.Tensor`): :math:`(B, obs_shape)`, where B is batch_size.
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+        """
+        observation = observation.float()
+        latent_state = self.representation_network(observation)
+        if self.state_norm:
+            latent_state = renormalize(latent_state)
+        return latent_state
+
+    def _prediction(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor]:
+        """
+        Overview:
+            Use the prediction network to predict the value and policy given the latent state.
+        Arguments:
+            - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state.
+        Returns:
+            - policy_logits (:obj:`torch.Tensor`): The output logit to select discrete action.
+            - value (:obj:`torch.Tensor`): The output value of input state to help policy improvement and evaluation.
+        Shapes:
+            - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state.
+            - policy_logits (:obj:`torch.Tensor`): :math:`(B, action_dim)`, where B is batch_size.
+            - value (:obj:`torch.Tensor`): :math:`(B, value_support_size)`, where B is batch_size.
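+        Examples:
+            >>> # Illustrative shape check; the sizes below are assumed values
+            >>> # (latent_state_dim=128, value_support_size=601, a joint action space of
+            >>> # 25, i.e. 2 UAVs x 5 actions, with output_separate_logit=False).
+            >>> latent_state = torch.randn(8, 128)
+            >>> policy_logits, value = self._prediction(latent_state)
+            >>> policy_logits.shape, value.shape  # (torch.Size([8, 25]), torch.Size([8, 601]))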
+ """ + policy_logits, value = self.prediction_network(latent_state) + return policy_logits, value + + def _dynamics(self, latent_state: torch.Tensor, reward_hidden_state: Tuple, + action: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor], torch.Tensor]: + """ + Overview: + Concatenate ``latent_state`` and ``action`` and use the dynamics network to predict ``next_latent_state`` + ``value_prefix`` and ``next_reward_hidden_state``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. + - reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The input hidden state of LSTM about reward. + - action (:obj:`torch.Tensor`): The predicted action to rollout. + Returns: + - next_latent_state (:obj:`torch.Tensor`): The predicted latent state of the next timestep. + - next_reward_hidden_state (:obj:`Tuple[torch.Tensor]`): The output hidden state of LSTM about reward. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - action (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch_size. + - next_latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - value_prefix (:obj:`torch.Tensor`): :math:`(B, reward_support_size)`, where B is batch_size. + """ + # NOTE: the discrete action encoding type is important for some environments + + # discrete action space + if self.discrete_action_encoding_type == 'one_hot': + # Stack latent_state with the one hot encoded action + if len(action.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action = action.unsqueeze(-1) + + # transform action to one-hot encoding. + # action_one_hot shape: (batch_size, action_space_size), e.g., (8, 4) + action_one_hot = torch.zeros(action.shape[0], self.action_space_size, device=action.device) + # transform action to torch.int64 + action = action.long() + action_one_hot.scatter_(1, action, 1) + action_encoding = action_one_hot + elif self.discrete_action_encoding_type == 'not_one_hot': + action_encoding = action / self.action_space_size + if len(action_encoding.shape) == 1: + # (batch_size, ) -> (batch_size, 1) + # e.g., torch.Size([8]) -> torch.Size([8, 1]) + action_encoding = action_encoding.unsqueeze(-1) + + action_encoding = action_encoding.to(latent_state.device).float() + # state_action_encoding shape: (batch_size, latent_state[1] + action_dim]) or + # (batch_size, latent_state[1] + action_space_size]) depending on the discrete_action_encoding_type. + state_action_encoding = torch.cat((latent_state, action_encoding), dim=1) + + # NOTE: the key difference with MuZero + next_latent_state, next_reward_hidden_state, value_prefix = self.dynamics_network( + state_action_encoding, reward_hidden_state + ) + + if self.state_norm: + next_latent_state = renormalize(next_latent_state) + return next_latent_state, next_reward_hidden_state, value_prefix + + def project(self, latent_state: torch.Tensor, with_grad=True): + """ + Overview: + Project the latent state to a lower dimension to calculate the self-supervised loss, which is proposed in EfficientZero. + For more details, please refer to the paper ``Exploring Simple Siamese Representation Learning``. + Arguments: + - latent_state (:obj:`torch.Tensor`): The encoding latent state of input state. 
+ - with_grad (:obj:`bool`): Whether to calculate gradient for the projection result. + Returns: + - proj (:obj:`torch.Tensor`): The result embedding vector of projection operation. + Shapes: + - latent_state (:obj:`torch.Tensor`): :math:`(B, H)`, where B is batch_size, H is the dimension of latent state. + - proj (:obj:`torch.Tensor`): :math:`(B, projection_output_dim)`, where B is batch_size. + + Examples: + >>> latent_state = torch.randn(256, 64) + >>> output = self.project(latent_state) + >>> output.shape # (256, 1024) + """ + proj = self.projection(latent_state) + + if with_grad: + # with grad, use prediction_head + return self.prediction_head(proj) + else: + return proj.detach() + + def get_params_mean(self) -> float: + return get_params_mean(self) + + +class DynamicsNetworkMLP(nn.Module): + + def __init__( + self, + action_encoding_dim: int = 2, + num_channels: int = 64, + common_layer_num: int = 2, + fc_reward_layers: SequenceType = [32], + output_support_size: int = 601, + lstm_hidden_size: int = 512, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + res_connection_in_dynamics: bool = False, + ): + """ + Overview: + The definition of dynamics network in EfficientZero algorithm, which is used to predict next latent state + value_prefix and reward_hidden_state by the given current latent state and action. + The networks are mainly built on fully connected layers. + Arguments: + - action_encoding_dim (:obj:`int`): The dimension of action encoding. + - num_channels (:obj:`int`): The num of channels in latent states. + - common_layer_num (:obj:`int`): The number of common layers in dynamics network. + - fc_reward_layers (:obj:`SequenceType`): The number of hidden layers of the reward head (MLP head). + - output_support_size (:obj:`int`): The size of categorical reward output. + - lstm_hidden_size (:obj:`int`): The hidden size of lstm in dynamics network. + - last_linear_layer_init_zero (:obj:`bool`): Whether to use zero initializationss for the last layer of value/policy head, default sets it to True. + - activation (:obj:`Optional[nn.Module]`): Activation function used in network, which often use in-place \ + operation to speedup, e.g. ReLU(inplace=True). + - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'. + - res_connection_in_dynamics (:obj:`bool`): Whether to use residual connection in dynamics network. 
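+        Examples:
+            >>> # Illustrative construction; the argument values are assumptions matching the
+            >>> # CrowdSim multi-discrete config (a 128-dim latent state concatenated with a
+            >>> # one-hot joint action encoding of size 25).
+            >>> dynamics = DynamicsNetworkMLP(
+            ...     action_encoding_dim=25, num_channels=128 + 25, lstm_hidden_size=128,
+            ...     fc_reward_layers=[32], output_support_size=601
+            ... )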
+ """ + super().__init__() + assert num_channels > action_encoding_dim, f'num_channels:{num_channels} <= action_encoding_dim:{action_encoding_dim}' + + self.num_channels = num_channels + self.action_encoding_dim = action_encoding_dim + self.latent_state_dim = self.num_channels - self.action_encoding_dim + self.lstm_hidden_size = lstm_hidden_size + self.activation = activation + self.res_connection_in_dynamics = res_connection_in_dynamics + + if self.res_connection_in_dynamics: + self.fc_dynamics_1 = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + self.fc_dynamics_2 = MLP( + in_channels=self.latent_state_dim, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + else: + self.fc_dynamics = MLP( + in_channels=self.num_channels, + hidden_channels=self.latent_state_dim, + layer_num=common_layer_num, + out_channels=self.latent_state_dim, + activation=activation, + norm_type=norm_type, + output_activation=True, + output_norm=True, + # last_linear_layer_init_zero=False is important for convergence + last_linear_layer_init_zero=False, + ) + + # input_shape: (sequence_length,batch_size,input_size) + # output_shape: (sequence_length, batch_size, hidden_size) + self.lstm = nn.LSTM(input_size=self.latent_state_dim, hidden_size=self.lstm_hidden_size) + + self.fc_reward_head = MLP( + in_channels=self.lstm_hidden_size, + hidden_channels=fc_reward_layers[0], + layer_num=2, + out_channels=output_support_size, + activation=self.activation, + norm_type=norm_type, + output_activation=False, + output_norm=False, + last_linear_layer_init_zero=last_linear_layer_init_zero + ) + + def forward(self, state_action_encoding: torch.Tensor, reward_hidden_state): + """ + Overview: + Forward computation of the dynamics network. Predict next latent state given current state_action_encoding and reward hidden state. + Arguments: + - state_action_encoding (:obj:`torch.Tensor`): The state-action encoding, which is the concatenation of \ + latent state and action encoding, with shape (batch_size, num_channels, height, width). + - reward_hidden_state (:obj:`Tuple[torch.Tensor, torch.Tensor]`): The input hidden state of LSTM about reward. + Returns: + - next_latent_state (:obj:`torch.Tensor`): The next latent state, with shape (batch_size, latent_state_dim). + - next_reward_hidden_state (:obj:`torch.Tensor`): The input hidden state of LSTM about reward. + - value_prefix (:obj:`torch.Tensor`): The predicted prefix sum of value for input state. 
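+        Examples:
+            >>> # Illustrative shape check reusing the ``dynamics`` instance sketched in the
+            >>> # constructor example above (latent_state_dim=128, action_encoding_dim=25,
+            >>> # lstm_hidden_size=128, output_support_size=601).
+            >>> state_action_encoding = torch.randn(8, 128 + 25)
+            >>> reward_hidden_state = (torch.zeros(1, 8, 128), torch.zeros(1, 8, 128))
+            >>> next_latent, next_hidden, value_prefix = dynamics(state_action_encoding, reward_hidden_state)
+            >>> next_latent.shape, value_prefix.shape  # (torch.Size([8, 128]), torch.Size([8, 601]))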
+ """ + if self.res_connection_in_dynamics: + # take the state encoding (latent_state), state_action_encoding[:, -self.action_encoding_dim] + # is action encoding + latent_state = state_action_encoding[:, :-self.action_encoding_dim] + x = self.fc_dynamics_1(state_action_encoding) + # the residual link: add state encoding to the state_action encoding + next_latent_state = x + latent_state + next_latent_state_ = self.fc_dynamics_2(next_latent_state) + else: + next_latent_state = self.fc_dynamics(state_action_encoding) + next_latent_state_ = next_latent_state + + next_latent_state_unsqueeze = next_latent_state_.unsqueeze(0) + value_prefix, next_reward_hidden_state = self.lstm(next_latent_state_unsqueeze, reward_hidden_state) + value_prefix = self.fc_reward_head(value_prefix.squeeze(0)) + + return next_latent_state, next_reward_hidden_state, value_prefix + + def get_dynamic_mean(self) -> float: + return get_dynamic_mean(self) + + def get_reward_mean(self) -> Tuple[ndarray, float]: + return get_reward_mean(self) diff --git a/lzero/policy/efficientzero.py b/lzero/policy/efficientzero.py index 3a94baf51..b9e844cbe 100644 --- a/lzero/policy/efficientzero.py +++ b/lzero/policy/efficientzero.py @@ -218,6 +218,8 @@ def default_model(self) -> Tuple[str, List[str]]: return 'EfficientZeroModel', ['lzero.model.efficientzero_model'] elif self._cfg.model.model_type == "mlp": return 'EfficientZeroModelMLP', ['lzero.model.efficientzero_model_mlp'] + elif self._cfg.model.model_type == "mlp_md": + return 'EfficientZeroModelMD', ['lzero.model.efficientzero_model_md'] else: raise ValueError("model type {} is not supported".format(self._cfg.model.model_type)) diff --git a/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py new file mode 100644 index 000000000..16e395606 --- /dev/null +++ b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py @@ -0,0 +1,114 @@ +from easydict import EasyDict +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '0' +# ============================================================== +# begin of the most frequently changed config specified by the user +# ============================================================== +collector_env_num = 8 +n_episode = 8 +evaluator_env_num = 3 +num_simulations = 25 +update_per_collect = 100 +batch_size = 256 +max_env_step = int(3e5) +reanalyze_ratio = 0. 
+robot_num = 2 +human_num = 59 # purdue +# human_num = 33 # NCSU +# human_num = 92 # KAIST +one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +# ============================================================== +# end of the most frequently changed config specified by the user +# ============================================================== + +CrowdSim_efficientzero_md_config = dict( + exp_name= + f'result/old_env/new_CrowdSim_ez_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + env=dict( + env_mode='easy', + obs_mode='1-dim-array', + env_name='CrowdSim-v0', + dataset = 'purdue', + robot_num = robot_num, + human_num = human_num, + one_uav_action_space = one_uav_action_space, + continuous=False, + manually_discretization=False, + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + manager=dict(shared_memory=False, ), + ), + policy=dict( + model=dict( + agent_num = robot_num, + observation_shape=(robot_num + human_num)*4, + obs_mode='1-dim-array', + robot_state_dim = 4, + human_state_dim = 4, + robot_num = robot_num, + human_num = human_num, + single_agent_action_size=len(one_uav_action_space), + action_space_size=(len(one_uav_action_space))**robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. + lstm_hidden_size=128, + latent_state_dim=128, + discrete_action_encoding_type='one_hot', + res_connection_in_dynamics=True, + norm_type='BN', + ), + cuda=True, + env_type='not_board_games', + game_segment_length=200, + update_per_collect=update_per_collect, + batch_size=batch_size, + optim_type='Adam', + lr_piecewise_constant_decay=False, + learning_rate=0.003, + grad_clip_value=0.5, + num_simulations=num_simulations, + reanalyze_ratio=reanalyze_ratio, + n_episode=n_episode, + eval_freq=int(1e3), + replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + ), +) + +CrowdSim_efficientzero_md_config = EasyDict(CrowdSim_efficientzero_md_config) +main_config = CrowdSim_efficientzero_md_config + +CrowdSim_efficientzero_md_create_config = dict( + env=dict( + type='crowdsim_lightzero', + import_names=['zoo.CrowdSim.envs.crowdsim_lightzero_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='efficientzero', + import_names=['lzero.policy.efficientzero'], + ), + collector=dict( + type='episode_muzero', + import_names=['lzero.worker.muzero_collector'], + ) +) +CrowdSim_efficientzero_md_create_config = EasyDict(CrowdSim_efficientzero_md_create_config) +create_config = CrowdSim_efficientzero_md_create_config + +if __name__ == "__main__": + # Users can use different train entry by specifying the entry_type. + entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} + + if entry_type == "train_muzero": + from lzero.entry import train_muzero + elif entry_type == "train_muzero_with_gym_env": + """ + The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. + Users can refer to lzero/envs/wrappers for more details. 
+ """ + from lzero.entry import train_muzero_with_gym_env as train_muzero + + train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) From cb044af9a4bb78f8d5d902478c7350f3f60b2f88 Mon Sep 17 00:00:00 2001 From: nighood Date: Tue, 7 May 2024 19:39:53 +0800 Subject: [PATCH 13/16] polish(rjy): add v_trans in config --- .../config/crowdsim_efficientzero_md_config.py | 14 ++++++-------- zoo/CrowdSim/config/crowdsim_muzero_md_config.py | 8 +++++--- zoo/CrowdSim/config/crowdsim_sez_md_config.py | 5 ++++- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 4 ++-- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py index 16e395606..ff2a0913c 100644 --- a/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py @@ -1,6 +1,6 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '0' +os.environ["CUDA_VISIBLE_DEVICES"] = '1' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -17,15 +17,17 @@ # human_num = 33 # NCSU # human_num = 92 # KAIST one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +transmit_v = 20 # ============================================================== # end of the most frequently changed config specified by the user # ============================================================== CrowdSim_efficientzero_md_config = dict( exp_name= - f'result/old_env/new_CrowdSim_ez_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + f'result/new_env/new_CrowdSim_ez_md_ssl_vt{transmit_v}_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( - env_mode='easy', + env_mode='hard', + transmit_v=transmit_v, obs_mode='1-dim-array', env_name='CrowdSim-v0', dataset = 'purdue', @@ -70,7 +72,7 @@ num_simulations=num_simulations, reanalyze_ratio=reanalyze_ratio, n_episode=n_episode, - eval_freq=int(1e3), + eval_freq=int(2e2), replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. 
collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, @@ -90,10 +92,6 @@ type='efficientzero', import_names=['lzero.policy.efficientzero'], ), - collector=dict( - type='episode_muzero', - import_names=['lzero.worker.muzero_collector'], - ) ) CrowdSim_efficientzero_md_create_config = EasyDict(CrowdSim_efficientzero_md_create_config) create_config = CrowdSim_efficientzero_md_create_config diff --git a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py index c0b3b1018..a79eb0bbe 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py @@ -17,15 +17,17 @@ # human_num = 33 # NCSU # human_num = 92 # KAIST one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +transmit_v = 20 # ============================================================== # end of the most frequently changed config specified by the user # ============================================================== CrowdSim_muzero_config = dict( exp_name= - f'result/new_env/new_CrowdSim_muzeromd_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + f'result/new_env/new_CrowdSim_vt{transmit_v}_muzero_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( - env_mode = 'easy', + env_mode = 'hard', + transmit_v=transmit_v, obs_mode='1-dim-array', env_name='CrowdSim-v0', dataset = 'purdue', @@ -74,7 +76,7 @@ num_simulations=num_simulations, reanalyze_ratio=reanalyze_ratio, n_episode=n_episode, - eval_freq=int(1e3), + eval_freq=int(2e2), replay_buffer_size=int(1e6), # the size/capacity of replay_buffer, in the terms of transitions. collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, diff --git a/zoo/CrowdSim/config/crowdsim_sez_md_config.py b/zoo/CrowdSim/config/crowdsim_sez_md_config.py index 08ace6a98..ae2e5074c 100644 --- a/zoo/CrowdSim/config/crowdsim_sez_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_sez_md_config.py @@ -18,15 +18,18 @@ # human_num = 92 # KAIST one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] K = 10 +transmit_v = 20 # ============================================================== # end of the most frequently changed config specified by the user # ============================================================== CrowdSim_sez_config = dict( exp_name= - f'result/old_env/new_CrowdSim_sez_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + f'result/new_env/new_CrowdSim_vt{transmit_v}_sez_md_ssl_K{K}_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( + env_mode='hard', obs_mode='1-dim-array', + transmit_v=transmit_v, env_name='CrowdSim-v0', dataset = 'purdue', robot_num = robot_num, diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index ba855b35d..b590ecb9f 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -44,7 +44,7 @@ def __init__(self, dataset, custom_config=None): self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(self.robot_num+self.human_num, 4), dtype=np.float32) # load_dataset - self.transmit_v = 20 # 5*0.3Mb/s + self.transmit_v = self.config.transmit_v # 5*0.3Mb/s self.nlon = self.config.nlon self.nlat = self.config.nlat self.lower_left = self.config.lower_left @@ -68,7 +68,7 @@ def __init__(self, dataset, custom_config=None): self.human_df['aoi'] = -1 # 加入aoi记录aoi self.human_df['data_amount'] = -1 # record the remaining data 
amount of each human
         self.human_df['energy'] = -1  # add an energy column to record energy
-        logging.info('Env mode:', self.env_mode)
+        logging.info('Env mode: {}'.format(self.env_mode))
         logging.info('human number: {}'.format(self.human_num))
         logging.info('Robot number: {}'.format(self.robot_num))

From 715b5b82134bc5f1450c798d2e4e11551e9f8fc3 Mon Sep 17 00:00:00 2001
From: nighood
Date: Tue, 11 Jun 2024 15:37:52 +0800
Subject: [PATCH 14/16] fix(rjy): fix env bug

---
 lzero/worker/muzero_evaluator.py              |  9 +-
 zoo/CrowdSim/entry/eval_crowdsim.py           | 83 +++++++++++++++++++
 zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py   |  6 +-
 zoo/CrowdSim/envs/Crowdsim/env/model/agent.py |  3 +-
 zoo/CrowdSim/envs/crowdsim_lightzero_env.py   |  6 +-
 5 files changed, 97 insertions(+), 10 deletions(-)
 create mode 100644 zoo/CrowdSim/entry/eval_crowdsim.py

diff --git a/lzero/worker/muzero_evaluator.py b/lzero/worker/muzero_evaluator.py
index a718d33b7..d96cf90f7 100644
--- a/lzero/worker/muzero_evaluator.py
+++ b/lzero/worker/muzero_evaluator.py
@@ -342,11 +342,12 @@ def eval(
                     # Env reset is done by env_manager automatically.
                     self._policy.reset([env_id])
                     reward = t.info['eval_episode_return']
-                    # 'performance_info' and 'episode_info' only choose one
+                    saved_info = {'eval_episode_return': t.info['eval_episode_return']}
                     if 'performance_info' in t.info:
-                        eval_monitor.update_info(env_id, t.info['performance_info'])
-                    elif 'episode_info' in t.info:
-                        eval_monitor.update_info(env_id, t.info['episode_info'])
+                        saved_info.update(t.info['performance_info'])
+                    if 'episode_info' in t.info:
+                        saved_info.update(t.info['episode_info'])
+                    eval_monitor.update_info(env_id, saved_info)
                     eval_monitor.update_reward(env_id, reward)
                     self._logger.info(
                         "[EVALUATOR]env {} finish episode, final reward: {}, current episode: {}".format(
diff --git a/zoo/CrowdSim/entry/eval_crowdsim.py b/zoo/CrowdSim/entry/eval_crowdsim.py
new file mode 100644
index 000000000..13297c76f
--- /dev/null
+++ b/zoo/CrowdSim/entry/eval_crowdsim.py
@@ -0,0 +1,83 @@
+from lzero.entry import eval_muzero
+import numpy as np
+
+if __name__ == "__main__":
+    """
+    Overview:
+        Main script to evaluate the MuZero model on the CrowdSim environment. The script will loop over multiple seeds,
+        evaluating a certain number of episodes per seed. Results are aggregated and printed.
+
+    Variables:
+        - model_path (:obj:`Optional[str]`): The pretrained model path, pointing to the ckpt file of the pretrained model.
+          The path is usually something like ``exp_name/ckpt/ckpt_best.pth.tar``.
+        - seeds (:obj:`List[int]`): List of seeds to use for the evaluations.
+        - num_episodes_each_seed (:obj:`int`): Number of episodes to evaluate for each seed.
+        - total_test_episodes (:obj:`int`): Total number of test episodes, calculated as num_episodes_each_seed * len(seeds).
+        - returns_mean_seeds (:obj:`np.array`): Array of mean return values for each seed.
+        - returns_seeds (:obj:`np.array`): Array of all return values for each seed.
+    """
+    # Importing the necessary configuration files from the CrowdSim MuZero configuration in the zoo directory.
+ # module_path = '/home/nighoodRen/LightZero/result/new_env/new_CrowdSim_vt20_muzero_md_ssl_step300000_uav2__human59_seed0' + # import sys + # if module_path not in sys.path: + # sys.path.append(module_path) + # # 导入模块中的内容 + # from formatted_total_config import main_config, create_config + # from result.new_env.new_CrowdSim_vt20_muzero_md_ssl_step300000_uav2__human59_seed0.formatted_total_config import main_config, create_config + from zoo.CrowdSim.config.crowdsim_muzero_md_config import main_config, create_config + + # model_path is the path to the trained MuZero model checkpoint. + # If no path is provided, the script will use the default model. + model_path = '/home/nighoodRen/LightZero/result/old_env/CrowdSim_muzeromd_ssl_step300000_uav2__human59_seed0_240503_022923/ckpt/ckpt_best.pth.tar' + main_config.exp_name = '/home/nighoodRen/LightZero/result/old_env/CrowdSim_muzeromd_ssl_step300000_uav2__human59_seed0_240503_022923/' + 'eval' # original result folder/eval + # seeds is a list of seed values for the random number generator, used to initialize the environment. + seeds = [0] + # num_episodes_each_seed is the number of episodes to run for each seed. + num_episodes_each_seed = 1 + # total_test_episodes is the total number of test episodes, calculated as the product of the number of seeds and the number of episodes per seed + total_test_episodes = num_episodes_each_seed * len(seeds) + + # Setting the type of the environment manager to 'base' for the visualization purposes. + create_config.env_manager.type = 'base' + # The number of environments to evaluate concurrently. Set to 1 for visualization purposes. + main_config.env.evaluator_env_num = 1 + # The total number of evaluation episodes that should be run. + main_config.env.n_evaluator_episode = total_test_episodes + # A boolean flag indicating whether to render the environments in real-time. + main_config.env.render_mode_human = False + + # A boolean flag indicating whether to save the video of the environment. + main_config.env.save_replay = True + # The path where the recorded video will be saved. + main_config.env.replay_path = main_config.exp_name + '/video' # current result folder/eval + + # The maximum number of steps for each episode during evaluation. This may need to be adjusted based on the specific characteristics of the environment. + main_config.env.eval_max_episode_steps = int(20) + + # These lists will store the mean and total rewards for each seed. + returns_mean_seeds = [] + returns_seeds = [] + + # The main evaluation loop. For each seed, the MuZero model is evaluated and the mean and total rewards are recorded. + for seed in seeds: + returns_mean, returns = eval_muzero( + [main_config, create_config], + seed=seed, + num_episodes_each_seed=num_episodes_each_seed, + print_seed_details=False, + model_path=model_path + ) + print(returns_mean, returns) + returns_mean_seeds.append(returns_mean) + returns_seeds.append(returns) + + # Convert the list of mean and total rewards into numpy arrays for easier statistical analysis. + returns_mean_seeds = np.array(returns_mean_seeds) + returns_seeds = np.array(returns_seeds) + + # Printing the evaluation results. The average reward and the total reward for each seed are displayed, followed by the mean reward across all seeds. + print("=" * 20) + print(f"We evaluated a total of {len(seeds)} seeds. 
For each seed, we evaluated {num_episodes_each_seed} episode(s).") + print(f"For seeds {seeds}, the mean returns are {returns_mean_seeds}, and the returns are {returns_seeds}.") + print("Across all seeds, the mean reward is:", returns_mean_seeds.mean()) + print("=" * 20) \ No newline at end of file diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index b590ecb9f..7ddf4550a 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -74,7 +74,7 @@ def __init__(self, dataset, custom_config=None): # for debug self.current_human_aoi_list = np.zeros([self.human_num, ]) - self.mean_aoi_timelist = np.ones([self.config.num_timestep + 1, ]) + self.mean_aoi_timelist = np.zeros([self.config.num_timestep + 1, ]) self.cur_data_amount_timelist = np.zeros([self.human_num, ]) self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_x_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) @@ -205,7 +205,7 @@ def step(self, action): else: human_transmit_data_list[human_id] = 1 - human.set(next_px, next_py, next_theta, aoi=1, data_amount=human.aoi) + human.set(next_px, next_py, next_theta, aoi=0, data_amount=0) num_updated_human += 1 else: # if the human is not in the range of the robot, then update the aoi of the human @@ -218,7 +218,7 @@ def step(self, action): # if the human is in the range of the robot, then part of human's data will be transmitted last_data_amount = human.data_amount human.update(next_px, next_py, next_theta, transmitted_data=self.transmit_v) - human_transmit_data_list[human_id] = min(last_data_amount, self.transmit_v) + human_transmit_data_list[human_id] = min(last_data_amount + human.collect_v, self.transmit_v) num_updated_human += 1 else: # if the human is not in the range of the robot, then no data will be transmitted, \ diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py index aabc184d2..db6c47b31 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py @@ -26,7 +26,7 @@ def act(self, state, current_timestep): class Human(): - collect_v_prob = {1: 0.3, 2: 0.6} + # collect_v_prob = {1: 1, 2: 0} def __init__(self, id, config): self.id = id self.config = config @@ -36,6 +36,7 @@ def __init__(self, id, config): self.aoi = 0 self.data_queue = InformationQueue() self.data_amount = 0 + self.collect_v_prob = getattr(self.config, 'collect_v_prob', {1: 1, 2: 0}) self.collect_v = random.choices(list(self.collect_v_prob.keys()), list(self.collect_v_prob.values()))[0] def set(self, px, py, theta, aoi, data_amount): diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py index 4aca9dd1a..bcecc89bc 100644 --- a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -19,7 +19,7 @@ class CrowdSimEnv(BaseEnv): def __init__(self, cfg: dict = {}) -> None: self._cfg = cfg self._init_flag = False - self._replay_path = None + self._replay_path = cfg.get('replay_path', None) self._robot_num = self._cfg.robot_num self._human_num = self._cfg.human_num self._observation_space = gym.spaces.Dict({ @@ -124,7 +124,9 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: if self._replay_path is not None: self._frame.append(self._env.render()) if done: - import imageio + import imageio, os + if not os.path.exists(self._replay_path): + 
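# For reference, eval_muzero presumably returns the per-seed mean return and the list of
# individual episode returns, so after the loop returns_mean_seeds has shape (len(seeds),)
# and returns_seeds has shape (len(seeds), num_episodes_each_seed). With the defaults above
# (one seed, one episode) the summary printed here reduces to a single number, e.g. with a
# made-up value:
#
#     returns_mean_seeds = np.array([12.3])
#     returns_seeds = np.array([[12.3]])
#     returns_mean_seeds.mean()  # 12.3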
os.makedirs(self._replay_path) imageio.mimsave(self._replay_path + '/replay.gif', self._frame) return BaseEnvTimestep(obs, rew, done, info) From fecf5d37a3438ffdd5f049398c0737219d49a904 Mon Sep 17 00:00:00 2001 From: nighood Date: Fri, 14 Jun 2024 15:43:26 +0800 Subject: [PATCH 15/16] feature(rjy): add entropy info/set margin --- lzero/worker/muzero_collector.py | 3 ++ .../config/crowdsim_muzero_md_config.py | 11 +++-- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 49 ++++++++++++------- zoo/CrowdSim/envs/Crowdsim/env/model/agent.py | 2 +- zoo/CrowdSim/envs/crowdsim_lightzero_env.py | 2 + .../envs/test_crowdsim_lightzero_env.py | 5 +- 6 files changed, 49 insertions(+), 23 deletions(-) diff --git a/lzero/worker/muzero_collector.py b/lzero/worker/muzero_collector.py index 3c1a630dd..7b8f67ed3 100644 --- a/lzero/worker/muzero_collector.py +++ b/lzero/worker/muzero_collector.py @@ -587,6 +587,7 @@ def collect(self, 'mean_energy_consumption': mean_energy_consumption, 'transmitted_data_ratio': transmitted_data_ratio, 'human_coverage': human_coverage, + "distribution_entropy": timestep.info['performance_info']['distribution_entropy'] } else: info = { @@ -735,6 +736,7 @@ def _output_log(self, train_iter: int) -> None: episode_transmitted_data_ratio = [d['transmitted_data_ratio'] for d in self._episode_info] episode_human_coverage = [d['human_coverage'] for d in self._episode_info] mean_transmit_data = [d['mean_transmit_data'] for d in self._episode_info] + mean_distribution_entropy = [d['distribution_entropy'] for d in self._episode_info] info = { 'episode_count': episode_count, 'envstep_count': envstep_count, @@ -755,6 +757,7 @@ def _output_log(self, train_iter: int) -> None: 'episode_mean_energy_consumption': np.mean(episode_energy_consumption), 'episode_mean_transmitted_data_ratio': np.mean(episode_transmitted_data_ratio), 'episode_mean_human_coverage': np.mean(episode_human_coverage), + 'episode_mean_distribution_entropy': np.mean(mean_distribution_entropy), } else: info = { diff --git a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py index a79eb0bbe..e80476efe 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py @@ -7,8 +7,9 @@ collector_env_num = 8 n_episode = 8 evaluator_env_num = 3 -num_simulations = 25 -update_per_collect = 100 +num_simulations = 50 +# num_simulations = 25 +update_per_collect = 250 batch_size = 256 max_env_step = int(3e5) reanalyze_ratio = 0. @@ -28,6 +29,7 @@ env=dict( env_mode = 'hard', transmit_v=transmit_v, + collect_v_prob = {'1': 1, '2': 0}, obs_mode='1-dim-array', env_name='CrowdSim-v0', dataset = 'purdue', @@ -65,6 +67,7 @@ ), cuda=True, env_type='not_board_games', + # game_segment_length=120, game_segment_length=200, update_per_collect=update_per_collect, batch_size=batch_size, @@ -72,7 +75,7 @@ lr_piecewise_constant_decay=False, learning_rate=0.003, ssl_loss_weight=2, # NOTE: default is 0. 
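# For reference, the replay branch above buffers env.render() frames during the episode and
# writes them out once at the end. A minimal standalone sketch of the same pattern (the
# directory name and frame contents are made up):

import os
import imageio
import numpy as np

replay_path = './replay_demo'
frames = [np.zeros((64, 64, 3), dtype=np.uint8) for _ in range(10)]
os.makedirs(replay_path, exist_ok=True)
imageio.mimsave(os.path.join(replay_path, 'replay.gif'), frames)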
- grad_clip_value=0.5, + grad_clip_value=10, num_simulations=num_simulations, reanalyze_ratio=reanalyze_ratio, n_episode=n_episode, @@ -91,7 +94,7 @@ type='crowdsim_lightzero', import_names=['zoo.CrowdSim.envs.crowdsim_lightzero_env'], ), - env_manager=dict(type='base'), + env_manager=dict(type='subprocess'), policy=dict( type='muzero', import_names=['lzero.policy.muzero'], diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index 7ddf4550a..b8b3bc43b 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -5,6 +5,7 @@ import gym # from shapely.geometry import Point import numpy as np +from scipy.stats import entropy # import folium # from folium.plugins import TimestampedGeoJson, AntPath @@ -51,18 +52,6 @@ def __init__(self, dataset, custom_config=None): self.upper_right = self.config.upper_right self.human_df = pd.read_csv(self.config.dataset_dir) logging.info("Finished reading {} rows".format(len(self.human_df))) - # # for temporarily processing data - # sample_list=np.random.choice(self.human_num, size=[50,], replace=False) - # sample_list=sample_list[np.argsort(sample_list)] - # print(sample_list) - # self.human_df= self.human_df[self.human_df["id"].isin(sample_list)] - # for i,human_id in enumerate(sample_list): - # mask=(self.human_df["id"]==human_id) - # self.human_df.loc[mask,"id"]=i - # self.human_df=self.human_df.sort_values(by=["id","timestamp"],ascending=[True,True]) - # print(self.human_df.head()) - # self.human_df.to_csv("50 users-5.csv",index=False) - # exit(0) self.human_df['t'] = pd.to_datetime(self.human_df['timestamp'], unit='s') # 's' stands for second self.human_df['aoi'] = -1 # 加入aoi记录aoi @@ -81,6 +70,8 @@ def __init__(self, dataset, custom_config=None): self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.update_human_timelist = np.zeros([self.config.num_timestep, ]) self.data_transmission = 0 + self.data_collection_distribution = np.zeros(self.human_num) + self.data_transmission_distribution = np.zeros(self.human_num) def set_agent(self, agent): self.agent = agent @@ -144,11 +135,15 @@ def reset(self, phase='test', test_case=None): self.robot_y_timelist[self.current_timestep, :] = self.nlat / 2 self.update_human_timelist = np.zeros([self.config.num_timestep, ]) self.data_transmission = 0 + self.data_collection_distribution = np.zeros(self.human_num) + self.data_transmission_distribution = np.zeros(self.human_num) # for visualization self.plot_states = [] self.robot_actions = [] self.rewards = [] + self.aoi_rewards = [] + self.energy_rewards = [] self.action_values = [] self.plot_states.append([[robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]]) @@ -178,7 +173,7 @@ def step(self, action): new_energy = robot.energy - consume_energy self.robot_energy_timelist[self.current_timestep + 1][robot_id] = new_energy - if is_collide is True: + if is_collide or (new_robot_px < 0 or new_robot_px > self.nlon or new_robot_py < 0 or new_robot_py > self.nlat): new_robot_position[robot_id][0] = robot.px new_robot_position[robot_id][1] = robot.py self.robot_x_timelist[self.current_timestep + 1][robot_id] = robot.px @@ -231,6 +226,8 @@ def step(self, action): self.cur_data_amount_timelist[human_id] = human.data_amount self.current_human_aoi_list[human_id] = human.aoi self.sync_human_df(human_id, self.current_timestep + 1, human.aoi, human.data_amount) + self.data_collection_distribution[human_id] += 
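# For reference, grad_clip_value is passed to torch.nn.utils.clip_grad_norm_ in the policy's
# learn step, so it bounds the global L2 norm of the gradients rather than clipping each
# element. A standalone sketch of that behaviour:
#
#     import torch
#
#     w = torch.nn.Parameter(torch.ones(3))
#     (100 * w.sum()).backward()                       # grad = [100, 100, 100], norm ~= 173.2
#     torch.nn.utils.clip_grad_norm_([w], max_norm=10)
#     print(w.grad.norm())                             # ~= 10 after rescaling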
human.collect_v + self.data_transmission_distribution[human_id] += human_transmit_data_list[human_id] self.mean_aoi_timelist[self.current_timestep + 1] = np.mean(self.current_human_aoi_list) self.update_human_timelist[self.current_timestep] = num_updated_human @@ -244,13 +241,20 @@ def step(self, action): self.total_generated_data_amount += self.generate_data_amount_per_step # TODO: need to be well-defined - reward = self.mean_aoi_timelist[self.current_timestep] - self.mean_aoi_timelist[self.current_timestep + 1] \ - - self.config.energy_factor * np.sum(current_enenrgy_consume) + aoi_reward = self.mean_aoi_timelist[self.current_timestep] - self.mean_aoi_timelist[self.current_timestep + 1] + energy_reward = np.sum(current_enenrgy_consume) + reward = aoi_reward \ + - self.config.energy_factor * energy_reward # if hasattr(self.agent.policy, 'action_values'): # self.action_values.append(self.agent.policy.action_values) self.robot_actions.append(action) self.rewards.append(reward) + self.aoi_rewards.append(aoi_reward) + self.energy_rewards.append(energy_reward) + distribution_entropy = entropy( + self.data_collection_distribution/ np.sum(self.data_collection_distribution), + self.data_transmission_distribution/np.sum(self.data_transmission_distribution) + 1e-10) self.plot_states.append([[robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]]) @@ -266,11 +270,12 @@ def step(self, action): info = { "performance_info": { "mean_aoi": self.mean_aoi_timelist[self.current_timestep], - "mean_transmit_data": delta_sum_transmit_data / self.human_num, + "mean_transmit_data": self.data_transmission / self.human_num, "mean_energy_consumption": 1.0 - ( np.mean(self.robot_energy_timelist[self.current_timestep]) / self.max_uav_energy), "transmitted_data_ratio": self.data_transmission/(self.total_generated_data_amount*0.3), - "human_coverage": np.mean(self.update_human_timelist) / self.human_num + "human_coverage": np.mean(self.update_human_timelist) / self.human_num, + "distribution_entropy": distribution_entropy # 增加交叉熵信息 }, } @@ -286,6 +291,7 @@ def render(self): map_max_y = self.config.nlat # 创建一个新的图形 fig, ax = plt.subplots(figsize=(8, 6)) + plt.subplots_adjust(right=0.75) # 给数据留白 # 绘制机器人的历史轨迹 for timestep in range(len(self.robot_x_timelist)): @@ -314,6 +320,15 @@ def render(self): ax.set_xlabel('X') ax.set_ylabel('Y') + # 在图的右上角显示reward/aoi_reward/energy_reward/mean_aoi/energy + reward_text = f"Reward: {self.rewards[-1] if self.rewards else 0:.2f}\n" \ + f"AOI Reward: {self.aoi_rewards[-1] if self.aoi_rewards else 0:.2f}\n" \ + f"Energy Reward: {self.energy_rewards[-1] if self.energy_rewards else 0:.2f}\n" \ + f"Mean AOI: {self.mean_aoi_timelist[self.current_timestep] if self.current_timestep < len(self.mean_aoi_timelist) else 0:.2f}\n" \ + f"Energy: {np.mean(self.robot_energy_timelist[self.current_timestep]) if self.current_timestep < len(self.robot_energy_timelist) else 0:.2f}" + plt.text(1.05, 0.95, reward_text, horizontalalignment='left', verticalalignment='top', + transform=ax.transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.6), + clip_on=False) # Ensure text is not clipped # 在地图之外留出一些空白区域 ax.margins(x=0.1, y=0.1) ax.set_title('Crowd Simulation Visualization') diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py index db6c47b31..03a4f5af3 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py @@ -37,7 +37,7 @@ def __init__(self, id, config): 
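# For reference, the per-human collection and transmission tallies accumulated here feed the
# distribution_entropy metric added by this patch, which is computed with scipy.stats.entropy.
# With two arguments that function normalizes both inputs to sum to 1 and returns the KL
# divergence sum(pk * log(pk / qk)); identical distributions give 0. A standalone check with
# made-up numbers:
#
#     import numpy as np
#     from scipy.stats import entropy
#
#     collected = np.array([4.0, 1.0, 1.0])     # data collected per human
#     transmitted = np.array([3.0, 2.0, 1.0])   # data transmitted per human
#     kl = entropy(collected / collected.sum(),
#                  transmitted / transmitted.sum() + 1e-10)
#     assert kl >= 0.0
#     assert abs(entropy(collected, collected)) < 1e-12  # D(p || p) == 0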
self.data_queue = InformationQueue() self.data_amount = 0 self.collect_v_prob = getattr(self.config, 'collect_v_prob', {1: 1, 2: 0}) - self.collect_v = random.choices(list(self.collect_v_prob.keys()), list(self.collect_v_prob.values()))[0] + self.collect_v = random.choices(list(map(int, self.collect_v_prob.keys())), list(self.collect_v_prob.values()))[0] def set(self, px, py, theta, aoi, data_amount): self.px = px diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py index bcecc89bc..59548dbe5 100644 --- a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -128,6 +128,8 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: if not os.path.exists(self._replay_path): os.makedirs(self._replay_path) imageio.mimsave(self._replay_path + '/replay.gif', self._frame) + # save env.human_df as csv + self._env.human_df.to_csv(self._replay_path + '/human_df.csv') return BaseEnvTimestep(obs, rew, done, info) def enable_save_replay(self, replay_path: Optional[str] = None) -> None: diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py index 09f36d6b7..9ff396da5 100644 --- a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -10,7 +10,9 @@ human_num = 59, # purdue one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]], obs_mode = '2-dim-array', - env_mode = 'easy', + env_mode = 'hard', + transmit_v=120, + collect_v_prob = {'1': 1, '2': 0}, ) @ pytest.mark.envtest @@ -72,6 +74,7 @@ def test_obs_1_dim_array(self): if timestep.done: break print(env.observation_space, env.action_space, env.reward_space) + print('episode reward:', timestep.info['eval_episode_return']) env.close() From 63d37a89086dfd8dc7813221a3eca9673cbf97b4 Mon Sep 17 00:00:00 2001 From: nighood Date: Thu, 20 Jun 2024 18:29:00 +0800 Subject: [PATCH 16/16] polish(rjy): polish code according to comments --- lzero/agent/efficientzero.py | 8 +- lzero/agent/muzero.py | 8 +- lzero/agent/sampled_efficientzero.py | 15 +- lzero/mcts/utils.py | 20 +- lzero/model/common.py | 98 ++++--- lzero/model/common_gcn.py | 75 +++--- lzero/model/muzero_model_gcn.py | 9 +- lzero/model/muzero_model_md.py | 23 +- .../model/sampled_efficientzero_model_gcn.py | 8 +- lzero/model/sampled_efficientzero_model_md.py | 8 +- lzero/model/tests/test_common_gcn.py | 6 +- lzero/model/tests/test_rgcn.py | 10 +- lzero/policy/efficientzero.py | 19 +- lzero/policy/muzero.py | 22 +- lzero/policy/sampled_efficientzero.py | 99 ++++--- lzero/worker/muzero_collector.py | 139 +++++----- lzero/worker/muzero_evaluator.py | 16 +- .../config/CrowdSim_efficientzero_config.py | 35 +-- zoo/CrowdSim/config/CrowdSim_muzero_config.py | 26 +- .../crowdsim_efficientzero_md_config.py | 47 ++-- .../config/crowdsim_muzero_md_config.py | 56 ++-- .../config/crowdsim_muzero_rgcn_config.py | 46 ++-- zoo/CrowdSim/config/crowdsim_sez_md_config.py | 47 ++-- zoo/CrowdSim/entry/eval_crowdsim.py | 14 +- zoo/CrowdSim/envs/CrowdSim_env.py | 18 +- .../envs/Crowdsim/env/base_env_config.py | 137 ---------- zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py | 250 +++++++++++++----- .../Crowdsim/env/crowd_sim_base_config.py | 156 +++++++++++ zoo/CrowdSim/envs/Crowdsim/env/model/agent.py | 102 ++++++- zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py | 15 +- zoo/CrowdSim/envs/Crowdsim/env/model/utils.py | 173 +++++++++--- zoo/CrowdSim/envs/crowdsim_lightzero_env.py | 33 ++- 
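# For reference, the map(int, ...) in the line above guards against configs that spell the
# keys as strings (the zoo config uses collect_v_prob={'1': 1, '2': 0}); random.choices only
# needs the candidate speeds and their weights as parallel sequences:
#
#     import random
#
#     collect_v_prob = {'1': 0.7, '2': 0.3}            # made-up probabilities
#     speeds = list(map(int, collect_v_prob.keys()))   # [1, 2]
#     weights = list(collect_v_prob.values())          # [0.7, 0.3]
#     collect_v = random.choices(speeds, weights)[0]   # 1 or 2, drawn with these weights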
zoo/CrowdSim/envs/test_CrowdSim_env.py | 21 +- .../envs/test_crowdsim_lightzero_env.py | 23 +- 34 files changed, 1039 insertions(+), 743 deletions(-) delete mode 100644 zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py create mode 100644 zoo/CrowdSim/envs/Crowdsim/env/crowd_sim_base_config.py diff --git a/lzero/agent/efficientzero.py b/lzero/agent/efficientzero.py index f06cd844b..bd8e6ff7b 100644 --- a/lzero/agent/efficientzero.py +++ b/lzero/agent/efficientzero.py @@ -127,8 +127,8 @@ def __init__( self.env_fn, self.collector_env_cfg, self.evaluator_env_cfg = get_vec_env_setting(self.cfg.env) def train( - self, - step: int = int(1e7), + self, + step: int = int(1e7), ) -> TrainingReturn: """ Overview: @@ -359,8 +359,8 @@ def deploy( return EvalReturn(eval_value=np.mean(reward_list), eval_value_std=np.std(reward_list)) def batch_evaluate( - self, - n_evaluator_episode: int = None, + self, + n_evaluator_episode: int = None, ) -> EvalReturn: """ Overview: diff --git a/lzero/agent/muzero.py b/lzero/agent/muzero.py index 26ffc6f04..dfb691a69 100644 --- a/lzero/agent/muzero.py +++ b/lzero/agent/muzero.py @@ -130,8 +130,8 @@ def __init__( self.env_fn, self.collector_env_cfg, self.evaluator_env_cfg = get_vec_env_setting(self.cfg.env) def train( - self, - step: int = int(1e7), + self, + step: int = int(1e7), ) -> TrainingReturn: """ Overview: @@ -362,8 +362,8 @@ def deploy( return EvalReturn(eval_value=np.mean(reward_list), eval_value_std=np.std(reward_list)) def batch_evaluate( - self, - n_evaluator_episode: int = None, + self, + n_evaluator_episode: int = None, ) -> EvalReturn: """ Overview: diff --git a/lzero/agent/sampled_efficientzero.py b/lzero/agent/sampled_efficientzero.py index ece54b784..a60dae859 100644 --- a/lzero/agent/sampled_efficientzero.py +++ b/lzero/agent/sampled_efficientzero.py @@ -93,7 +93,12 @@ def __init__( cfg.main_config.exp_name = exp_name self.origin_cfg = cfg self.cfg = compile_config( - cfg.main_config, seed=seed, env=None, auto=True, policy=SampledEfficientZeroPolicy, create_cfg=cfg.create_config + cfg.main_config, + seed=seed, + env=None, + auto=True, + policy=SampledEfficientZeroPolicy, + create_cfg=cfg.create_config ) self.exp_name = self.cfg.exp_name @@ -127,8 +132,8 @@ def __init__( self.env_fn, self.collector_env_cfg, self.evaluator_env_cfg = get_vec_env_setting(self.cfg.env) def train( - self, - step: int = int(1e7), + self, + step: int = int(1e7), ) -> TrainingReturn: """ Overview: @@ -359,8 +364,8 @@ def deploy( return EvalReturn(eval_value=np.mean(reward_list), eval_value_std=np.std(reward_list)) def batch_evaluate( - self, - n_evaluator_episode: int = None, + self, + n_evaluator_episode: int = None, ) -> EvalReturn: """ Overview: diff --git a/lzero/mcts/utils.py b/lzero/mcts/utils.py index 80e3f588e..11afad53a 100644 --- a/lzero/mcts/utils.py +++ b/lzero/mcts/utils.py @@ -6,8 +6,9 @@ from graphviz import Digraph -def generate_random_actions_discrete(num_actions: int, action_space_size: int, num_of_sampled_actions: int, - reshape=False): +def generate_random_actions_discrete( + num_actions: int, action_space_size: int, num_of_sampled_actions: int, reshape=False +): """ Overview: Generate a list of random actions. @@ -19,10 +20,7 @@ def generate_random_actions_discrete(num_actions: int, action_space_size: int, n Returns: A list of random actions. 
""" - actions = [ - np.random.randint(0, action_space_size, num_of_sampled_actions).reshape(-1) - for _ in range(num_actions) - ] + actions = [np.random.randint(0, action_space_size, num_of_sampled_actions).reshape(-1) for _ in range(num_actions)] # If num_of_sampled_actions == 1, flatten the actions to a list of numbers if num_of_sampled_actions == 1: @@ -97,7 +95,9 @@ def prepare_observation(observation_list, model_type='conv'): Returns: - np.ndarray: Reshaped array of observations. """ - assert model_type in ['conv', 'mlp', 'rgcn', 'mlp_md'], "model_type must be either 'conv', 'mlp', 'rgcn' or 'mlp_md'" + assert model_type in [ + 'conv', 'mlp', 'rgcn', 'mlp_md' + ], "model_type must be either 'conv', 'mlp', 'rgcn' or 'mlp_md'" observation_array = np.array(observation_list) batch_size = observation_array.shape[0] @@ -116,7 +116,7 @@ def prepare_observation(observation_list, model_type='conv'): observation_array = observation_array.reshape(batch_size, -1) else: raise ValueError("For 'mlp' model_type, the observation must have 3 dimensions [B, S, O]") - + elif model_type == 'rgcn': if observation_array.ndim == 4: # TODO(rjy): strage process @@ -127,7 +127,9 @@ def prepare_observation(observation_list, model_type='conv'): # Flatten the last two dimensions observation_array = observation_array.reshape(batch_size, -1) else: - raise ValueError("For 'rgcn' model_type, the observation must have 3 dimensions [B, S, O] or 4 dimensions [B, S, M, O]") + raise ValueError( + "For 'rgcn' model_type, the observation must have 3 dimensions [B, S, O] or 4 dimensions [B, S, M, O]" + ) return observation_array diff --git a/lzero/model/common.py b/lzero/model/common.py index ddf4a5d59..2b798a163 100644 --- a/lzero/model/common.py +++ b/lzero/model/common.py @@ -38,10 +38,14 @@ class MZNetworkOutput: class DownSample(nn.Module): - - def __init__(self, observation_shape: SequenceType, out_channels: int, activation: nn.Module = nn.ReLU(inplace=True), - norm_type: Optional[str] = 'BN', - ) -> None: + + def __init__( + self, + observation_shape: SequenceType, + out_channels: int, + activation: nn.Module = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + ) -> None: """ Overview: Define downSample convolution network. Encode the observation into hidden state. 
@@ -74,11 +78,7 @@ def __init__(self, observation_shape: SequenceType, out_channels: int, activatio self.resblocks1 = nn.ModuleList( [ ResBlock( - in_channels=out_channels // 2, - activation=activation, - norm_type='BN', - res_type='basic', - bias=False + in_channels=out_channels // 2, activation=activation, norm_type='BN', res_type='basic', bias=False ) for _ in range(1) ] ) @@ -92,17 +92,15 @@ def __init__(self, observation_shape: SequenceType, out_channels: int, activatio ) self.resblocks2 = nn.ModuleList( [ - ResBlock( - in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False - ) for _ in range(1) + ResBlock(in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False) + for _ in range(1) ] ) self.pooling1 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) self.resblocks3 = nn.ModuleList( [ - ResBlock( - in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False - ) for _ in range(1) + ResBlock(in_channels=out_channels, activation=activation, norm_type='BN', res_type='basic', bias=False) + for _ in range(1) ] ) self.pooling2 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) @@ -176,15 +174,18 @@ def __init__( self.norm = nn.BatchNorm2d(num_channels) elif norm_type == 'LN': if downsample: - self.norm = nn.LayerNorm([num_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)]) + self.norm = nn.LayerNorm( + [num_channels, + math.ceil(observation_shape[-2] / 16), + math.ceil(observation_shape[-1] / 16)] + ) else: self.norm = nn.LayerNorm([num_channels, observation_shape[-2], observation_shape[-1]]) - + self.resblocks = nn.ModuleList( [ - ResBlock( - in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False - ) for _ in range(num_res_blocks) + ResBlock(in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False) + for _ in range(num_res_blocks) ] ) self.activation = activation @@ -225,13 +226,13 @@ def get_param_mean(self) -> float: class RepresentationNetworkMLP(nn.Module): def __init__( - self, - observation_shape: int, - hidden_channels: int = 64, - layer_num: int = 2, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - last_linear_layer_init_zero: bool = True, - norm_type: Optional[str] = 'BN', + self, + observation_shape: int, + hidden_channels: int = 64, + layer_num: int = 2, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + norm_type: Optional[str] = 'BN', ) -> torch.Tensor: """ Overview: @@ -325,26 +326,35 @@ def __init__( self.resblocks = nn.ModuleList( [ - ResBlock( - in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False - ) for _ in range(num_res_blocks) + ResBlock(in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False) + for _ in range(num_res_blocks) ] ) self.conv1x1_value = nn.Conv2d(num_channels, value_head_channels, 1) self.conv1x1_policy = nn.Conv2d(num_channels, policy_head_channels, 1) - + if norm_type == 'BN': self.norm_value = nn.BatchNorm2d(value_head_channels) self.norm_policy = nn.BatchNorm2d(policy_head_channels) elif norm_type == 'LN': if downsample: - self.norm_value = nn.LayerNorm([value_head_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)]) - self.norm_policy = nn.LayerNorm([policy_head_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)]) + 
self.norm_value = nn.LayerNorm( + [value_head_channels, + math.ceil(observation_shape[-2] / 16), + math.ceil(observation_shape[-1] / 16)] + ) + self.norm_policy = nn.LayerNorm( + [ + policy_head_channels, + math.ceil(observation_shape[-2] / 16), + math.ceil(observation_shape[-1] / 16) + ] + ) else: self.norm_value = nn.LayerNorm([value_head_channels, observation_shape[-2], observation_shape[-1]]) self.norm_policy = nn.LayerNorm([policy_head_channels, observation_shape[-2], observation_shape[-1]]) - + self.flatten_output_size_for_value_head = flatten_output_size_for_value_head self.flatten_output_size_for_policy_head = flatten_output_size_for_policy_head self.activation = activation @@ -406,16 +416,16 @@ def forward(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tenso class PredictionNetworkMLP(nn.Module): def __init__( - self, - action_space_size, - num_channels, - common_layer_num: int = 2, - fc_value_layers: SequenceType = [32], - fc_policy_layers: SequenceType = [32], - output_support_size: int = 601, - last_linear_layer_init_zero: bool = True, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - norm_type: Optional[str] = 'BN', + self, + action_space_size, + num_channels, + common_layer_num: int = 2, + fc_value_layers: SequenceType = [32], + fc_policy_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', ): """ Overview: diff --git a/lzero/model/common_gcn.py b/lzero/model/common_gcn.py index 857890c2f..73bd7a20c 100644 --- a/lzero/model/common_gcn.py +++ b/lzero/model/common_gcn.py @@ -9,29 +9,31 @@ from .utils import renormalize, get_params_mean, get_dynamic_mean, get_reward_mean + class RGCNLayer(nn.Module): """ Overview: Relational graph convolutional network layer. 
""" + def __init__( - self, - robot_num: int, - human_num: int, - robot_state_dim, - human_state_dim, - similarity_function, - num_layer = 2, - X_dim = 32, - layerwise_graph = False, - skip_connection = True, - wr_dims = [64, 32], # the last dim should equal to X_dim - wh_dims = [64, 32], # the last dim should equal to X_dim - final_state_dim = 32, # should equal to X_dim - norm_type= None, - last_linear_layer_init_zero=True, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - ): + self, + robot_num: int, + human_num: int, + robot_state_dim, + human_state_dim, + similarity_function, + num_layer=2, + X_dim=32, + layerwise_graph=False, + skip_connection=True, + wr_dims=[64, 32], # the last dim should equal to X_dim + wh_dims=[64, 32], # the last dim should equal to X_dim + final_state_dim=32, # should equal to X_dim + norm_type=None, + last_linear_layer_init_zero=True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + ): super().__init__() # design choice @@ -59,7 +61,7 @@ def __init__( activation=activation, norm_type=norm_type, last_linear_layer_init_zero=last_linear_layer_init_zero, - ) # inputs,64,32 + ) # inputs,64,32 self.w_h = MLP( in_channels=human_state_dim, hidden_channels=wh_dims[0], @@ -68,7 +70,7 @@ def __init__( activation=activation, norm_type=norm_type, last_linear_layer_init_zero=last_linear_layer_init_zero, - ) # inputs,64,32 + ) # inputs,64,32 if self.similarity_function == 'embedded_gaussian': self.w_a = nn.Parameter(torch.randn(self.X_dim, self.X_dim)) @@ -79,7 +81,7 @@ def __init__( hidden_channels=2 * X_dim, out_channels=1, layer_num=1, - ) + ) embedding_dim = self.X_dim self.Ws = torch.nn.ParameterList() @@ -148,10 +150,12 @@ def forward(self, state): stack_num = state.size(1) // ((self.robot_num + self.human_num) * self.robot_state_dim) assert stack_num == 1, "stack_num should be 1 for 1-dim-array obs" # robot_states shape:(B, stack_num*robot_num, state_dim) - robot_states = state[:, :stack_num * self.robot_num * self.robot_state_dim].reshape(-1, self.robot_num, self.robot_state_dim) + robot_states = state[:, :stack_num * self.robot_num * + self.robot_state_dim].reshape(-1, self.robot_num, self.robot_state_dim) # human_states shape:(B, stack_num*human_num, state_dim) - human_states = state[:, stack_num * self.robot_num * self.robot_state_dim:].reshape(-1, self.human_num, self.human_state_dim) - + human_states = state[:, stack_num * self.robot_num * + self.robot_state_dim:].reshape(-1, self.human_num, self.human_state_dim) + # compute feature matrix X robot_state_embedings = self.w_r(robot_states) # batch x num x embedding_dim human_state_embedings = self.w_h(human_states) @@ -180,19 +184,20 @@ def forward(self, state): return next_H + class RepresentationNetworkGCN(nn.Module): def __init__( - self, - robot_state_dim: int, - human_state_dim: int, - robot_num: int, - human_num: int, - hidden_channels: int = 64, - layer_num: int = 2, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - last_linear_layer_init_zero: bool = True, - norm_type: Optional[str] = 'BN', + self, + robot_state_dim: int, + human_state_dim: int, + robot_num: int, + human_num: int, + hidden_channels: int = 64, + layer_num: int = 2, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + last_linear_layer_init_zero: bool = True, + norm_type: Optional[str] = 'BN', ) -> torch.Tensor: """ Overview: @@ -229,7 +234,7 @@ def __init__( num_layer=2, X_dim=hidden_channels, final_state_dim=hidden_channels, - wr_dims=[hidden_channels, hidden_channels], # TODO: check dim + 
wr_dims=[hidden_channels, hidden_channels], # TODO: check dim wh_dims=[hidden_channels, hidden_channels], layerwise_graph=False, skip_connection=True, @@ -258,4 +263,4 @@ def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor: """ gcn_embedding = self.rgcn(x) gcn_embedding = gcn_embedding.view(gcn_embedding.shape[0], -1) # (B,M,N) -> (B,M*N) - return self.fc_representation(gcn_embedding) \ No newline at end of file + return self.fc_representation(gcn_embedding) diff --git a/lzero/model/muzero_model_gcn.py b/lzero/model/muzero_model_gcn.py index 6e2c808a6..08770a880 100644 --- a/lzero/model/muzero_model_gcn.py +++ b/lzero/model/muzero_model_gcn.py @@ -106,10 +106,10 @@ def __init__( self.res_connection_in_dynamics = res_connection_in_dynamics self.representation_network = RepresentationNetworkGCN( - robot_state_dim = robot_state_dim, - human_state_dim = human_state_dim, - robot_num = robot_num, - human_num = human_num, + robot_state_dim=robot_state_dim, + human_state_dim=human_state_dim, + robot_num=robot_num, + human_num=human_num, hidden_channels=self.latent_state_dim, layer_num=2, norm_type=norm_type @@ -454,4 +454,3 @@ def get_dynamic_mean(self) -> float: def get_reward_mean(self) -> float: return get_reward_mean(self) - diff --git a/lzero/model/muzero_model_md.py b/lzero/model/muzero_model_md.py index 187509ac8..cca5f863e 100644 --- a/lzero/model/muzero_model_md.py +++ b/lzero/model/muzero_model_md.py @@ -453,21 +453,20 @@ def get_reward_mean(self) -> float: return get_reward_mean(self) - class PredictionNetworkMD(nn.Module): def __init__( - self, - agent_num: int, - single_agent_action_size, - num_channels, - common_layer_num: int = 2, - fc_value_layers: SequenceType = [32], - output_support_size: int = 601, - last_linear_layer_init_zero: bool = True, - activation: Optional[nn.Module] = nn.ReLU(inplace=True), - norm_type: Optional[str] = 'BN', - output_separate_logit: bool = False, + self, + agent_num: int, + single_agent_action_size, + num_channels, + common_layer_num: int = 2, + fc_value_layers: SequenceType = [32], + output_support_size: int = 601, + last_linear_layer_init_zero: bool = True, + activation: Optional[nn.Module] = nn.ReLU(inplace=True), + norm_type: Optional[str] = 'BN', + output_separate_logit: bool = False, ): """ Overview: diff --git a/lzero/model/sampled_efficientzero_model_gcn.py b/lzero/model/sampled_efficientzero_model_gcn.py index 5747736eb..70fe8bf05 100644 --- a/lzero/model/sampled_efficientzero_model_gcn.py +++ b/lzero/model/sampled_efficientzero_model_gcn.py @@ -144,10 +144,10 @@ def __init__( self.res_connection_in_dynamics = res_connection_in_dynamics self.representation_network = RepresentationNetworkGCN( - robot_state_dim = robot_state_dim, - human_state_dim = human_state_dim, - robot_num = robot_num, - human_num = human_num, + robot_state_dim=robot_state_dim, + human_state_dim=human_state_dim, + robot_num=robot_num, + human_num=human_num, hidden_channels=self.latent_state_dim, norm_type=norm_type ) diff --git a/lzero/model/sampled_efficientzero_model_md.py b/lzero/model/sampled_efficientzero_model_md.py index 99092de4a..ec2673a3a 100644 --- a/lzero/model/sampled_efficientzero_model_md.py +++ b/lzero/model/sampled_efficientzero_model_md.py @@ -499,10 +499,10 @@ def __init__( ) else: self.policy_multi_head = MultiHead( - head_cls=DiscreteHead, - hidden_size=self.num_channels, - output_size_list=[single_agent_action_size for _ in range(agent_num)], - ) + head_cls=DiscreteHead, + hidden_size=self.num_channels, + 
output_size_list=[single_agent_action_size for _ in range(agent_num)], + ) self.output_separate_logit = output_separate_logit def forward(self, latent_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: diff --git a/lzero/model/tests/test_common_gcn.py b/lzero/model/tests/test_common_gcn.py index a7574b3b8..e8ec41d1c 100644 --- a/lzero/model/tests/test_common_gcn.py +++ b/lzero/model/tests/test_common_gcn.py @@ -5,6 +5,7 @@ # ... + class TestLightZeroEnvWrapper: # ... @@ -55,7 +56,7 @@ def test_representation_network_gcn_with_dict_obs(self): def test_representation_network_gcn_with_2d_array_obs(self): robot_state_dim = 10 - human_state_dim = 10 # 2d_array_obs, so the dimensions must be the same + human_state_dim = 10 # 2d_array_obs, so the dimensions must be the same robot_num = 3 human_num = 2 hidden_channels = 64 @@ -95,8 +96,9 @@ def test_representation_network_gcn_with_2d_array_obs(self): # Check intermediate type assert isinstance(representation_network.rgcn(x), torch.Tensor) + if __name__ == '__main__': test = TestLightZeroEnvWrapper() test.test_representation_network_gcn_with_dict_obs() test.test_representation_network_gcn_with_2d_array_obs() - print("All tests passed.") \ No newline at end of file + print("All tests passed.") diff --git a/lzero/model/tests/test_rgcn.py b/lzero/model/tests/test_rgcn.py index 32b90b6bb..836f36b2e 100644 --- a/lzero/model/tests/test_rgcn.py +++ b/lzero/model/tests/test_rgcn.py @@ -5,7 +5,9 @@ import unittest from lzero.model.common_gcn import RGCNLayer + class TestRGCNLayer(unittest.TestCase): + def setUp(self): self.robot_state_dim = 10 self.human_state_dim = 10 @@ -42,9 +44,13 @@ def test_similarity_function(self): if self.similarity_function == 'embedded_gaussian': X = torch.randn(self.batch_size, self.num_nodes * 2, 32) A = self.rgcn_layer.compute_similarity_matrix(X) - self.assertEqual(A.shape, (self.batch_size, self.num_nodes * 2, self.num_nodes * 2), "Similarity matrix shape is incorrect.") + self.assertEqual( + A.shape, (self.batch_size, self.num_nodes * 2, self.num_nodes * 2), + "Similarity matrix shape is incorrect." 
+ ) self.assertTrue(torch.all(A >= 0) and torch.all(A <= 1), "Similarity matrix values should be normalized.") + # Running the tests if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/lzero/policy/efficientzero.py b/lzero/policy/efficientzero.py index b9e844cbe..7c0e50c42 100644 --- a/lzero/policy/efficientzero.py +++ b/lzero/policy/efficientzero.py @@ -370,7 +370,9 @@ def _forward_learn(self, data: torch.Tensor) -> Dict[str, Union[float, int]]: target_normalized_visit_count_masked = torch.index_select( target_normalized_visit_count_init_step, 0, non_masked_indices ) - target_policy_entropy = -((target_normalized_visit_count_masked+1e-6) * (target_normalized_visit_count_masked+1e-6).log()).sum(-1).mean() + target_policy_entropy = -( + (target_normalized_visit_count_masked + 1e-6) * (target_normalized_visit_count_masked + 1e-6).log() + ).sum(-1).mean() else: # Set target_policy_entropy to log(|A|) if all rows are masked target_policy_entropy = torch.log(torch.tensor(target_normalized_visit_count_init_step.shape[-1])) @@ -437,7 +439,9 @@ def _forward_learn(self, data: torch.Tensor) -> Dict[str, Union[float, int]]: target_normalized_visit_count_masked = torch.index_select( target_normalized_visit_count, 0, non_masked_indices ) - target_policy_entropy += -((target_normalized_visit_count_masked+1e-6) * (target_normalized_visit_count_masked+1e-6).log()).sum(-1).mean() + target_policy_entropy += -( + (target_normalized_visit_count_masked + 1e-6) * (target_normalized_visit_count_masked + 1e-6).log() + ).sum(-1).mean() else: # Set target_policy_entropy to log(|A|) if all rows are masked target_policy_entropy += torch.log(torch.tensor(target_normalized_visit_count.shape[-1])) @@ -578,8 +582,7 @@ def _forward_collect( pred_values = self.inverse_scalar_transform_handle(pred_values).detach().cpu().numpy() latent_state_roots = latent_state_roots.detach().cpu().numpy() reward_hidden_state_roots = ( - reward_hidden_state_roots[0].detach().cpu().numpy(), - reward_hidden_state_roots[1].detach().cpu().numpy() + reward_hidden_state_roots[0].detach().cpu().numpy(), reward_hidden_state_roots[1].detach().cpu().numpy() ) policy_logits = policy_logits.detach().cpu().numpy().tolist() @@ -649,7 +652,13 @@ def _init_eval(self) -> None: else: self._mcts_eval = MCTSPtree(self._cfg) - def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: -1, ready_env_id: np.array = None,): + def _forward_eval( + self, + data: torch.Tensor, + action_mask: list, + to_play: -1, + ready_env_id: np.array = None, + ): """ Overview: The forward function for evaluating the current policy in eval mode. Use model to execute MCTS search. diff --git a/lzero/policy/muzero.py b/lzero/policy/muzero.py index d4e49c30f..d5597db57 100644 --- a/lzero/policy/muzero.py +++ b/lzero/policy/muzero.py @@ -445,9 +445,9 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in # ============================================================== # weighted loss with masks (some invalid states which are out of trajectory.) 
loss = ( - self._cfg.ssl_loss_weight * consistency_loss + self._cfg.policy_loss_weight * policy_loss + - self._cfg.value_loss_weight * value_loss + self._cfg.reward_loss_weight * reward_loss + - self._cfg.policy_entropy_loss_weight * policy_entropy_loss + self._cfg.ssl_loss_weight * consistency_loss + self._cfg.policy_loss_weight * policy_loss + + self._cfg.value_loss_weight * value_loss + self._cfg.reward_loss_weight * reward_loss + + self._cfg.policy_entropy_loss_weight * policy_entropy_loss ) weighted_total_loss = (weights * loss).mean() @@ -457,8 +457,9 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in weighted_total_loss.backward() if self._cfg.multi_gpu: self.sync_gradients(self._learn_model) - total_grad_norm_before_clip = torch.nn.utils.clip_grad_norm_(self._learn_model.parameters(), - self._cfg.grad_clip_value) + total_grad_norm_before_clip = torch.nn.utils.clip_grad_norm_( + self._learn_model.parameters(), self._cfg.grad_clip_value + ) self._optimizer.step() if self._cfg.lr_piecewise_constant_decay: self.lr_scheduler.step() @@ -481,7 +482,7 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in 'weighted_total_loss': weighted_total_loss.item(), 'total_loss': loss.mean().item(), 'policy_loss': policy_loss.mean().item(), - 'policy_entropy': - policy_entropy_loss.mean().item() / (self._cfg.num_unroll_steps + 1), + 'policy_entropy': -policy_entropy_loss.mean().item() / (self._cfg.num_unroll_steps + 1), 'reward_loss': reward_loss.mean().item(), 'value_loss': value_loss.mean().item(), 'consistency_loss': consistency_loss.mean().item() / self._cfg.num_unroll_steps, @@ -653,8 +654,13 @@ def _get_target_obs_index_in_step_k(self, step): end_index = self._cfg.model.observation_shape * (step + self._cfg.model.frame_stack_num) return beg_index, end_index - def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1, - ready_env_id: np.array = None, ) -> Dict: + def _forward_eval( + self, + data: torch.Tensor, + action_mask: list, + to_play: int = -1, + ready_env_id: np.array = None, + ) -> Dict: """ Overview: The forward function for evaluating the current policy in eval mode. Use model to execute MCTS search. diff --git a/lzero/policy/sampled_efficientzero.py b/lzero/policy/sampled_efficientzero.py index 687fe631b..761eaa96b 100644 --- a/lzero/policy/sampled_efficientzero.py +++ b/lzero/policy/sampled_efficientzero.py @@ -500,9 +500,9 @@ def _forward_learn(self, data: torch.Tensor) -> Dict[str, Union[float, int]]: # ============================================================== # weighted loss with masks (some invalid states which are out of trajectory.) 
loss = ( - self._cfg.ssl_loss_weight * consistency_loss + self._cfg.policy_loss_weight * policy_loss + - self._cfg.value_loss_weight * value_loss + self._cfg.reward_loss_weight * value_prefix_loss + - self._cfg.policy_entropy_loss_weight * policy_entropy_loss + self._cfg.ssl_loss_weight * consistency_loss + self._cfg.policy_loss_weight * policy_loss + + self._cfg.value_loss_weight * value_loss + self._cfg.reward_loss_weight * value_prefix_loss + + self._cfg.policy_entropy_loss_weight * policy_entropy_loss ) weighted_total_loss = (weights * loss).mean() @@ -554,33 +554,37 @@ def _forward_learn(self, data: torch.Tensor) -> Dict[str, Union[float, int]]: } if self._cfg.model.continuous_action_space: - return_data.update({ - # ============================================================== - # sampled related core code - # ============================================================== - 'policy_mu_max': mu[:, 0].max().item(), - 'policy_mu_min': mu[:, 0].min().item(), - 'policy_mu_mean': mu[:, 0].mean().item(), - 'policy_sigma_max': sigma.max().item(), - 'policy_sigma_min': sigma.min().item(), - 'policy_sigma_mean': sigma.mean().item(), - # take the fist dim in action space - 'target_sampled_actions_max': target_sampled_actions[:, :, 0].max().item(), - 'target_sampled_actions_min': target_sampled_actions[:, :, 0].min().item(), - 'target_sampled_actions_mean': target_sampled_actions[:, :, 0].mean().item(), - 'total_grad_norm_before_clip': total_grad_norm_before_clip.item() - }) + return_data.update( + { + # ============================================================== + # sampled related core code + # ============================================================== + 'policy_mu_max': mu[:, 0].max().item(), + 'policy_mu_min': mu[:, 0].min().item(), + 'policy_mu_mean': mu[:, 0].mean().item(), + 'policy_sigma_max': sigma.max().item(), + 'policy_sigma_min': sigma.min().item(), + 'policy_sigma_mean': sigma.mean().item(), + # take the fist dim in action space + 'target_sampled_actions_max': target_sampled_actions[:, :, 0].max().item(), + 'target_sampled_actions_min': target_sampled_actions[:, :, 0].min().item(), + 'target_sampled_actions_mean': target_sampled_actions[:, :, 0].mean().item(), + 'total_grad_norm_before_clip': total_grad_norm_before_clip.item() + } + ) else: - return_data.update({ - # ============================================================== - # sampled related core code - # ============================================================== - # take the fist dim in action space - 'target_sampled_actions_max': target_sampled_actions[:, :].float().max().item(), - 'target_sampled_actions_min': target_sampled_actions[:, :].float().min().item(), - 'target_sampled_actions_mean': target_sampled_actions[:, :].float().mean().item(), - 'total_grad_norm_before_clip': total_grad_norm_before_clip.item() - }) + return_data.update( + { + # ============================================================== + # sampled related core code + # ============================================================== + # take the fist dim in action space + 'target_sampled_actions_max': target_sampled_actions[:, :].float().max().item(), + 'target_sampled_actions_min': target_sampled_actions[:, :].float().min().item(), + 'target_sampled_actions_mean': target_sampled_actions[:, :].float().mean().item(), + 'total_grad_norm_before_clip': total_grad_norm_before_clip.item() + } + ) return return_data @@ -681,9 +685,9 @@ def _calculate_policy_loss_cont( if self._cfg.policy_loss_type == 'KL': # KL divergence loss: sum( p* log(p/q) ) = 
sum( p*log(p) - p*log(q) )= sum( p*log(p)) - sum( p*log(q) ) policy_loss += ( - torch.exp(target_log_prob_sampled_actions.detach()) * - (target_log_prob_sampled_actions.detach() - log_prob_sampled_actions) - ).sum(-1) * mask_batch[:, unroll_step] + torch.exp(target_log_prob_sampled_actions.detach()) * + (target_log_prob_sampled_actions.detach() - log_prob_sampled_actions) + ).sum(-1) * mask_batch[:, unroll_step] elif self._cfg.policy_loss_type == 'cross_entropy': # cross_entropy loss: - sum(p * log (q) ) policy_loss += -torch.sum( @@ -724,8 +728,9 @@ def _calculate_policy_loss_disc( torch.nonzero(mask_batch[:, unroll_step]).squeeze(-1) ) - target_policy_entropy = -((target_normalized_visit_count_masked + 1e-6) * ( - target_normalized_visit_count_masked + 1e-6).log()).sum(-1).mean() + target_policy_entropy = -( + (target_normalized_visit_count_masked + 1e-6) * (target_normalized_visit_count_masked + 1e-6).log() + ).sum(-1).mean() # shape: (batch_size, num_unroll_steps, num_of_sampled_actions, action_dim) -> (batch_size, # num_of_sampled_actions, action_dim) e.g. (4, 6, 20, 2) -> (4, 20, 2) @@ -769,9 +774,9 @@ def _calculate_policy_loss_disc( if self._cfg.policy_loss_type == 'KL': # KL divergence loss: sum( p* log(p/q) ) = sum( p*log(p) - p*log(q) )= sum( p*log(p)) - sum( p*log(q) ) policy_loss += ( - torch.exp(target_log_prob_sampled_actions.detach()) * - (target_log_prob_sampled_actions.detach() - log_prob_sampled_actions) - ).sum(-1) * mask_batch[:, unroll_step] + torch.exp(target_log_prob_sampled_actions.detach()) * + (target_log_prob_sampled_actions.detach() - log_prob_sampled_actions) + ).sum(-1) * mask_batch[:, unroll_step] elif self._cfg.policy_loss_type == 'cross_entropy': # cross_entropy loss: - sum(p * log (q) ) policy_loss += -torch.sum( @@ -793,8 +798,13 @@ def _init_collect(self) -> None: self._collect_mcts_temperature = 1 def _forward_collect( - self, data: torch.Tensor, action_mask: list = None, temperature: np.ndarray = 1, to_play=-1, - epsilon: float = 0.25, ready_env_id: np.array = None, + self, + data: torch.Tensor, + action_mask: list = None, + temperature: np.ndarray = 1, + to_play=-1, + epsilon: float = 0.25, + ready_env_id: np.array = None, ): """ Overview: @@ -832,8 +842,7 @@ def _forward_collect( pred_values = self.inverse_scalar_transform_handle(pred_values).detach().cpu().numpy() latent_state_roots = latent_state_roots.detach().cpu().numpy() reward_hidden_state_roots = ( - reward_hidden_state_roots[0].detach().cpu().numpy(), - reward_hidden_state_roots[1].detach().cpu().numpy() + reward_hidden_state_roots[0].detach().cpu().numpy(), reward_hidden_state_roots[1].detach().cpu().numpy() ) policy_logits = policy_logits.detach().cpu().numpy().tolist() @@ -933,7 +942,13 @@ def _init_eval(self) -> None: else: self._mcts_eval = MCTSPtree(self._cfg) - def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: -1, ready_env_id: np.array = None,): + def _forward_eval( + self, + data: torch.Tensor, + action_mask: list, + to_play: -1, + ready_env_id: np.array = None, + ): """ Overview: The forward function for evaluating the current policy in eval mode. Use model to execute MCTS search. diff --git a/lzero/worker/muzero_collector.py b/lzero/worker/muzero_collector.py index 3c1a630dd..272c2cec5 100644 --- a/lzero/worker/muzero_collector.py +++ b/lzero/worker/muzero_collector.py @@ -210,24 +210,23 @@ def _compute_priorities(self, i: int, pred_values_lst: List[float], search_value if self.policy_config.use_priority: # Calculate priorities. 
The priorities are the L1 losses between the predicted # values and the search values. We use 'none' as the reduction parameter, which - # means the loss is calculated for each element individually, instead of being summed or averaged. + # means the loss is calculated for each element individually, instead of being summed or averaged. # A small constant (1e-6) is added to the results to avoid zero priorities. This # is done because zero priorities could potentially cause issues in some scenarios. pred_values = torch.from_numpy(np.array(pred_values_lst[i])).to(self.policy_config.device).float().view(-1) search_values = torch.from_numpy(np.array(search_values_lst[i])).to(self.policy_config.device ).float().view(-1) - priorities = L1Loss(reduction='none' - )(pred_values, - search_values).detach().cpu().numpy() + 1e-6 + priorities = L1Loss(reduction='none')(pred_values, search_values).detach().cpu().numpy() + 1e-6 else: # priorities is None -> use the max priority for all newly collected data priorities = None return priorities - def pad_and_save_last_trajectory(self, i: int, last_game_segments: List[GameSegment], - last_game_priorities: List[np.ndarray], - game_segments: List[GameSegment], done: np.ndarray) -> None: + def pad_and_save_last_trajectory( + self, i: int, last_game_segments: List[GameSegment], last_game_priorities: List[np.ndarray], + game_segments: List[GameSegment], done: np.ndarray + ) -> None: """ Overview: Save the game segment to the pool if the current game is finished, padding it if necessary. @@ -270,12 +269,18 @@ def pad_and_save_last_trajectory(self, i: int, last_game_segments: List[GameSegm # pad over and save if self.policy_config.gumbel_algo: - last_game_segments[i].pad_over(pad_obs_lst, pad_reward_lst, pad_root_values_lst, pad_child_visits_lst, - next_segment_improved_policy=pad_improved_policy_prob) + last_game_segments[i].pad_over( + pad_obs_lst, + pad_reward_lst, + pad_root_values_lst, + pad_child_visits_lst, + next_segment_improved_policy=pad_improved_policy_prob + ) else: if self.policy_config.use_ture_chance_label_in_chance_encoder: - last_game_segments[i].pad_over(pad_obs_lst, pad_reward_lst, pad_root_values_lst, pad_child_visits_lst, - next_chances=chance_lst) + last_game_segments[i].pad_over( + pad_obs_lst, pad_reward_lst, pad_root_values_lst, pad_child_visits_lst, next_chances=chance_lst + ) else: last_game_segments[i].pad_over(pad_obs_lst, pad_reward_lst, pad_root_values_lst, pad_child_visits_lst) """ @@ -437,10 +442,7 @@ def collect(self, if self.policy_config.gumbel_algo: improved_policy_dict_no_env_id = {k: v['improved_policy_probs'] for k, v in policy_output.items()} - completed_value_no_env_id = { - k: v['roots_completed_value'] - for k, v in policy_output.items() - } + completed_value_no_env_id = {k: v['roots_completed_value'] for k, v in policy_output.items()} # TODO(pu): subprocess actions = {} distributions_dict = {} @@ -488,8 +490,11 @@ def collect(self, distributions_dict[env_id], value_dict[env_id], root_sampled_actions_dict[env_id] ) elif self.policy_config.gumbel_algo: - game_segments[env_id].store_search_stats(distributions_dict[env_id], value_dict[env_id], - improved_policy=improved_policy_dict[env_id]) + game_segments[env_id].store_search_stats( + distributions_dict[env_id], + value_dict[env_id], + improved_policy=improved_policy_dict[env_id] + ) else: game_segments[env_id].store_search_stats(distributions_dict[env_id], value_dict[env_id]) # append a transition tuple, including a_t, o_{t+1}, r_{t}, action_mask_{t}, to_play_{t} @@ -571,30 
+576,28 @@ def collect(self, self._env_info[env_id]['time'] += self._timer.value + interaction_duration if timestep.done: reward = timestep.info['eval_episode_return'] + info = { + 'reward': reward, + 'time': self._env_info[env_id]['time'], + 'step': self._env_info[env_id]['step'], + 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], + } if timestep.info.get('performance_info') is not None: + # this branch is for the performance evaluation of crowdsim env mean_aoi = timestep.info['performance_info']['mean_aoi'] mean_transmit_data = timestep.info['performance_info']['mean_transmit_data'] mean_energy_consumption = timestep.info['performance_info']['mean_energy_consumption'] transmitted_data_ratio = timestep.info['performance_info']['transmitted_data_ratio'] human_coverage = timestep.info['performance_info']['human_coverage'] - info = { - 'reward': reward, - 'time': self._env_info[env_id]['time'], - 'step': self._env_info[env_id]['step'], - 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], - 'mean_aoi': mean_aoi, - 'mean_transmit_data': mean_transmit_data, - 'mean_energy_consumption': mean_energy_consumption, - 'transmitted_data_ratio': transmitted_data_ratio, - 'human_coverage': human_coverage, - } - else: - info = { - 'reward': reward, - 'time': self._env_info[env_id]['time'], - 'step': self._env_info[env_id]['step'], - 'visit_entropy': visit_entropies_lst[env_id] / eps_steps_lst[env_id], - } + info.update( + { + 'mean_aoi': mean_aoi, + 'mean_transmit_data': mean_transmit_data, + 'mean_energy_consumption': mean_energy_consumption, + 'transmitted_data_ratio': transmitted_data_ratio, + 'human_coverage': human_coverage, + } + ) if self.policy_config.gumbel_algo: info['completed_value'] = completed_value_lst[env_id] / eps_steps_lst[env_id] collected_episode += 1 @@ -729,51 +732,39 @@ def _output_log(self, train_iter: int) -> None: if self.policy_config.gumbel_algo: completed_value = [d['completed_value'] for d in self._episode_info] self._total_duration += duration + info = { + 'episode_count': episode_count, + 'envstep_count': envstep_count, + 'avg_envstep_per_episode': envstep_count / episode_count, + 'avg_envstep_per_sec': envstep_count / duration, + 'avg_episode_per_sec': episode_count / duration, + 'collect_time': duration, + 'reward_mean': np.mean(episode_reward), + 'reward_std': np.std(episode_reward), + 'reward_max': np.max(episode_reward), + 'reward_min': np.min(episode_reward), + 'total_envstep_count': self._total_envstep_count, + 'total_episode_count': self._total_episode_count, + 'total_duration': self._total_duration, + 'visit_entropy': np.mean(visit_entropy), + # 'each_reward': episode_reward, + } if self._episode_info[0].get('mean_aoi') is not None: + # this branch is for the performance evaluation of crowdsim env episode_aoi = [d['mean_aoi'] for d in self._episode_info] episode_energy_consumption = [d['mean_energy_consumption'] for d in self._episode_info] episode_transmitted_data_ratio = [d['transmitted_data_ratio'] for d in self._episode_info] episode_human_coverage = [d['human_coverage'] for d in self._episode_info] mean_transmit_data = [d['mean_transmit_data'] for d in self._episode_info] - info = { - 'episode_count': episode_count, - 'envstep_count': envstep_count, - 'avg_envstep_per_episode': envstep_count / episode_count, - 'avg_envstep_per_sec': envstep_count / duration, - 'avg_episode_per_sec': episode_count / duration, - 'collect_time': duration, - 'reward_mean': np.mean(episode_reward), - 'reward_std': np.std(episode_reward), - 
'reward_max': np.max(episode_reward), - 'reward_min': np.min(episode_reward), - 'total_envstep_count': self._total_envstep_count, - 'total_episode_count': self._total_episode_count, - 'total_duration': self._total_duration, - 'visit_entropy': np.mean(visit_entropy), - 'episode_mean_aoi': np.mean(episode_aoi), - 'episode_mean_transmit_data': np.mean(mean_transmit_data), - 'episode_mean_energy_consumption': np.mean(episode_energy_consumption), - 'episode_mean_transmitted_data_ratio': np.mean(episode_transmitted_data_ratio), - 'episode_mean_human_coverage': np.mean(episode_human_coverage), - } - else: - info = { - 'episode_count': episode_count, - 'envstep_count': envstep_count, - 'avg_envstep_per_episode': envstep_count / episode_count, - 'avg_envstep_per_sec': envstep_count / duration, - 'avg_episode_per_sec': episode_count / duration, - 'collect_time': duration, - 'reward_mean': np.mean(episode_reward), - 'reward_std': np.std(episode_reward), - 'reward_max': np.max(episode_reward), - 'reward_min': np.min(episode_reward), - 'total_envstep_count': self._total_envstep_count, - 'total_episode_count': self._total_episode_count, - 'total_duration': self._total_duration, - 'visit_entropy': np.mean(visit_entropy), - # 'each_reward': episode_reward, - } + info.update( + { + 'episode_mean_aoi': np.mean(episode_aoi), + 'episode_mean_transmit_data': np.mean(mean_transmit_data), + 'episode_mean_energy_consumption': np.mean(episode_energy_consumption), + 'episode_mean_transmitted_data_ratio': np.mean(episode_transmitted_data_ratio), + 'episode_mean_human_coverage': np.mean(episode_human_coverage), + } + ) if self.policy_config.gumbel_algo: info['completed_value'] = np.mean(completed_value) self._episode_info.clear() diff --git a/lzero/worker/muzero_evaluator.py b/lzero/worker/muzero_evaluator.py index d96cf90f7..983191f0c 100644 --- a/lzero/worker/muzero_evaluator.py +++ b/lzero/worker/muzero_evaluator.py @@ -235,8 +235,9 @@ def eval( time.sleep(retry_waiting_time) self._logger.info('=' * 10 + 'Wait for all environments (subprocess) to finish resetting.' + '=' * 10) self._logger.info( - 'After sleeping {}s, the current _env_states is {}'.format(retry_waiting_time, - self._env._env_states) + 'After sleeping {}s, the current _env_states is {}'.format( + retry_waiting_time, self._env._env_states + ) ) init_obs = self._env.ready_obs @@ -342,8 +343,9 @@ def eval( # Env reset is done by env_manager automatically. self._policy.reset([env_id]) reward = t.info['eval_episode_return'] - saved_info = {'eval_episode_return': t.info['eval_episode_return']} + saved_info = {'eval_episode_return': t.info['eval_episode_return']} if 'performance_info' in t.info: + # this branch is for crowdsim env saved_info.update(t.info['performance_info']) if 'episode_info' in t.info: saved_info.update(t.info['episode_info']) @@ -369,7 +371,8 @@ def eval( ) time.sleep(retry_waiting_time) self._logger.info( - '=' * 10 + 'Wait for all environments (subprocess) to finish resetting.' + '=' * 10 + '=' * 10 + 'Wait for all environments (subprocess) to finish resetting.' + + '=' * 10 ) self._logger.info( 'After sleeping {}s, the current _env_states is {}'.format( @@ -442,9 +445,8 @@ def eval( stop_flag = episode_return >= self._stop_value and train_iter > 0 if stop_flag: self._logger.info( - "[LightZero serial pipeline] " + - "Current episode_return: {} is greater than stop_value: {}".format(episode_return, - self._stop_value) + + "[LightZero serial pipeline] " + "Current episode_return: {} is greater than stop_value: {}". 
+ format(episode_return, self._stop_value) + ", so your MCTS/RL agent is converged, you can refer to 'log/evaluator/evaluator_logger.txt' for details." ) diff --git a/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py index 0252c64ee..fc81f2c46 100644 --- a/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py +++ b/zoo/CrowdSim/config/CrowdSim_efficientzero_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '2' # ============================================================== # begin of the most frequently changed config specified by the user @@ -14,9 +13,10 @@ max_env_step = int(3e5) reanalyze_ratio = 0. robot_num = 2 -human_num = 10 # purdue -# human_num = 33 # NCSU -# human_num = 92 # KAIST +# different human_num for different datasets +human_num = 10 # purdue dataset +# human_num = 33 # NCSU dataset +# human_num = 92 # KAIST dataset one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] # ============================================================== # end of the most frequently changed config specified by the user @@ -27,9 +27,9 @@ f'result/crowd_num_human/CrowdSim_efficientzero_step{max_env_step}_uav{robot_num}_human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', env=dict( env_name='CrowdSim-v0', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -39,14 +39,14 @@ ), policy=dict( model=dict( - observation_shape=(robot_num+human_num)*4, - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='mlp', + observation_shape=(robot_num + human_num) * 4, + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='mlp', lstm_hidden_size=256, latent_state_dim=256, discrete_action_encoding_type='one_hot', # res_connection_in_dynamics=True, - norm_type='BN', + norm_type='BN', ), cuda=True, env_type='not_board_games', @@ -88,16 +88,5 @@ create_config = CrowdSim_efficientzero_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. 
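# A minimal, self-contained sketch of the per-transition priority computation that the
# muzero_collector changes earlier in this patch preserve: element-wise L1 distance between
# predicted and searched values, plus a small constant so no priority is exactly zero. The value
# arrays below are placeholders; only the L1Loss(reduction='none') call and the 1e-6 offset
# mirror the collector code.
import numpy as np
import torch
from torch.nn import L1Loss

pred_values = torch.from_numpy(np.array([0.10, 0.50, 0.30])).float().view(-1)    # stand-in for pred_values_lst[i]
search_values = torch.from_numpy(np.array([0.20, 0.40, 0.30])).float().view(-1)  # stand-in for search_values_lst[i]

# 'none' keeps one loss value per transition instead of reducing them to a scalar.
priorities = L1Loss(reduction='none')(pred_values, search_values).detach().cpu().numpy() + 1e-6
print(priorities)  # one strictly positive priority per transition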
- """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/CrowdSim_muzero_config.py b/zoo/CrowdSim/config/CrowdSim_muzero_config.py index 8def49e3e..3705d2850 100644 --- a/zoo/CrowdSim/config/CrowdSim_muzero_config.py +++ b/zoo/CrowdSim/config/CrowdSim_muzero_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '2' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -26,9 +25,9 @@ f'result/crowd_num_human/CrowdSim_muzero_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0', env=dict( env_name='CrowdSim-v0', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -38,9 +37,9 @@ ), policy=dict( model=dict( - observation_shape=(robot_num+human_num)*4, - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='mlp', + observation_shape=(robot_num + human_num) * 4, + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='mlp', lstm_hidden_size=256, latent_state_dim=256, self_supervised_learning_loss=True, # NOTE: default is False. @@ -90,16 +89,5 @@ create_config = CrowdSim_muzero_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. - """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py index ff2a0913c..85a799a83 100644 --- a/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_efficientzero_md_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '1' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -13,9 +12,10 @@ max_env_step = int(3e5) reanalyze_ratio = 0. 
robot_num = 2 -human_num = 59 # purdue -# human_num = 33 # NCSU -# human_num = 92 # KAIST +# different human_num for different datasets +human_num = 59 # purdue dataset +# human_num = 33 # NCSU dataset +# human_num = 92 # KAIST dataset one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] transmit_v = 20 # ============================================================== @@ -30,10 +30,10 @@ transmit_v=transmit_v, obs_mode='1-dim-array', env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + dataset='purdue', + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -43,17 +43,17 @@ ), policy=dict( model=dict( - agent_num = robot_num, - observation_shape=(robot_num + human_num)*4, + agent_num=robot_num, + observation_shape=(robot_num + human_num) * 4, obs_mode='1-dim-array', - robot_state_dim = 4, - human_state_dim = 4, - robot_num = robot_num, - human_num = human_num, + robot_state_dim=4, + human_state_dim=4, + robot_num=robot_num, + human_num=human_num, single_agent_action_size=len(one_uav_action_space), - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='mlp_md', - output_separate_logit=False, # not output separate logit for each action. + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. lstm_hidden_size=128, latent_state_dim=128, discrete_action_encoding_type='one_hot', @@ -97,16 +97,5 @@ create_config = CrowdSim_efficientzero_md_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. - """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py index a79eb0bbe..28c40d9cd 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_md_config.py @@ -1,6 +1,6 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '1' +os.environ["CUDA_VISIBLE_DEVICES"] = '2' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -10,30 +10,31 @@ num_simulations = 25 update_per_collect = 100 batch_size = 256 -max_env_step = int(3e5) +max_env_step = int(5e5) reanalyze_ratio = 0. 
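# The flat Discrete(action_space_size) action used by these configs is decoded back into one
# (dx, dy) move per UAV by enumerating itertools.product over the per-UAV action list, mirroring
# the real_action_space construction in the CrowdSim env wrapper later in this patch. A minimal
# sketch (the sampled index is arbitrary):
from itertools import product

robot_num = 2
one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]]

real_action_space = list(product(one_uav_action_space, repeat=robot_num))  # 25 joint actions
flat_action = 7                                                            # an index from Discrete(25)
per_uav_moves = real_action_space[flat_action]
print(per_uav_moves)  # ([30, 0], [-30, 0]): UAV 0 moves +30 in x, UAV 1 moves -30 in x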
robot_num = 2 -human_num = 59 # purdue -# human_num = 33 # NCSU -# human_num = 92 # KAIST +# different human_num for different datasets +human_num = 59 # purdue dataset +# human_num = 33 # NCSU dataset +# human_num = 92 # KAIST dataset one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] -transmit_v = 20 +transmit_v = 120 # ============================================================== # end of the most frequently changed config specified by the user # ============================================================== CrowdSim_muzero_config = dict( exp_name= - f'result/new_env/new_CrowdSim_vt{transmit_v}_muzero_md_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + f'result/new_env2_hard/new_CrowdSim2_hard_womd_vc1_vt{transmit_v}_muzero_md_ssl_step{max_env_step}_uav{robot_num}_human{human_num}_ns{num_simulations}_upc{update_per_collect}_seed0', env=dict( - env_mode = 'hard', + env_mode='hard', transmit_v=transmit_v, obs_mode='1-dim-array', env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + dataset='purdue', + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -45,17 +46,17 @@ model=dict( # robot_observation_shape=(robot_num, 4), # human_observation_shape=(human_num, 4), - agent_num = robot_num, - observation_shape=(robot_num + human_num)*4, + agent_num=robot_num, + observation_shape=(robot_num + human_num) * 4, obs_mode='1-dim-array', - robot_state_dim = 4, - human_state_dim = 4, - robot_num = robot_num, - human_num = human_num, + robot_state_dim=4, + human_state_dim=4, + robot_num=robot_num, + human_num=human_num, single_agent_action_size=len(one_uav_action_space), - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='mlp_md', - output_separate_logit=False, # not output separate logit for each action. + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. lstm_hidden_size=256, latent_state_dim=256, self_supervised_learning_loss=True, # NOTE: default is False. @@ -105,16 +106,5 @@ create_config = CrowdSim_muzero_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. 
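# The muzero_collector changes earlier in this patch log extra CrowdSim metrics by building one
# base info dict and conditionally merging in the env-reported performance_info. The aggregation
# step in _output_log can be sketched in isolation as follows (the per-episode numbers are
# placeholders; the key names follow the collector code):
import numpy as np

episode_info = [  # stand-in for self._episode_info after two finished episodes
    {'reward': 1.2, 'mean_aoi': 12.4, 'mean_transmit_data': 0.9, 'mean_energy_consumption': 0.31,
     'transmitted_data_ratio': 0.62, 'human_coverage': 0.45},
    {'reward': 0.8, 'mean_aoi': 14.1, 'mean_transmit_data': 0.7, 'mean_energy_consumption': 0.28,
     'transmitted_data_ratio': 0.57, 'human_coverage': 0.41},
]

info = {'reward_mean': np.mean([d['reward'] for d in episode_info])}
if episode_info[0].get('mean_aoi') is not None:  # CrowdSim metrics only exist for this env
    info.update({
        'episode_mean_aoi': np.mean([d['mean_aoi'] for d in episode_info]),
        'episode_mean_transmit_data': np.mean([d['mean_transmit_data'] for d in episode_info]),
        'episode_mean_energy_consumption': np.mean([d['mean_energy_consumption'] for d in episode_info]),
        'episode_mean_transmitted_data_ratio': np.mean([d['transmitted_data_ratio'] for d in episode_info]),
        'episode_mean_human_coverage': np.mean([d['human_coverage'] for d in episode_info]),
    })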
- """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py index bb6b9ced3..0f3dcf147 100644 --- a/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py +++ b/zoo/CrowdSim/config/crowdsim_muzero_rgcn_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '2' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -13,24 +12,24 @@ max_env_step = int(3e5) reanalyze_ratio = 0. robot_num = 2 -human_num = 59 # purdue -# human_num = 33 # NCSU -# human_num = 92 # KAIST +# different human_num for different datasets +human_num = 59 # purdue dataset +# human_num = 33 # NCSU dataset +# human_num = 92 # KAIST dataset one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] # ============================================================== # end of the most frequently changed config specified by the user # ============================================================== CrowdSim_muzero_config = dict( - exp_name= - f'result/CrowdSim_muzerogcn_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', + exp_name=f'result/CrowdSim_muzerogcn_ssl_step{max_env_step}_uav{robot_num}__human{human_num}_seed0', env=dict( obs_mode='1-dim-array', env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + dataset='purdue', + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -42,14 +41,14 @@ model=dict( # robot_observation_shape=(robot_num, 4), # human_observation_shape=(human_num, 4), - observation_shape=(robot_num + human_num)*4, + observation_shape=(robot_num + human_num) * 4, obs_mode='1-dim-array', - robot_state_dim = 4, - human_state_dim = 4, - robot_num = robot_num, - human_num = human_num, - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='rgcn', + robot_state_dim=4, + human_state_dim=4, + robot_num=robot_num, + human_num=human_num, + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='rgcn', lstm_hidden_size=256, latent_state_dim=256, self_supervised_learning_loss=True, # NOTE: default is False. @@ -99,16 +98,5 @@ create_config = CrowdSim_muzero_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. 
- """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/config/crowdsim_sez_md_config.py b/zoo/CrowdSim/config/crowdsim_sez_md_config.py index ae2e5074c..a6ac190a8 100644 --- a/zoo/CrowdSim/config/crowdsim_sez_md_config.py +++ b/zoo/CrowdSim/config/crowdsim_sez_md_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict import os -os.environ["CUDA_VISIBLE_DEVICES"] = '0' # ============================================================== # begin of the most frequently changed config specified by the user # ============================================================== @@ -13,9 +12,10 @@ max_env_step = int(3e5) reanalyze_ratio = 0. robot_num = 2 -human_num = 59 # purdue -# human_num = 33 # NCSU -# human_num = 92 # KAIST +# different human_num for different datasets +human_num = 59 # purdue dataset +# human_num = 33 # NCSU dataset +# human_num = 92 # KAIST dataset one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] K = 10 transmit_v = 20 @@ -31,10 +31,10 @@ obs_mode='1-dim-array', transmit_v=transmit_v, env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = robot_num, - human_num = human_num, - one_uav_action_space = one_uav_action_space, + dataset='purdue', + robot_num=robot_num, + human_num=human_num, + one_uav_action_space=one_uav_action_space, continuous=False, manually_discretization=False, collector_env_num=collector_env_num, @@ -44,17 +44,17 @@ ), policy=dict( model=dict( - agent_num = robot_num, - observation_shape=(robot_num + human_num)*4, + agent_num=robot_num, + observation_shape=(robot_num + human_num) * 4, obs_mode='1-dim-array', - robot_state_dim = 4, - human_state_dim = 4, - robot_num = robot_num, - human_num = human_num, + robot_state_dim=4, + human_state_dim=4, + robot_num=robot_num, + human_num=human_num, single_agent_action_size=len(one_uav_action_space), - action_space_size=(len(one_uav_action_space))**robot_num, - model_type='mlp_md', - output_separate_logit=False, # not output separate logit for each action. + action_space_size=(len(one_uav_action_space)) ** robot_num, + model_type='mlp_md', + output_separate_logit=False, # not output separate logit for each action. continuous_action_space=False, num_of_sampled_actions=K, lstm_hidden_size=128, @@ -106,16 +106,5 @@ create_config = CrowdSim_sez_create_config if __name__ == "__main__": - # Users can use different train entry by specifying the entry_type. - entry_type = "train_muzero" # options={"train_muzero", "train_muzero_with_gym_env"} - - if entry_type == "train_muzero": - from lzero.entry import train_muzero - elif entry_type == "train_muzero_with_gym_env": - """ - The ``train_muzero_with_gym_env`` entry means that the environment used in the training process is generated by wrapping the original gym environment with LightZeroEnvWrapper. - Users can refer to lzero/envs/wrappers for more details. - """ - from lzero.entry import train_muzero_with_gym_env as train_muzero - + from lzero.entry import train_muzero train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step) diff --git a/zoo/CrowdSim/entry/eval_crowdsim.py b/zoo/CrowdSim/entry/eval_crowdsim.py index 13297c76f..46364bdc8 100644 --- a/zoo/CrowdSim/entry/eval_crowdsim.py +++ b/zoo/CrowdSim/entry/eval_crowdsim.py @@ -28,8 +28,8 @@ # model_path is the path to the trained MuZero model checkpoint. # If no path is provided, the script will use the default model. 
- model_path = '/home/nighoodRen/LightZero/result/old_env/CrowdSim_muzeromd_ssl_step300000_uav2__human59_seed0_240503_022923/ckpt/ckpt_best.pth.tar' - main_config.exp_name = '/home/nighoodRen/LightZero/result/old_env/CrowdSim_muzeromd_ssl_step300000_uav2__human59_seed0_240503_022923/' + 'eval' # original result folder/eval + model_path = 'xxx/ckpt_best.pth.tar' + main_config.exp_name = 'xxx' + 'eval' # original result folder/eval # seeds is a list of seed values for the random number generator, used to initialize the environment. seeds = [0] # num_episodes_each_seed is the number of episodes to run for each seed. @@ -49,8 +49,8 @@ # A boolean flag indicating whether to save the video of the environment. main_config.env.save_replay = True # The path where the recorded video will be saved. - main_config.env.replay_path = main_config.exp_name + '/video' # current result folder/eval - + main_config.env.replay_path = main_config.exp_name + '/video' # current result folder/eval + # The maximum number of steps for each episode during evaluation. This may need to be adjusted based on the specific characteristics of the environment. main_config.env.eval_max_episode_steps = int(20) @@ -77,7 +77,9 @@ # Printing the evaluation results. The average reward and the total reward for each seed are displayed, followed by the mean reward across all seeds. print("=" * 20) - print(f"We evaluated a total of {len(seeds)} seeds. For each seed, we evaluated {num_episodes_each_seed} episode(s).") + print( + f"We evaluated a total of {len(seeds)} seeds. For each seed, we evaluated {num_episodes_each_seed} episode(s)." + ) print(f"For seeds {seeds}, the mean returns are {returns_mean_seeds}, and the returns are {returns_seeds}.") print("Across all seeds, the mean reward is:", returns_mean_seeds.mean()) - print("=" * 20) \ No newline at end of file + print("=" * 20) diff --git a/zoo/CrowdSim/envs/CrowdSim_env.py b/zoo/CrowdSim/envs/CrowdSim_env.py index 21d31863e..6cc3a7209 100644 --- a/zoo/CrowdSim/envs/CrowdSim_env.py +++ b/zoo/CrowdSim/envs/CrowdSim_env.py @@ -23,22 +23,20 @@ def __init__(self, cfg: dict = {}) -> None: self._robot_num = self._cfg.robot_num self._human_num = self._cfg.human_num self._observation_space = gym.spaces.Box( - low=float("-inf"), - high=float("inf"), - shape=((self._robot_num+self._human_num)*4,), - dtype=np.float32) + low=float("-inf"), high=float("inf"), shape=((self._robot_num + self._human_num) * 4, ), dtype=np.float32 + ) # action space # one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] self.real_action_space = list(product(self._cfg.one_uav_action_space, repeat=self._robot_num)) one_uav_action_n = len(self._cfg.one_uav_action_space) - self._action_space = gym.spaces.Discrete(one_uav_action_n**self._robot_num) + self._action_space = gym.spaces.Discrete(one_uav_action_n ** self._robot_num) self._action_space.seed(0) # default seed self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, ), dtype=np.float32) self._continuous = False def reset(self) -> np.ndarray: if not self._init_flag: - self._env = gym.make('CrowdSim-v0', dataset = self._cfg.dataset, custom_config = self._cfg) + self._env = gym.make('CrowdSim-v0', dataset=self._cfg.dataset, custom_config=self._cfg) self._init_flag = True if hasattr(self, '_seed') and hasattr(self, '_dynamic_seed') and self._dynamic_seed: np_seed = 100 * np.random.randint(1, 1000) @@ -52,8 +50,8 @@ def reset(self) -> np.ndarray: raw_obs = self._env.reset() obs_list = raw_obs.to_array() # human_obs, robot_obs = obs_list - 
obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) - assert len(obs)==(self._robot_num+self._human_num)*4 + obs = np.concatenate(obs_list, axis=0).flatten() # for 1 dim e.g.(244,) + assert len(obs) == (self._robot_num + self._human_num) * 4 action_mask = np.ones(self.action_space.n, 'int8') obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1} @@ -76,8 +74,8 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep: assert isinstance(real_action, tuple) and len(real_action) == self._robot_num, "illegal action!" raw_obs, rew, done, info = self._env.step(real_action) obs_list = to_ndarray(raw_obs.to_tensor()) - obs = np.concatenate(obs_list,axis=0).flatten() # for 1 dim e.g.(244,) - assert len(obs)==(self._robot_num+self._human_num)*4 + obs = np.concatenate(obs_list, axis=0).flatten() # for 1 dim e.g.(244,) + assert len(obs) == (self._robot_num + self._human_num) * 4 self._eval_episode_return += rew if done: diff --git a/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py b/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py deleted file mode 100644 index 865ab56fb..000000000 --- a/zoo/CrowdSim/envs/Crowdsim/env/base_env_config.py +++ /dev/null @@ -1,137 +0,0 @@ -from easydict import EasyDict - -# define base config -base_config = EasyDict({ - "num_timestep": 120, # 120x15=1800s=30min - "step_time": 15, # second per step - "max_uav_energy": 359640, # 359640 J <-- 359.64 kJ (4500mah, 22.2v) 大疆经纬 - "rotation_limit": 360, - "diameter_of_human_blockers": 0.5, # m - "h_rx": 1.3, # m, height of RX - "h_b": 1.7, # m, height of a human blocker - "velocity": 18, - "frequence_band": 28, # GHz - "h_d": 120, # m, height of drone-BS - "alpha_nlos": 113.63, - "beta_nlos": 1.16, - "zeta_nlos": 2.58, # Frequency 28GHz, sub-urban. 
channel model - "alpha_los": 84.64, - "beta_los": 1.55, - "zeta_los": 0.12, - "g_tx": 0, # dB - "g_rx": 5, # dB - "tallest_locs": None, # obstacle - "no_fly_zone": None, # obstacle - "start_timestamp": 1519894800, - "end_timestamp": 1519896600, - "energy_factor": 3, # TODO: energy factor in reward function - "robot_num": 2, # TODO: 多了要用多进程 - "rollout_num": 1, # 1 2 6 12 15, calculated based on robot_num -}) - -# define all dataset configs -dataset_configs = { - 'purdue': EasyDict({ - "lower_left": [-86.93, 40.4203], # 经纬度 - "upper_right": [-86.9103, 40.4313], - "nlon": 200, - "nlat": 120, - "human_num": 59, - "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/purdue/59 users.csv', - "sensing_range": 23.2, # unit 23.2 - "one_uav_action_space": [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30], [21, 21], [21, -21], [-21, 21], [-21, -21]], - "max_x_distance": 1667, # m - "max_y_distance": 1222, # m - "density_of_human_blockers": 30000 / 1667 / 1222, # block/m2 - }), - 'ncsu': EasyDict({ - "lower_left": [-78.6988, 35.7651], # 经纬度 - "upper_right": [-78.6628, 35.7896], - "nlon": 3600, - "nlat": 2450, - "human_num": 33, - "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/NCSU/33 users.csv', - "sensing_range": 220, # unit 220 - "one_uav_action_space": [[0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210]], - "max_x_distance": 3255.4913305859623, - "max_y_distance": 2718.3945272795013, - "density_of_human_blockers": 30000 / 3255.4913305859623 / 2718.3945272795013, # block/m2 - }), - 'kaist': EasyDict({ - "lower_left": [127.3475, 36.3597], - "upper_right": [127.3709, 36.3793], - "nlon": 2340, - "nlat": 1960, - "human_num": 92, - "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/KAIST/92 users.csv', - "sensing_range": 220, # unit 220 - "one_uav_action_space": [[0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210]], - "max_x_distance": 2100.207579392558, - "max_y_distance": 2174.930950809533, - "density_of_human_blockers": 30000 / 2100.207579392558 / 2174.930950809533, # block/m2 - }), - # ... 
could add more dataset configs here -} - -# get config according to data set name -def get_selected_config(data_set_name): - if data_set_name in dataset_configs: - dataset_config = dataset_configs[data_set_name] - return EasyDict({**base_config, **dataset_config}) - else: - raise ValueError(f"Data set '{data_set_name}' not found.") - -# r:meters, 2d distance -# threshold: dB -def try_sensing_range(r, data_set_name): - import math - config = get_selected_config(data_set_name) - p_los = math.exp( - -config.density_of_human_blockers * config.diameter_of_human_blockers * r * (config.h_b - config.h_rx) / ( - config.h_d - config.h_rx)) - p_nlos = 1 - p_los - PL_los = config.alpha_los + config.beta_los * 10 * math.log10( - math.sqrt(r * r + config.h_d * config.h_d)) + config.zeta_los - PL_nlos = config.alpha_nlos + config.beta_nlos * 10 * math.log10( - math.sqrt(r * r + config.h_d * config.h_d)) + config.zeta_nlos - PL = p_los * PL_los + p_nlos * PL_nlos - CL = PL - config.g_tx - config.g_rx - print(p_los, p_nlos) - print(CL) - - -# Maximum Coupling Loss (110dB is recommended) -# purdue: - -# 123dB -> 560m -> 60.5 range -# 121dB -> 420m -> 45.4 range -# 119dB -> 300m -> 32.4 range -# 117dB -> 215m -> 23.2 range √ -# 115dB -> 140m -> 15 range - -# ncsu: -# 123dB -> 600m -> 600 range -# 121dB -> 435m -> 435 range -# 119dB -> 315m -> 315 range -# 117dB -> 220m -> 220 range √ -# 115dB -> 145m -> 145 range - -# kaist: -# 123dB -> 600m -> 600 range -# 121dB -> 435m -> 435 range -# 119dB -> 315m -> 315 range -# 117dB -> 220m -> 220 range √ -# 115dB -> 145m -> 145 range - -# san: -# 123dB -> 600m -> 600 range -# 121dB -> 450m -> 450 range -# 119dB -> 330m -> 330 range -# 117dB -> 240m -> 240 range √ -# 115dB -> 165m -> 165 range - -if __name__ == '__main__': - # example usage - data_set_name = 'purdue' - selected_config = get_selected_config(data_set_name) - print(selected_config) \ No newline at end of file diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py index 7ddf4550a..e31689866 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim.py @@ -10,15 +10,54 @@ from zoo.CrowdSim.envs.Crowdsim.env.model.utils import * from zoo.CrowdSim.envs.Crowdsim.env.model.mdp import HumanState, RobotState, JointState -from zoo.CrowdSim.envs.Crowdsim.env.base_env_config import get_selected_config - +from LightZero.zoo.CrowdSim.envs.Crowdsim.env.crowd_sim_base_config import get_selected_config class CrowdSim(gym.Env): + """ + Overview: + LightZero version of the CrowdSim environment. This class includes methods for resetting, closing, and \ + stepping through the environment, as well as seeding for reproducibility, saving replay videos, and generating \ + random actions. It also includes properties for accessing the observation space, action space, and reward space of the \ + environment. The environment is a grid world with humans and robots moving around. The robots are tasked with \ + minimizing the average age of information (AoI) of the humans by moving to their locations and collecting data from them. \ + The humans generate data at a constant rate, and the robots have a limited energy supply that is consumed by moving. \ + The environment is divided into two modes: 'easy' and 'hard'. In the 'easy' mode, the robots can only collect data from \ + humans when they are within a certain range, and the AoI of a human is reset to 0 when a robot collects data from them. 
\ + In the 'hard' mode, the robots can collect data from humans even when they are not within range, and the AoI of a human \ + is not reset when a robot collects data from them. The environment is initialized with a dataset of human locations and \ + timestamps, and the robots are tasked with collecting data from the humans to minimize the average AoI. The environment \ + is considered solved when the average AoI is minimized to a certain threshold or the time limit is reached. + Interface: + `__init__`, `reset`, `step`, `render`, `sync_human_df`, `generate_human`, `generate_robot`. + """ metadata = {'render.modes': ['human']} def __init__(self, dataset, custom_config=None): - # mcfg should include: + """ + Overview: + Initialize the environment with a dataset and a custom configuration. The dataset contains the locations and \ + timestamps of the humans, and the custom configuration contains the environment mode, number of humans, number \ + of robots, maximum timestep, step time, start timestamp, and maximum UAV energy. The environment is divided into \ + two modes: 'easy' and 'hard'. In the 'easy' mode, the robots can only collect data from humans when they are within \ + a certain range, and the AoI of a human is reset to 0 when a robot collects data from them. In the 'hard' mode, the \ + robots can collect data from humans even when they are not within range, and the AoI of a human is not reset when a \ + robot collects data from them. The environment is initialized with a dataset of human locations and timestamps, and \ + the robots are tasked with collecting data from the humans to minimize the average AoI. The environment is considered \ + solved when the average AoI is minimized to a certain threshold or the time limit is reached. + Args: + - dataset (:obj:`str`): The path to the dataset file. + - custom_config (:obj:`dict`): A dictionary containing the custom configuration for the environment. \ + The custom configuration should include the following keys: + - env_mode (:obj:`str`): The environment mode ('easy' or 'hard'). + - human_num (:obj:`int`): The number of humans in the environment. + - robot_num (:obj:`int`): The number of robots in the environment. + - num_timestep (:obj:`int`): The maximum timestep for the environment. + - step_time (:obj:`float`): The time per step in seconds. + - start_timestamp (:obj:`int`): The start timestamp for the environment. + - max_uav_energy (:obj:`float`): The maximum energy for the UAVs. 
+ """ + # mcfg should include: self.time_limit = None self.robots = None self.humans = None @@ -29,10 +68,10 @@ def __init__(self, dataset, custom_config=None): self.config = get_selected_config(dataset) self.config.update(custom_config) - self.env_mode = self.config.env_mode # 'easy' or 'hard' + self.env_mode = self.config.env_mode # 'easy' or 'hard' self.human_num = self.config.human_num self.robot_num = self.config.robot_num - self.num_timestep = self.config.num_timestep # max timestep + self.num_timestep = self.config.num_timestep # max timestep self.step_time = self.config.step_time # second per step self.start_timestamp = self.config.start_timestamp # fit timpestamp to datetime self.max_uav_energy = self.config.max_uav_energy @@ -40,8 +79,9 @@ def __init__(self, dataset, custom_config=None): self.action_space = gym.spaces.Discrete(len(self.config.one_uav_action_space)) # human obs: [px, py, remaining_data_amount, aoi] # robot obs: [px, py, theta, energy] - # self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(4), dtype=np.float32) - self.observation_space = gym.spaces.Box(low=float("-inf"), high=float("inf"), shape=(self.robot_num+self.human_num, 4), dtype=np.float32) + self.observation_space = gym.spaces.Box( + low=float("-inf"), high=float("inf"), shape=(self.robot_num + self.human_num, 4), dtype=np.float32 + ) # load_dataset self.transmit_v = self.config.transmit_v # 5*0.3Mb/s @@ -51,48 +91,63 @@ def __init__(self, dataset, custom_config=None): self.upper_right = self.config.upper_right self.human_df = pd.read_csv(self.config.dataset_dir) logging.info("Finished reading {} rows".format(len(self.human_df))) - # # for temporarily processing data - # sample_list=np.random.choice(self.human_num, size=[50,], replace=False) - # sample_list=sample_list[np.argsort(sample_list)] - # print(sample_list) - # self.human_df= self.human_df[self.human_df["id"].isin(sample_list)] - # for i,human_id in enumerate(sample_list): - # mask=(self.human_df["id"]==human_id) - # self.human_df.loc[mask,"id"]=i - # self.human_df=self.human_df.sort_values(by=["id","timestamp"],ascending=[True,True]) - # print(self.human_df.head()) - # self.human_df.to_csv("50 users-5.csv",index=False) - # exit(0) self.human_df['t'] = pd.to_datetime(self.human_df['timestamp'], unit='s') # 's' stands for second self.human_df['aoi'] = -1 # 加入aoi记录aoi - self.human_df['data_amount'] = -1 # record the remaining data amount of each human + self.human_df['data_amount'] = -1 # record the remaining data amount of each human self.human_df['energy'] = -1 # 加入energy记录energy logging.info('Env mode: {}'.format(self.env_mode)) logging.info('human number: {}'.format(self.human_num)) logging.info('Robot number: {}'.format(self.robot_num)) # for debug - self.current_human_aoi_list = np.zeros([self.human_num, ]) - self.mean_aoi_timelist = np.zeros([self.config.num_timestep + 1, ]) - self.cur_data_amount_timelist = np.zeros([self.human_num, ]) + self.current_human_aoi_list = np.zeros([ + self.human_num, + ]) + self.mean_aoi_timelist = np.zeros([ + self.config.num_timestep + 1, + ]) + self.cur_data_amount_timelist = np.zeros([ + self.human_num, + ]) self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_x_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) - self.update_human_timelist = np.zeros([self.config.num_timestep, ]) + self.update_human_timelist = np.zeros([ + 
self.config.num_timestep, + ]) self.data_transmission = 0 - def set_agent(self, agent): - self.agent = agent - def generate_human(self, human_id, selected_data, selected_next_data): + """ + Overview: + Generate a human with the given id, selected data, and selected next data. The human is initialized with \ + the given data and next data, and the remaining data amount is set to 0. The human is also initialized with \ + an AoI of 0. + Argments: + - human_id (:obj:`int`): The id of the human. + - selected_data (:obj:`pd.DataFrame`): The selected data for the current timestep. + - selected_next_data (:obj:`pd.DataFrame`): The selected data for the next timestep. + Returns: + - human (:obj:`Human`): The generated human. + """ human = Human(human_id, self.config) - px, py, theta = get_human_position_from_list(self.current_timestep, human_id, selected_data, selected_next_data, self.config) + px, py, theta = get_human_position_from_list( + self.current_timestep, human_id, selected_data, selected_next_data, self.config + ) # human obs: [px, py, data_amount, aoi] human.set(px, py, theta, 0, 0) # initial aoi of human is 0 return human def generate_robot(self, robot_id): + """ + Overview: + Generate a robot with the given id. The robot is initialized with the given id and the maximum UAV energy. + Argments: + - robot_id (:obj:`int`): The id of the robot. + Returns: + - robot (:obj:`Robot`): The generated robot. + """ robot = Robot(robot_id, self.config) # robot obs: [px, py, theta, energy] robot.set(self.nlon / 2, self.nlat / 2, 0, self.max_uav_energy) # robot有energy @@ -108,13 +163,26 @@ def sync_human_df(self, human_id, current_timestep, aoi, data_amount): - aoi (:obj:`int`): The aoi of the human. """ current_timestamp = self.start_timestamp + current_timestep * self.step_time - current_index = self.human_df[ - (self.human_df.id == human_id) & (self.human_df.timestamp == current_timestamp)].index + current_index = self.human_df[(self.human_df.id == human_id) + & (self.human_df.timestamp == current_timestamp)].index # self.human_df.loc[current_index, "aoi"] = aoi # slower - self.human_df.iat[current_index.values[0], 9] = aoi # faster + self.human_df.iat[current_index.values[0], 9] = aoi # faster self.human_df.iat[current_index.values[0], 10] = data_amount def reset(self, phase='test', test_case=None): + """ + Overview: + Reset the environment to the initial state. The environment is reset to the start timestamp, and the humans \ + and robots are generated with the given data. The humans are initialized with the selected data and next data, \ + and the robots are initialized with the given id. The environment is also initialized with the current timestep, \ + mean AoI, robot energy, robot x, robot y, and update human timelist. The environment is considered solved when \ + the average AoI is minimized to a certain threshold or the time limit is reached. + Argments: + - phase (:obj:`str`): The phase of the environment ('train' or 'test'). + - test_case (:obj:`int`): The test case for the environment. + Returns: + - state (:obj:`JointState`): The initial state of the environment. 
+ """ self.current_timestep = 0 # generate human @@ -132,9 +200,15 @@ def reset(self, phase='test', test_case=None): for robot_id in range(self.robot_num): self.robots.append(self.generate_robot(robot_id)) - self.cur_data_amount_timelist = np.zeros([self.human_num, ]) - self.current_human_aoi_list = np.zeros([self.human_num, ]) - self.mean_aoi_timelist = np.zeros([self.config.num_timestep + 1, ]) + self.cur_data_amount_timelist = np.zeros([ + self.human_num, + ]) + self.current_human_aoi_list = np.zeros([ + self.human_num, + ]) + self.mean_aoi_timelist = np.zeros([ + self.config.num_timestep + 1, + ]) self.mean_aoi_timelist[self.current_timestep] = np.mean(self.current_human_aoi_list) self.robot_energy_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_energy_timelist[self.current_timestep, :] = self.max_uav_energy @@ -142,7 +216,9 @@ def reset(self, phase='test', test_case=None): self.robot_x_timelist[self.current_timestep, :] = self.nlon / 2 self.robot_y_timelist = np.zeros([self.config.num_timestep + 1, self.robot_num]) self.robot_y_timelist[self.current_timestep, :] = self.nlat / 2 - self.update_human_timelist = np.zeros([self.config.num_timestep, ]) + self.update_human_timelist = np.zeros([ + self.config.num_timestep, + ]) self.data_transmission = 0 # for visualization @@ -150,17 +226,35 @@ def reset(self, phase='test', test_case=None): self.robot_actions = [] self.rewards = [] self.action_values = [] - self.plot_states.append([[robot.get_obs() for robot in self.robots], - [human.get_obs() for human in self.humans]]) - + self.plot_states.append( + [[robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]] + ) + state = JointState([robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]) return state def step(self, action): + """ + Overview: + Perform a step in the environment using the provided action, and return the next state of the environment. \ + The next state is encapsulated in a BaseEnvTimestep object, which includes the new observation, reward, done flag, \ + and info dictionary. The cumulative reward (`_eval_episode_return`) is updated with the reward obtained in this step. \ + If the episode ends (done is True), the total reward for the episode is stored in the info dictionary. + Argments: + - action (:obj:`Union[int, np.ndarray]`): The action to be performed in the environment. If the action is a 1-dimensional \ + numpy array, it is squeezed to a 0-dimension array. + Returns: + - next_state (:obj:`JointState`): The next state of the environment. + - reward (:obj:`float`): The reward obtained in this step. + - done (:obj:`bool`): A flag indicating whether the episode has ended. + - info (:obj:`dict`): A dictionary containing additional information about the environment. 
+ """ new_robot_position = np.zeros([self.robot_num, 2]) - current_enenrgy_consume = np.zeros([self.robot_num, ]) + current_enenrgy_consume = np.zeros([ + self.robot_num, + ]) - num_updated_human = 0 # number of humans whose AoI is updated + num_updated_human = 0 # number of humans whose AoI is updated for robot_id, robot in enumerate(self.robots): new_robot_px = robot.px + action[robot_id][0] @@ -191,11 +285,14 @@ def step(self, action): self.robot_y_timelist[self.current_timestep + 1][robot_id] = new_robot_py robot.set(new_robot_px, new_robot_py, robot_theta, energy=new_energy) - selected_data, selected_next_data = get_human_position_list(self.current_timestep + 1, self.human_df, self.config) - human_transmit_data_list = np.zeros_like(self.cur_data_amount_timelist) # 0 means no update + selected_data, selected_next_data = get_human_position_list( + self.current_timestep + 1, self.human_df, self.config + ) + human_transmit_data_list = np.zeros_like(self.cur_data_amount_timelist) # 0 means no update for human_id, human in enumerate(self.humans): - next_px, next_py, next_theta = get_human_position_from_list(self.current_timestep + 1, human_id, - selected_data, selected_next_data, self.config) + next_px, next_py, next_theta = get_human_position_from_list( + self.current_timestep + 1, human_id, selected_data, selected_next_data, self.config + ) should_reset = judge_aoi_update([next_px, next_py], new_robot_position, self.config) if self.env_mode == 'easy': if should_reset: @@ -212,7 +309,7 @@ def step(self, action): human_transmit_data_list[human_id] = 0 new_aoi = human.aoi + 1 human.set(next_px, next_py, next_theta, aoi=new_aoi, data_amount=human.aoi) - + elif self.env_mode == 'hard': if should_reset: # if the human is in the range of the robot, then part of human's data will be transmitted @@ -238,7 +335,7 @@ def step(self, action): self.data_transmission += (delta_sum_transmit_data * 0.3) # Mb, 0.02M/s per person if self.env_mode == 'easy': # in easy mode, the data amount generated per step is equal to the number of humans - self.total_generated_data_amount = self.num_timestep*self.human_num + self.total_generated_data_amount = self.num_timestep * self.human_num elif self.env_mode == 'hard': # in hard mode, the data amount generated per step is equal to the sum of the data amount of all humans self.total_generated_data_amount += self.generate_data_amount_per_step @@ -251,11 +348,11 @@ def step(self, action): # self.action_values.append(self.agent.policy.action_values) self.robot_actions.append(action) self.rewards.append(reward) - self.plot_states.append([[robot.get_obs() for robot in self.robots], - [human.get_obs() for human in self.humans]]) + self.plot_states.append( + [[robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]] + ) - next_state = JointState([robot.get_obs() for robot in self.robots], - [human.get_obs() for human in self.humans]) + next_state = JointState([robot.get_obs() for robot in self.robots], [human.get_obs() for human in self.humans]) self.current_timestep += 1 # print('This game is on',self.current_timestep,' step\n') @@ -265,18 +362,26 @@ def step(self, action): done = False info = { "performance_info": { - "mean_aoi": self.mean_aoi_timelist[self.current_timestep], - "mean_transmit_data": delta_sum_transmit_data / self.human_num, - "mean_energy_consumption": 1.0 - ( - np.mean(self.robot_energy_timelist[self.current_timestep]) / self.max_uav_energy), - "transmitted_data_ratio": 
self.data_transmission/(self.total_generated_data_amount*0.3), - "human_coverage": np.mean(self.update_human_timelist) / self.human_num + "mean_aoi": self.mean_aoi_timelist[self.current_timestep], + "mean_transmit_data": delta_sum_transmit_data / self.human_num, + "mean_energy_consumption": 1.0 - + (np.mean(self.robot_energy_timelist[self.current_timestep]) / self.max_uav_energy), + "transmitted_data_ratio": self.data_transmission / (self.total_generated_data_amount * 0.3), + "human_coverage": np.mean(self.update_human_timelist) / self.human_num }, } return next_state, reward, done, info def render(self): + """ + Overview: + Render the environment to an image. The image is generated using the matplotlib library, and it includes the \ + historical trajectories of the robots, the current positions of the robots, the sensing range of the robots, the \ + positions of the humans, and their AoI values. The image is returned as a numpy array. + Returns: + - image (:obj:`np.ndarray`): The rendered image of the environment. + """ import matplotlib.pyplot as plt import matplotlib.patches as patches import io @@ -284,44 +389,49 @@ def render(self): map_max_x = self.config.nlon map_max_y = self.config.nlat - # 创建一个新的图形 + # Create a new figure fig, ax = plt.subplots(figsize=(8, 6)) - # 绘制机器人的历史轨迹 + # Plot the historical trajectories of the robots for timestep in range(len(self.robot_x_timelist)): for robot_id in range(len(self.robot_x_timelist[timestep])): - ax.plot(self.robot_x_timelist[timestep][robot_id], self.robot_y_timelist[timestep][robot_id], color='gray', alpha=0.5) - - # 绘制机器人的位置 + ax.plot( + self.robot_x_timelist[timestep][robot_id], + self.robot_y_timelist[timestep][robot_id], + color='gray', + alpha=0.5 + ) + + # Plot the current positions of the robots for robot in self.robots: ax.plot(robot.px, robot.py, marker='o', markersize=5, color='blue') - - # 绘制机器人的感知范围 + + # Plot the sensing range of the robots for robot in self.robots: robot_x, robot_y = robot.px, robot.py circle = patches.Circle((robot_x, robot_y), self.config.sensing_range, edgecolor='blue', facecolor='none') ax.add_patch(circle) - - # 绘制人类的位置和AOI变化 + + # Plot the positions of the humans and their AOI values for human in self.humans: human_x, human_y, aoi = human.px, human.py, human.aoi ax.plot(human_x, human_y, marker='x', markersize=5, color='red') ax.text(human_x, human_y, str(aoi), fontsize=8, color='black') - # 设置图形标题和坐标轴标签 + # Set the title and axis labels # ax.set_xlim(0, map_max_x) # ax.set_ylim(0, map_max_y) ax.set_xlabel('X') ax.set_ylabel('Y') - # 在地图之外留出一些空白区域 + # Leave some margin around the map ax.margins(x=0.1, y=0.1) ax.set_title('Crowd Simulation Visualization') - # 显示图形 + # Render the figure to an image fig.canvas.draw() image = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) - image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + image = image.reshape(fig.canvas.get_width_height()[::-1] + (3, )) plt.close() - return image \ No newline at end of file + return image diff --git a/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim_base_config.py b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim_base_config.py new file mode 100644 index 000000000..008fd7e16 --- /dev/null +++ b/zoo/CrowdSim/envs/Crowdsim/env/crowd_sim_base_config.py @@ -0,0 +1,156 @@ +from easydict import EasyDict + +# define base config +base_config = EasyDict( + { + "num_timestep": 120, # 120x15=1800s=30min + "step_time": 15, # seconds per step + "max_uav_energy": 359640, # 359640 J <-- 359.64 kJ (4500mAh, 22.2V) DJI Matrice + 
"rotation_limit": 360, + "diameter_of_human_blockers": 0.5, # meters + "h_rx": 1.3, # meters, height of RX + "h_b": 1.7, # meters, height of a human blocker + "velocity": 18, + "frequence_band": 28, # GHz + "h_d": 120, # meters, height of drone-BS + "alpha_nlos": 113.63, + "beta_nlos": 1.16, + "zeta_nlos": 2.58, # Frequency 28GHz, sub-urban. channel model + "alpha_los": 84.64, + "beta_los": 1.55, + "zeta_los": 0.12, + "g_tx": 0, # dB + "g_rx": 5, # dB + "tallest_locs": None, # obstacle + "no_fly_zone": None, # obstacle + "start_timestamp": 1519894800, + "end_timestamp": 1519896600, + "energy_factor": 3, # TODO: energy factor in reward function + "robot_num": 2, + "rollout_num": 1, # 1 2 6 12 15, calculated based on robot_num + } +) + +# define all dataset configs +dataset_configs = { + 'purdue': EasyDict( + { + "lower_left": [-86.93, 40.4203], # longitude and latitude + "upper_right": [-86.9103, 40.4313], + "nlon": 200, + "nlat": 120, + "human_num": 59, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/purdue/59 users.csv', + "sensing_range": 23.2, # unit 23.2 + "one_uav_action_space": [ + [0, 0], [30, 0], [-30, 0], [0, 30], [0, -30], [21, 21], [21, -21], [-21, 21], [-21, -21] + ], + "max_x_distance": 1667, # meters + "max_y_distance": 1222, # meters + "density_of_human_blockers": 30000 / 1667 / 1222, # blockers/m2 + } + ), + 'ncsu': EasyDict( + { + "lower_left": [-78.6988, 35.7651], # longitude and latitude + "upper_right": [-78.6628, 35.7896], + "nlon": 3600, + "nlat": 2450, + "human_num": 33, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/NCSU/33 users.csv', + "sensing_range": 220, # unit 220 + "one_uav_action_space": [ + [0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210] + ], + "max_x_distance": 3255.4913305859623, # meters + "max_y_distance": 2718.3945272795013, # meters + "density_of_human_blockers": 30000 / 3255.4913305859623 / 2718.3945272795013, # blockers/m2 + } + ), + 'kaist': EasyDict( + { + "lower_left": [127.3475, 36.3597], # longitude and latitude + "upper_right": [127.3709, 36.3793], + "nlon": 2340, + "nlat": 1960, + "human_num": 92, + "dataset_dir": '/home/nighoodRen/CrowdSim/CrowdSim/envs/crowd_sim/dataset/KAIST/92 users.csv', + "sensing_range": 220, # unit 220 + "one_uav_action_space": [ + [0, 0], [300, 0], [-300, 0], [0, 300], [0, -300], [210, 210], [210, -210], [-210, 210], [-210, -210] + ], + "max_x_distance": 2100.207579392558, # meters + "max_y_distance": 2174.930950809533, # meters + "density_of_human_blockers": 30000 / 2100.207579392558 / 2174.930950809533, # blockers/m2 + } + ), + # ... 
could add more dataset configs here +} + + +# get config according to data set name +def get_selected_config(data_set_name): + if data_set_name in dataset_configs: + dataset_config = dataset_configs[data_set_name] + return EasyDict({**base_config, **dataset_config}) + else: + raise ValueError(f"Data set '{data_set_name}' not found.") + + +# r:meters, 2D distance +# threshold: dB +def try_sensing_range(r, data_set_name): + import math + config = get_selected_config(data_set_name) + p_los = math.exp( + -config.density_of_human_blockers * config.diameter_of_human_blockers * r * (config.h_b - config.h_rx) / + (config.h_d - config.h_rx) + ) + p_nlos = 1 - p_los + PL_los = config.alpha_los + config.beta_los * 10 * math.log10( + math.sqrt(r * r + config.h_d * config.h_d) + ) + config.zeta_los + PL_nlos = config.alpha_nlos + config.beta_nlos * 10 * math.log10( + math.sqrt(r * r + config.h_d * config.h_d) + ) + config.zeta_nlos + PL = p_los * PL_los + p_nlos * PL_nlos + CL = PL - config.g_tx - config.g_rx + print(p_los, p_nlos) + print(CL) + + +# Maximum Coupling Loss (110dB is recommended) +# purdue: + +# 123dB -> 560m -> 60.5 range +# 121dB -> 420m -> 45.4 range +# 119dB -> 300m -> 32.4 range +# 117dB -> 215m -> 23.2 range √ +# 115dB -> 140m -> 15 range + +# ncsu: +# 123dB -> 600m -> 600 range +# 121dB -> 435m -> 435 range +# 119dB -> 315m -> 315 range +# 117dB -> 220m -> 220 range √ +# 115dB -> 145m -> 145 range + +# kaist: +# 123dB -> 600m -> 600 range +# 121dB -> 435m -> 435 range +# 119dB -> 315m -> 315 range +# 117dB -> 220m -> 220 range √ +# 115dB -> 145m -> 145 range + +# san: +# 123dB -> 600m -> 600 range +# 121dB -> 450m -> 450 range +# 119dB -> 330m -> 330 range +# 117dB -> 240m -> 240 range √ +# 115dB -> 165m -> 165 range + +if __name__ == '__main__': + # example usage + data_set_name = 'purdue' + selected_config = get_selected_config(data_set_name) + print(selected_config) diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py index db6c47b31..77e378636 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/agent.py @@ -5,6 +5,7 @@ class Agent(): + def __init__(self): """ Base class for robot and human. Have the physical attributes of an agent. @@ -26,6 +27,15 @@ def act(self, state, current_timestep): class Human(): + """ + Overview: + Human class. Have the physical attributes of a human agent. The human agent has a data queue to store the \ + information blocks. The data queue is updated when the human agent moves and transmits data to the robot. \ + The age of information (aoi) is calculated based on the data queue. + Interface: + `__init__`, `set`, `update`, `get_obs`. + """ + # collect_v_prob = {1: 1, 2: 0} def __init__(self, id, config): self.id = id @@ -40,6 +50,16 @@ def __init__(self, id, config): self.collect_v = random.choices(list(self.collect_v_prob.keys()), list(self.collect_v_prob.values()))[0] def set(self, px, py, theta, aoi, data_amount): + """ + Overview: + Set the physical attributes of the human agent. + Arguments: + - px (:obj:`float`): The x-coordinate of the human agent. + - py (:obj:`float`): The y-coordinate of the human agent. + - theta (:obj:`float`): The orientation of the human agent. + - aoi (:obj:`float`): The age of information (aoi) of the human agent. + - data_amount (:obj:`int`): The amount of data blocks in the data queue of the human agent. 
+ """ self.px = px self.py = py self.theta = theta @@ -47,6 +67,16 @@ def set(self, px, py, theta, aoi, data_amount): self.data_amount = data_amount def update(self, px, py, theta, transmitted_data): + """ + Overview: + Update the physical attributes of the human agent and the data queue. The age of information (aoi) is \ + calculated based on the data queue. + Arguments: + - px (:obj:`float`): The x-coordinate of the human agent. + - py (:obj:`float`): The y-coordinate of the human agent. + - theta (:obj:`float`): The orientation of the human agent. + - transmitted_data (:obj:`int`): The number of data blocks transmitted to the robot. + """ self.px = px # position self.py = py self.theta = theta @@ -54,16 +84,29 @@ def update(self, px, py, theta, transmitted_data): self.aoi = self.data_queue.total_aoi() self.data_amount = self.data_queue.total_blocks() - # TODO: change state,可能需要归一化 def get_obs(self): + """ + Overview: + Get the observation of the human agent. The observation includes the position, age of information (aoi), \ + and the amount of data blocks in the data queue. + Returns: + - obs (:obj:`HumanState`): The observation of the human agent. + """ # obs: (px, py, remaining_data, aoi) - return HumanState(self.px / self.config.nlon, - self.py / self.config.nlat, - self.data_amount / self.config.num_timestep, - self.aoi / self.config.num_timestep) + return HumanState( + self.px / self.config.nlon, self.py / self.config.nlat, self.data_amount / self.config.num_timestep, + self.aoi / self.config.num_timestep + ) class Robot(): + """ + Overview: + Robot class. Have the physical attributes of a robot agent. + Interface: + `__init__`, `set`, `get_obs`. + """ + def __init__(self, id, config): self.id = id self.config = config @@ -73,42 +116,75 @@ def __init__(self, id, config): self.energy = None def set(self, px, py, theta, energy): + """ + Overview: + Set the physical attributes of the robot agent. + Arguments: + - px (:obj:`float`): The x-coordinate of the robot agent. + - py (:obj:`float`): The y-coordinate of the robot agent. + - theta (:obj:`float`): The orientation of the robot agent. + - energy (:obj:`float`): The remaining energy of the robot agent. + """ self.px = px # position self.py = py self.theta = theta self.energy = energy - # TODO: change state,可能需要归一化 def get_obs(self): - return RobotState(self.px / self.config.nlon, - self.py / self.config.nlat, - self.theta / self.config.rotation_limit, - self.energy / self.config.max_uav_energy) + """ + Overview: + Get the observation of the robot agent. The observation includes the position, orientation, and the remaining \ + energy of the robot agent. + Returns: + - obs (:obj:`RobotState`): The observation of the robot agent. + """ + return RobotState( + self.px / self.config.nlon, self.py / self.config.nlat, self.theta / self.config.rotation_limit, + self.energy / self.config.max_uav_energy + ) class InformationQueue: + """ + Overview: + Information queue class. The data queue is updated when the human agent moves and transmits data to the robot. \ + The age of information (aoi) is calculated based on the data queue. + + Interface: + `__init__`, `update`, `total_aoi`, `total_blocks`. + """ + def __init__(self): # Initialize the queue to hold the age of each information block self.queue = [] def update(self, arrivals, departures): + """ + Overview: + Update the data queue. Increase the age of information (aoi) for each block in the queue. Add new information \ + blocks with aoi of 0. 
Remove the specified number of oldest information blocks. + Arguments: + - arrivals (:obj:`int`): The number of new information blocks entering the queue. + - departures (:obj:`int`): The number of oldest information blocks leaving the queue. + """ # Increase the age of information (aoi) for each block in the queue self.queue = [age + 1 for age in self.queue] - + # Add new information blocks with aoi of 0 self.queue.extend([0] * arrivals) - + # Remove the specified number of oldest information blocks self.queue = self.queue[departures:] if departures <= len(self.queue) else [] def total_aoi(self): # Return the total age of information in the queue return sum(self.queue) - + def total_blocks(self): # Return the total number of information blocks in the queue return len(self.queue) + # # Example of using the InformationQueue class # info_queue = InformationQueue() # info_queue.update(arrivals=5, departures=0) # 5 blocks enter the queue, all with aoi of 0 diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py b/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py index 72f1b9c4a..9d4d2b770 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/mdp.py @@ -6,6 +6,7 @@ # State class HumanState(object): + def __init__(self, px, py, theta, aoi): self.px = px self.py = py @@ -24,6 +25,7 @@ def to_tuple(self): class RobotState(object): + def __init__(self, px, py, theta, energy): self.px = px self.py = py @@ -43,6 +45,7 @@ def to_tuple(self): class JointState(object): + def __init__(self, robot_states, human_states): for robot_state in robot_states: assert isinstance(robot_state, RobotState) @@ -53,10 +56,12 @@ def __init__(self, robot_states, human_states): self.human_states = human_states def to_tensor(self, add_batch_size=False, device=None): - robot_states_tensor = torch.tensor([robot_state.to_tuple() for robot_state in self.robot_states], - dtype=torch.float32) - human_states_tensor = torch.tensor([human_state.to_tuple() for human_state in self.human_states], - dtype=torch.float32) + robot_states_tensor = torch.tensor( + [robot_state.to_tuple() for robot_state in self.robot_states], dtype=torch.float32 + ) + human_states_tensor = torch.tensor( + [human_state.to_tuple() for human_state in self.human_states], dtype=torch.float32 + ) if add_batch_size: # True robot_states_tensor = robot_states_tensor.unsqueeze(0) @@ -67,7 +72,7 @@ def to_tensor(self, add_batch_size=False, device=None): human_states_tensor = human_states_tensor.to(device) return robot_states_tensor, human_states_tensor - + def to_array(self): robot_states_array = np.array([robot_state.to_tuple() for robot_state in self.robot_states]) human_states_array = np.array([human_state.to_tuple() for human_state in self.human_states]) diff --git a/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py index e81ec842f..421b9508a 100644 --- a/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py +++ b/zoo/CrowdSim/envs/Crowdsim/env/model/utils.py @@ -7,34 +7,72 @@ from shapely.geometry import * -def tensor_to_joint_state(state, config): # 恢复原先尺度 +def tensor_to_joint_state(state, config): + """ + Overview: + Convert the state tensor to the JointState object. The state tensor is a tuple of two tensors, the first one \ + is the robot state tensor, and the second one is the human state tensor. The robot state tensor is a tensor of \ + shape (1, robot_num, 4), and the human state tensor is a tensor of shape (1, human_num, 4). 
+ Arguments: + - state (:obj:`tuple`): The state tensor. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - joint_state (:obj:`JointState`): The JointState object. + """ robot_states, human_states = state robot_states = robot_states.cpu().squeeze(0).data.numpy() - robot_states = [RobotState(robot_state[0] * config.nlon, - robot_state[1] * config.nlat, - robot_state[2] * config.rotation_limit, - robot_state[3] * config.max_uav_energy) for robot_state in robot_states] + robot_states = [ + RobotState( + robot_state[0] * config.nlon, robot_state[1] * config.nlat, robot_state[2] * config.rotation_limit, + robot_state[3] * config.max_uav_energy + ) for robot_state in robot_states + ] human_states = human_states.cpu().squeeze(0).data.numpy() - human_states = [HumanState(human_state[0] * config.nlon, - human_state[1] * config.nlat, - human_state[2] * config.rotation_limit, - human_state[3] * config.num_timestep) for human_state in human_states] + human_states = [ + HumanState( + human_state[0] * config.nlon, human_state[1] * config.nlat, human_state[2] * config.rotation_limit, + human_state[3] * config.num_timestep + ) for human_state in human_states + ] return JointState(robot_states, human_states) def tensor_to_robot_states(robot_state_tensor, config): + """ + Overview: + Convert the robot state tensor to a list of RobotState objects. The robot state tensor is a tensor of shape \ + (1, robot_num, 4). + Arguments: + - robot_state_tensor (:obj:`torch.Tensor`): The robot state tensor. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - robot_states (:obj:`list`): The list of RobotState objects. + """ robot_states = robot_state_tensor.cpu().squeeze(0).data.numpy() - robot_states = [RobotState(robot_state[0] * config.nlon, - robot_state[1] * config.nlat, - robot_state[2] * config.rotation_limit, - robot_state[3] * config.max_uav_energy) for robot_state in robot_states] + robot_states = [ + RobotState( + robot_state[0] * config.nlon, robot_state[1] * config.nlat, robot_state[2] * config.rotation_limit, + robot_state[3] * config.max_uav_energy + ) for robot_state in robot_states + ] return robot_states def get_human_position_list(selected_timestep, human_df, config): + """ + Overview: + Get the human position list at the selected timestep. The human position list is a list of tuples, each tuple \ + contains the x, y, and theta of a human. + Arguments: + - selected_timestep (:obj:`int`): The selected timestep. + - human_df (:obj:`pandas.DataFrame`): The human dataframe. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - human_position_list (:obj:`list`): The human position list. + """ # config.step_time means the time interval between two timesteps selected_timestamp = config.start_timestamp + selected_timestep * config.step_time selected_data = human_df[human_df.timestamp == selected_timestamp] @@ -50,6 +88,21 @@ def get_human_position_list(selected_timestep, human_df, config): def get_human_position_from_list(selected_timestep, human_id, selected_data, selected_next_data, config): + """ + Overview: + Get the human position from the human position list at the selected timestep. The human position is a tuple \ + containing the x, y, and theta of the human. + Arguments: + - selected_timestep (:obj:`int`): The selected timestep. + - human_id (:obj:`int`): The human id. + - selected_data (:obj:`pandas.DataFrame`): The human position list at the selected timestep. 
+ - selected_next_data (:obj:`pandas.DataFrame`): The human position list at the next timestep. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - px (:obj:`float`): The x coordinate of the human. + - py (:obj:`float`): The y coordinate of the human. + - theta (:obj:`float`): The orientation of the human. + """ px, py = selected_data.loc[human_id, ["x", "y"]] if selected_timestep < config.num_timestep: @@ -70,11 +123,15 @@ def judge_aoi_update(human_position, robot_position, config): - human_position (:obj:`list`): The position of the human. - robot_position (:obj:`list`): The position of the robot. - config (:obj:`dict`): The configuration of the environment. + Returns: + - should_update (:obj:`bool`): Whether the AoI should be updated. """ should_reset = False for robot_id in range(config.robot_num): - unit_distance = np.sqrt(np.power(robot_position[robot_id][0] - human_position[0], 2) - + np.power(robot_position[robot_id][1] - human_position[1], 2)) + unit_distance = np.sqrt( + np.power(robot_position[robot_id][0] - human_position[0], 2) + + np.power(robot_position[robot_id][1] - human_position[1], 2) + ) if unit_distance <= config.sensing_range: should_reset = True break @@ -83,6 +140,16 @@ def judge_aoi_update(human_position, robot_position, config): def inPoly(polygon, x, y): + """ + Overview: + Judge whether a point is in a polygon. + Arguments: + - polygon (:obj:`list`): The polygon. + - x (:obj:`float`): The x coordinate of the point. + - y (:obj:`float`): The y coordinate of the point. + Returns: + - in_poly (:obj:`bool`): Whether the point is in the polygon. + """ pt = (x, y) line = LineString(polygon) point = Point(pt) @@ -91,12 +158,33 @@ def inPoly(polygon, x, y): def iscrosses(line1, line2): + """ + Overview: + Judge whether two lines cross each other. + Arguments: + - line1 (:obj:`list`): The first line. + - line2 (:obj:`list`): The second line. + Returns: + - crosses (:obj:`bool`): Whether the two lines cross each other. + """ if LineString(line1).crosses(LineString(line2)): return True return False def crossPoly(square, x1, y1, x2, y2): + """ + Overview: + Judge whether a line crosses a polygon. + Arguments: + - square (:obj:`list`): The polygon. + - x1 (:obj:`float`): The x coordinate of the start point of the line. + - y1 (:obj:`float`): The y coordinate of the start point of the line. + - x2 (:obj:`float`): The x coordinate of the end point of the line. + - y2 (:obj:`float`): The y coordinate of the end point of the line. + Returns: + - crosses (:obj:`bool`): Whether the line crosses the polygon. + """ our_line = LineString([[x1, y1], [x2, y2]]) line1 = LineString([square[0], square[2]]) line2 = LineString([square[1], square[3]]) @@ -107,6 +195,18 @@ def crossPoly(square, x1, y1, x2, y2): def judge_collision(new_robot_px, new_robot_py, old_robot_px, old_robot_py, config): + """ + Overview: + Judge whether a collision happens. A collision happens when the new position of the robot is in the no-fly zone. + Arguments: + - new_robot_px (:obj:`float`): The x coordinate of the new position of the robot. + - new_robot_py (:obj:`float`): The y coordinate of the new position of the robot. + - old_robot_px (:obj:`float`): The x coordinate of the old position of the robot. + - old_robot_py (:obj:`float`): The y coordinate of the old position of the robot. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - collision (:obj:`bool`): Whether a collision happens. 
+ """ if config.no_fly_zone is None: return False @@ -126,6 +226,17 @@ def get_theta(x1, y1, x2, y2): def consume_uav_energy(fly_time, hover_time, config): + """ + Overview: + Calculate the energy consumption of the UAV. The energy consumption is calculated based on the power consumption \ + of the UAV in the flying state and the hovering state. + Arguments: + - fly_time (:obj:`float`): The flying time. + - hover_time (:obj:`float`): The hovering time. + - config (:obj:`dict`): The configuration of the environment. + Returns: + - energy (:obj:`float`): The energy consumption of the UAV. + """ # configs Pu = 0.5 # the average transmitted power of each user, W, e.g. mobile phone P0 = 79.8563 # blade profile power, W @@ -153,20 +264,16 @@ def get_border(ur, lf): lower_right = [ur[0], lf[1]] lower_left = [lf[0], lf[1]] - coordinates = [ - upper_left, - upper_right, - lower_right, - lower_left, - upper_left - ] + coordinates = [upper_left, upper_right, lower_right, lower_left, upper_left] - geo_json = {"type": "FeatureCollection", - "properties": { - "lower_left": lower_left, - "upper_right": upper_right - }, - "features": []} + geo_json = { + "type": "FeatureCollection", + "properties": { + "lower_left": lower_left, + "upper_right": upper_right + }, + "features": [] + } grid_feature = { "type": "Feature", @@ -191,10 +298,7 @@ def traj_to_timestamped_geojson(index, trajectory, robot_num, color): # for Point in GeoJSON type for _, row in point_gdf.iterrows(): - corrent_point_coordinates = [ - row["geometry"].xy[0][0], - row["geometry"].xy[1][0] - ] + corrent_point_coordinates = [row["geometry"].xy[0][0], row["geometry"].xy[1][0]] current_time = [row["time"].isoformat()] if index < robot_num: @@ -231,12 +335,10 @@ def traj_to_timestamped_geojson(index, trajectory, robot_num, color): 'radius': radius, 'weight': 1, }, - "style": { # line "color": color, }, "code": 11, - }, } ) @@ -244,5 +346,4 @@ def traj_to_timestamped_geojson(index, trajectory, robot_num, color): if __name__ == "__main__": - print(judge_collision(new_robot_px=6505, new_robot_py=5130, - old_robot_px=6925, old_robot_py=5130)) + print(judge_collision(new_robot_px=6505, new_robot_py=5130, old_robot_px=6925, old_robot_py=5130)) diff --git a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py index bcecc89bc..e4d18047d 100644 --- a/zoo/CrowdSim/envs/crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/crowdsim_lightzero_env.py @@ -22,25 +22,21 @@ def __init__(self, cfg: dict = {}) -> None: self._replay_path = cfg.get('replay_path', None) self._robot_num = self._cfg.robot_num self._human_num = self._cfg.human_num - self._observation_space = gym.spaces.Dict({ - 'robot_state': gym.spaces.Box( - low=float("-inf"), - high=float("inf"), - shape=(self._robot_num, 4), - dtype=np.float32 - ), - 'human_state': gym.spaces.Box( - low=float("-inf"), - high=float("inf"), - shape=(self._human_num, 4), - dtype=np.float32 - ) - }) + self._observation_space = gym.spaces.Dict( + { + 'robot_state': gym.spaces.Box( + low=float("-inf"), high=float("inf"), shape=(self._robot_num, 4), dtype=np.float32 + ), + 'human_state': gym.spaces.Box( + low=float("-inf"), high=float("inf"), shape=(self._human_num, 4), dtype=np.float32 + ) + } + ) # action space # one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] self.real_action_space = list(product(self._cfg.one_uav_action_space, repeat=self._robot_num)) one_uav_action_n = len(self._cfg.one_uav_action_space) - self._action_space = 
gym.spaces.Discrete(one_uav_action_n**self._robot_num) + self._action_space = gym.spaces.Discrete(one_uav_action_n ** self._robot_num) self._action_space.seed(0) # default seed self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, ), dtype=np.float32) self._continuous = False @@ -48,16 +44,17 @@ def __init__(self, cfg: dict = {}) -> None: # obs_mode '2-dim-array': np.concatenate((robot_state, human_state), axis=0) # obs_mode '1-dim-array': np.concatenate((robot_state, human_state), axis=0).flatten() self.obs_mode = self._cfg.get('obs_mode', '2-dim-array') - assert self.obs_mode in ['dict', '2-dim-array', '1-dim-array'], "obs_mode should be 'dict' or '2-dim-array' or '1-dim-array'!" + assert self.obs_mode in [ + 'dict', '2-dim-array', '1-dim-array' + ], "obs_mode should be 'dict' or '2-dim-array' or '1-dim-array'!" # action_mode 'combine': combine all robot actions into one action, action space size = one_uav_action_n**robot_num # action_mode 'separate': separate robot actions, shape (robot_num,), for each robot action space size = one_uav_action_n self.action_mode = self._cfg.get('action_mode', 'combine') assert self.action_mode in ['combine', 'separate'], "action_mode should be 'combine' or 'separate'!" - def reset(self) -> np.ndarray: if not self._init_flag: - self._env = gym.make('CrowdSim-v0', dataset = self._cfg.dataset, custom_config = self._cfg) + self._env = gym.make('CrowdSim-v0', dataset=self._cfg.dataset, custom_config=self._cfg) self._init_flag = True if hasattr(self, '_seed') and hasattr(self, '_dynamic_seed') and self._dynamic_seed: np_seed = 100 * np.random.randint(1, 1000) diff --git a/zoo/CrowdSim/envs/test_CrowdSim_env.py b/zoo/CrowdSim/envs/test_CrowdSim_env.py index 25a8bc411..81ac5f7ab 100644 --- a/zoo/CrowdSim/envs/test_CrowdSim_env.py +++ b/zoo/CrowdSim/envs/test_CrowdSim_env.py @@ -2,28 +2,31 @@ from easydict import EasyDict from zoo.CrowdSim.envs.CrowdSim_env import CrowdSimEnv -mcfg=EasyDict( - env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = 2, - human_num = 59, # purdue - one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]]) +mcfg = EasyDict( + env_name='CrowdSim-v0', + dataset='purdue', + robot_num=2, + human_num=59, # purdue + one_uav_action_space=[[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]] +) + def test_naive(cfg): env = CrowdSimEnv(cfg) env.seed(314) assert env._seed == 314 obs = env.reset() - assert obs['observation'].shape == (244,) + assert obs['observation'].shape == (244, ) for i in range(10): random_action = env.random_action() timestep = env.step(random_action) print(timestep) assert isinstance(timestep.obs['observation'], np.ndarray) assert isinstance(timestep.done, bool) - assert timestep.obs['observation'].shape == (244,) + assert timestep.obs['observation'].shape == (244, ) assert timestep.reward.shape == (1, ) print(env.observation_space, env.action_space, env.reward_space) env.close() -test_naive(mcfg) \ No newline at end of file + +test_naive(mcfg) diff --git a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py index 09f36d6b7..ee369d2cd 100644 --- a/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py +++ b/zoo/CrowdSim/envs/test_crowdsim_lightzero_env.py @@ -3,19 +3,20 @@ from easydict import EasyDict from zoo.CrowdSim.envs.crowdsim_lightzero_env import CrowdSimEnv -mcfg=EasyDict( - env_name='CrowdSim-v0', - dataset = 'purdue', - robot_num = 2, - human_num = 59, # purdue - one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]], - obs_mode = 
'2-dim-array', - env_mode = 'easy', - ) +mcfg = EasyDict( + env_name='CrowdSim-v0', + dataset='purdue', + robot_num=2, + human_num=59, # purdue + one_uav_action_space=[[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]], + obs_mode='2-dim-array', + env_mode='easy', +) -@ pytest.mark.envtest +@pytest.mark.envtest class TestCrowdSimEnv: + def test_obs_dict(self): mcfg['obs_mode'] = 'dict' env = CrowdSimEnv(mcfg) @@ -79,4 +80,4 @@ def test_obs_1_dim_array(self): test = TestCrowdSimEnv() # test.test_obs_dict() # test.test_obs_2_dim_array() - test.test_obs_1_dim_array() \ No newline at end of file + test.test_obs_1_dim_array()
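
Note on the sensing-range calibration: the new config module picks each dataset's `sensing_range` from a Maximum Coupling Loss budget via `try_sensing_range`, mixing LOS and NLOS path loss weighted by the human-blocker LOS probability. The sketch below is a standalone re-derivation of the purdue figure (roughly 117 dB at 215 m, hence `sensing_range = 23.2`); the constants are copied from the purdue entry in the hunk above, and the formula is inlined only because the config module's import path is not part of this patch.

# Standalone sketch of the LOS/NLOS coupling-loss calculation behind `sensing_range`.
# Constants are copied from the purdue entry above; the formula mirrors try_sensing_range.
import math

density_of_human_blockers = 30000 / 1667 / 1222   # blockers / m^2 (purdue)
diameter_of_human_blockers = 0.5                  # m
h_rx, h_b, h_d = 1.3, 1.7, 120                    # RX, blocker, drone-BS heights (m)
alpha_los, beta_los, zeta_los = 84.64, 1.55, 0.12
alpha_nlos, beta_nlos, zeta_nlos = 113.63, 1.16, 2.58
g_tx, g_rx = 0, 5                                 # antenna gains (dB)


def coupling_loss(r: float) -> float:
    """Coupling loss (dB) at 2D ground distance r (m), mixing LOS/NLOS path loss."""
    p_los = math.exp(
        -density_of_human_blockers * diameter_of_human_blockers * r * (h_b - h_rx) / (h_d - h_rx)
    )
    d_3d = math.sqrt(r * r + h_d * h_d)           # 3D TX-RX distance
    pl_los = alpha_los + beta_los * 10 * math.log10(d_3d) + zeta_los
    pl_nlos = alpha_nlos + beta_nlos * 10 * math.log10(d_3d) + zeta_nlos
    pl = p_los * pl_los + (1 - p_los) * pl_nlos
    return pl - g_tx - g_rx


if __name__ == "__main__":
    # ~116.9 dB at 215 m, i.e. the "117dB -> 215m -> 23.2 range" row quoted
    # in the calibration comments above.
    print(round(coupling_loss(215), 2))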
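
The `InformationQueue` added to agent.py is what drives the per-human age-of-information bookkeeping: `update` ages every queued block by one step, appends `arrivals` fresh blocks with aoi 0, and drops the `departures` oldest blocks. A minimal usage sketch, assuming the repository root is on PYTHONPATH so the `zoo` package resolves:

# Minimal AoI bookkeeping sketch using the InformationQueue added above.
from zoo.CrowdSim.envs.Crowdsim.env.model.agent import InformationQueue

queue = InformationQueue()
queue.update(arrivals=5, departures=0)   # 5 fresh blocks enter with aoi = 0
queue.update(arrivals=3, departures=2)   # existing blocks age by 1, 3 arrive, 2 oldest leave

# After two steps: 3 surviving old blocks have aoi 1, the 3 new ones have aoi 0.
print(queue.total_blocks())  # 6
print(queue.total_aoi())     # 3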
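
Finally, a note on the spaces exposed by crowdsim_lightzero_env.py: in 'combine' action mode one Discrete index selects a joint move for all UAVs via `itertools.product`, and in '1-dim-array' obs mode the `(robot_num + human_num, 4)` state is flattened, which is where the `(244,)` shape asserted in the tests comes from. The sketch below only reproduces that arithmetic with the purdue test numbers (robot_num=2, human_num=59, five per-UAV actions); it does not instantiate the environment.

# Sketch of the 'combine' action encoding and the flat observation size used in the tests.
from itertools import product

one_uav_action_space = [[0, 0], [30, 0], [-30, 0], [0, 30], [0, -30]]
robot_num, human_num = 2, 59

# 'combine' mode: one Discrete action indexes a joint move for all UAVs.
real_action_space = list(product(one_uav_action_space, repeat=robot_num))
assert len(real_action_space) == len(one_uav_action_space) ** robot_num  # Discrete(25)

action_index = 10
print(real_action_space[action_index])   # per-UAV (dx, dy) pair selected by this index

# '1-dim-array' obs mode: (robot_num + human_num) agents x 4 features, flattened.
print((robot_num + human_num) * 4)       # 244, matching the shape asserted in the tests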