diff --git a/curriculum_generation/sample_evaluation_task.py b/curriculum_generation/sample_evaluation_task.py new file mode 100644 index 0000000..13cb9a5 --- /dev/null +++ b/curriculum_generation/sample_evaluation_task.py @@ -0,0 +1,54 @@ +"""Manually created learning curriculum for evaluation""" +# pylint: disable=invalid-name,redefined-outer-name,bad-builtin +# pylint: disable=wildcard-import,unused-wildcard-import +from typing import List + +from nmmo.task import constraint as c +from nmmo.task.base_predicates import AttainSkill, CountEvent, EarnGold, TickGE +from nmmo.task.task_spec import TaskSpec + +curriculum: List[TaskSpec] = [] + +# Stay alive as long as possible +curriculum.append( + TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024}) +) + +# Perform each of these essential events 10 times +essential_skills = [ + "EAT_FOOD", + "DRINK_WATER", + "SCORE_HIT", + "PLAYER_KILL", + "HARVEST_ITEM", + "EQUIP_ITEM", + "CONSUME_ITEM", + "LEVEL_UP", + "EARN_GOLD", + "LIST_ITEM", + "BUY_ITEM", + "GIVE_ITEM", + "DESTROY_ITEM", + "GIVE_GOLD", +] +for event_code in essential_skills: + curriculum.append( + TaskSpec( + eval_fn=CountEvent, + eval_fn_kwargs={"event": event_code, "N": 10}, + ) + ) + +# Reach skill level 10 +for skill in c.combat_skills + c.harvest_skills: + curriculum.append( + TaskSpec( + eval_fn=AttainSkill, + eval_fn_kwargs={"skill": skill, "level": 10, "num_agent": 1}, + ) + ) + +# Earn 50 gold +curriculum.append( + TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": 50}) +) diff --git a/environment.py b/environment.py index 36452ab..a921a65 100644 --- a/environment.py +++ b/environment.py @@ -33,14 +33,15 @@ def __init__(self, args: Namespace): class Postprocessor(StatPostprocessor): def __init__(self, env, is_multiagent, agent_id, - early_stop_agent_num=0, - sqrt_achievement_rewards=False, - heal_bonus_weight=0, - meander_bonus_weight=0, - explore_bonus_weight=0, - clip_unique_event=3, + eval_mode=False, + early_stop_agent_num=0, + sqrt_achievement_rewards=False, + heal_bonus_weight=0, + meander_bonus_weight=0, + explore_bonus_weight=0, + clip_unique_event=3, ): - super().__init__(env, agent_id) + super().__init__(env, agent_id, eval_mode) self.early_stop_agent_num = early_stop_agent_num self.sqrt_achievement_rewards = sqrt_achievement_rewards self.heal_bonus_weight = heal_bonus_weight @@ -117,6 +118,7 @@ def env_creator(): env = pufferlib.emulation.PettingZooPufferEnv(env, postprocessor_cls=Postprocessor, postprocessor_kwargs={ + 'eval_mode': args.eval_mode, 'early_stop_agent_num': args.early_stop_agent_num, 'sqrt_achievement_rewards': args.sqrt_achievement_rewards, 'heal_bonus_weight': args.heal_bonus_weight, diff --git a/evaluate.py b/evaluate.py index 99e9c9b..c9dc617 100644 --- a/evaluate.py +++ b/evaluate.py @@ -5,7 +5,11 @@ import time from types import SimpleNamespace from pathlib import Path +from collections import defaultdict +from dataclasses import asdict +from itertools import cycle + +import numpy as np import torch import pandas as pd @@ -54,11 +58,11 @@ def save_replays(policy_store_dir, save_dir): args.early_stop_agent_num = 0 # run the full episode args.resilient_population = 0 # no resilient agents - # TODO: custom models will require different policy creation functions + # NOTE: This creates a dummy learner agent. Is it necessary? 
from reinforcement_learning import policy # import your policy def make_policy(envs): learner_policy = policy.Baseline( - envs, + envs._driver_env, input_size=args.input_size, hidden_size=args.hidden_size, task_size=args.task_size @@ -83,8 +87,10 @@ def make_policy(envs): # Load the policies into the policy pool evaluator.policy_pool.update_policies({ - p.name: p.policy(make_policy, evaluator.buffers[0], evaluator.device) - for p in policy_store._all_policies().values() + p.name: p.policy( + policy_args=[evaluator.buffers[0]], + device=evaluator.device + ) for p in list(policy_store._all_policies().values()) }) # Set up the replay helper @@ -121,7 +127,7 @@ def create_policy_ranker(policy_store_dir, ranker_file="openskill.pickle"): file = os.path.join(policy_store_dir, ranker_file) if os.path.exists(file): if os.path.exists(file + ".lock"): - raise ValueError("Policy ranker file is locked.") + raise ValueError("Policy ranker file is locked. Delete the lock file.") logging.info("Using policy ranker from %s", file) policy_ranker = pufferlib.utils.PersistentObject( file, @@ -135,21 +141,34 @@ def create_policy_ranker(policy_store_dir, ranker_file="openskill.pickle"): ) return policy_ranker -def rank_policies(policy_store_dir, device): +class AllPolicySelector(pufferlib.policy_ranker.PolicySelector): + def select_policies(self, policies): + # Return all policy names in alphabetical order + # Loops circularly if more policies are needed than available + loop = cycle([ + policies[name] for name in sorted(policies.keys()) + ]) + return [next(loop) for _ in range(self._num)] + +def rank_policies(policy_store_dir, eval_curriculum_file, device): # CHECK ME: can be custom models with different architectures loaded here? policy_store = setup_policy_store(policy_store_dir) - num_policies = len(policy_store._all_policies()) policy_ranker = create_policy_ranker(policy_store_dir) + num_policies = len(policy_store._all_policies()) + policy_selector = AllPolicySelector(num_policies) - # TODO: task-condition agents when generating replays args = SimpleNamespace(**config.Config.asdict()) args.data_dir = policy_store_dir + args.eval_mode = True + args.num_envs = 5 # sample a bit longer in each env + args.num_buffers = 1 args.learner_weight = 0 # evaluate mode args.selfplay_num_policies = num_policies + 1 args.early_stop_agent_num = 0 # run the full episode args.resilient_population = 0 # no resilient agents + args.tasks_path = eval_curriculum_file # task-conditioning - # TODO: custom models will require different policy creation functions + # NOTE: This creates a dummy learner agent. Is it necessary? 
from reinforcement_learning import policy # import your policy def make_policy(envs): learner_policy = policy.Baseline( @@ -174,17 +193,25 @@ def make_policy(envs): num_buffers=args.num_buffers, selfplay_learner_weight=args.learner_weight, selfplay_num_policies=args.selfplay_num_policies, - batch_size=args.rollout_batch_size, + batch_size=args.eval_batch_size, policy_store=policy_store, policy_ranker=policy_ranker, # so that a new ranker is created + policy_selector=policy_selector, ) rank_file = os.path.join(policy_store_dir, "ranking.txt") with open(rank_file, "w") as f: pass - while evaluator.global_step < args.train_num_steps: - evaluator.evaluate() + results = defaultdict(list) + while evaluator.global_step < args.eval_num_steps: + _, stats, infos = evaluator.evaluate() + + for pol, vals in infos.items(): + results[pol].extend([ + e[1] for e in vals['team_results'] + ]) + ratings = evaluator.policy_ranker.ratings() dataframe = pd.DataFrame( { @@ -202,10 +229,27 @@ def make_policy(envs): + "\n\n" ) + # Reset the envs and start new episodes + # NOTE: The line below will probably end the episode in the middle, + # so we won't be able to sample scores from the successful agents. + # Thus, the scores will be biased towards the agents that die early. + # Still, the numbers we get this way are better than frequently + # updating the scores because the openskill ranking only takes the mean. + #evaluator.buffers[0]._async_reset() + # CHECK ME: delete the policy_ranker lock file Path(evaluator.policy_ranker.lock.lock_file).unlink(missing_ok=True) evaluator.close() + for pol, res in results.items(): + aggregated = {} + keys = asdict(res[0]).keys() + for k in keys: + if k == 'policy_id': + continue + aggregated[k] = np.mean([asdict(e)[k] for e in res]) + results[pol] = aggregated + print('Evaluation complete. 
Average stats:\n', results) if __name__ == "__main__": @@ -213,7 +257,7 @@ def make_policy(envs): -p, --policy-store-dir: Directory to load policy checkpoints from for evaluation/ranking -s, --replay-save-dir: Directory to save replays (Default: replays/) - -e, --eval-mode: Evaluate mode (Default: False) + -r, --replay-mode: Replay save mode (Default: False) -d, --device: Device to use for evaluation/ranking (Default: cuda if available, otherwise cpu) To generate replay from your checkpoints, put them together in policy_store_dir, run the following command, @@ -247,12 +291,11 @@ def make_policy(envs): help="Directory to save replays (Default: replays/)", ) parser.add_argument( - "-e", - "--eval-mode", - dest="eval_mode", - type=bool, - default=False, - help="Evaluate mode (Default: False)", + "-r", + "--replay-mode", + dest="replay_mode", + action="store_true", + help="Replay mode (Default: False)", ) parser.add_argument( "-d", @@ -262,15 +305,23 @@ def make_policy(envs): default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use for evaluation/ranking (Default: cuda if available, otherwise cpu)", ) + parser.add_argument( + "-t", + "--task-file", + dest="task_file", + type=str, + default="reinforcement_learning/eval_task_with_embedding.pkl", + help="Task file to use for evaluation", + ) # Parse and check the arguments eval_args = parser.parse_args() assert eval_args.policy_store_dir is not None, "Policy store directory must be specified" - if eval_args.eval_mode: - logging.info("Ranking checkpoints from %s", eval_args.policy_store_dir) - logging.info("Replays will NOT be generated") - rank_policies(eval_args.policy_store_dir, eval_args.device) - else: + if getattr(eval_args, "replay_mode", False): logging.info("Generating replays from the checkpoints in %s", eval_args.policy_store_dir) save_replays(eval_args.policy_store_dir, eval_args.replay_save_dir) + else: + logging.info("Ranking checkpoints from %s", eval_args.policy_store_dir) + logging.info("Replays will NOT be generated") + rank_policies(eval_args.policy_store_dir, eval_args.task_file, eval_args.device) diff --git a/leader_board.py b/leader_board.py index 1e6edbf..1c8e783 100644 --- a/leader_board.py +++ b/leader_board.py @@ -36,6 +36,7 @@ class TeamResult: time_alive: int = 0, earned_gold: int = 0, completed_task_count: int = 0, + max_task_progress: float = 0, damage_received: int = 0, damage_inflicted: int = 0, ration_consumed: int = 0, @@ -72,6 +73,7 @@ def names(cls) -> List[str]: "time_alive", "earned_gold", "completed_task_count", + "max_task_progress", "damage_received", "damage_inflicted", "ration_consumed", @@ -110,8 +112,9 @@ class StatPostprocessor(pufferlib.emulation.Postprocessor): """Postprocessing actions and metrics of Neural MMO. Process wandb/leader board stats, and save replays. 
""" - def __init__(self, env, agent_id): + def __init__(self, env, agent_id, eval_mode=False): super().__init__(env, is_multiagent=True, agent_id=agent_id) + self.eval_mode = eval_mode self._reset_episode_stats() def reset(self, observation): @@ -125,6 +128,7 @@ def _reset_episode_stats(self): self._cod_starved = 0 self._cod_dehydrated = 0 self._task_completed = 0 + self._max_task_progress = 0 self._task_with_2_reward_signal = 0 self._task_with_0p2_max_progress = 0 self._curriculum = defaultdict(list) @@ -156,19 +160,20 @@ def _update_stats(self, agent): task = self.env.agent_task_map[agent.ent_id][0] # For each task spec, record whether its max progress and reward count self._curriculum[task.spec_name].append((task._max_progress, task.reward_signal_count)) + self._max_task_progress = task._max_progress if task.reward_signal_count >= 2: - self._task_with_2_reward_signal += 1.0 + self._task_with_2_reward_signal = 1.0 if task._max_progress >= 0.2: - self._task_with_0p2_max_progress += 1.0 + self._task_with_0p2_max_progress = 1.0 if task.completed: - self._task_completed += 1.0 + self._task_completed = 1.0 if agent.damage.val > 0: - self._cod_attacked += 1.0 + self._cod_attacked = 1.0 elif agent.food.val == 0: - self._cod_starved += 1.0 + self._cod_starved = 1.0 elif agent.water.val == 0: - self._cod_dehydrated += 1.0 + self._cod_dehydrated = 1.0 self._combat_level.append(agent.attack_level) self._harvest_level.append(max( @@ -249,6 +254,7 @@ def reward_done_info(self, reward, done, info): info["stats"][key] = float(val) # Fill in the "TeamResult" + result.max_task_progress = self._max_task_progress result.total_score = self._curr_unique_count result.time_alive = self._time_alive result.earned_gold = achieved["achieved/earned_gold"] @@ -268,6 +274,10 @@ def reward_done_info(self, reward, done, info): info["team_results"] = (self.agent_id, result) + if self.eval_mode: + # "return" is used for ranking in the eval mode, so put the task progress here + info["return"] = self._max_task_progress # this is 1 if done + return reward, done, info # Event processing utilities for Neural MMO. 
diff --git a/reinforcement_learning/config.py b/reinforcement_learning/config.py index 243c23e..f23e894 100644 --- a/reinforcement_learning/config.py +++ b/reinforcement_learning/config.py @@ -16,8 +16,10 @@ class Config: num_cores = None # Number of cores to use for training num_envs = 6 # Number of environments to use for training num_buffers = 2 # Number of buffers to use for training - rollout_batch_size = 32768 # Number of steps to rollout + rollout_batch_size = 2**15 # Number of steps to rollout + eval_batch_size = 2**15 # Number of steps to rollout for eval train_num_steps = 10_000_000 # Number of steps to train + eval_num_steps = 1_000_000 # Number of steps to evaluate checkpoint_interval = 30 # Interval to save models run_name = f"nmmo_{time.strftime('%Y%m%d_%H%M%S')}" # Run name runs_dir = "/tmp/runs" # Directory for runs @@ -47,6 +49,7 @@ class Config: map_size = 128 # Size of maps to use for training resilient_population = 0.2 # Percentage of agents to be resilient to starvation/dehydration tasks_path = None # Path to tasks to use for training + eval_mode = False # Run the postprocessor in the eval mode early_stop_agent_num = 8 # Stop the episode when the number of agents reaches this number sqrt_achievement_rewards=False # Use the log of achievement rewards heal_bonus_weight = 0.03 diff --git a/reinforcement_learning/eval_task_with_embedding.pkl b/reinforcement_learning/eval_task_with_embedding.pkl new file mode 100644 index 0000000..4ccd2ac Binary files /dev/null and b/reinforcement_learning/eval_task_with_embedding.pkl differ diff --git a/reinforcement_learning/policy.py b/reinforcement_learning/policy.py index ba82cd2..3bb4d58 100644 --- a/reinforcement_learning/policy.py +++ b/reinforcement_learning/policy.py @@ -35,9 +35,11 @@ def critic(self, hidden): class Baseline(pufferlib.models.Policy): - def __init__(self, envs, input_size, hidden_size, task_size): + def __init__(self, env, input_size=256, hidden_size=256, task_size=4096): super().__init__() - self.envs = envs + + self.flat_observation_space = env.flat_observation_space + self.flat_observation_structure = env.flat_observation_structure self.tile_encoder = TileEncoder(input_size) self.player_encoder = PlayerEncoder(input_size, hidden_size) @@ -50,7 +52,8 @@ def __init__(self, envs, input_size, hidden_size, task_size): self.value_head = torch.nn.Linear(hidden_size, 1) def encode_observations(self, flat_observations): - env_outputs = self.envs.unpack_batched_obs(flat_observations) + env_outputs = pufferlib.emulation.unpack_batched_obs(flat_observations, + self.flat_observation_space, self.flat_observation_structure) tile = self.tile_encoder(env_outputs["Tile"]) player_embeddings, my_agent = self.player_encoder( env_outputs["Entity"], env_outputs["AgentId"][:, 0] diff --git a/scripts/slurm_run.sh b/scripts/slurm_run.sh index cebf651..a8080bf 100755 --- a/scripts/slurm_run.sh +++ b/scripts/slurm_run.sh @@ -3,22 +3,21 @@ # Example ussage: # # sbatch ./scripts/slurm_run.sh scripts/train_baseline.sh \ -# --train.run_name=realikun_16x8_0001 +# --run-name=test --wandb-project=nmmo --wandb-entity=kywch #SBATCH --account=carperai -#SBATCH --partition=g40 +#SBATCH --partition=g40x #SBATCH --nodes=1 #SBATCH --gpus=1 -#SBATCH --cpus-per-gpu=12 +#SBATCH --cpus-per-gpu=6 #__SBATCH --mem=80G -#SBATCH --chdir=/fsx/home-daveey/nmmo-baselines/ +#SBATCH --chdir=/fsx/proj-nmmo/nmmo-baselines/ #SBATCH --output=sbatch/%j.log #SBATCH --error=sbatch/%j.log #SBATCH --requeue -#SBATCH 
--export=PYTHONUNBUFFERED=1,WANDB_BASE_URL="https://stability.wandb.io",WANDB_DIR=/fsx/home-daveey/tmp/wandb,WANDB_CONFIG_DIR=/fsx/home-daveey/tmp/wandb +#SBATCH --export=PYTHONUNBUFFERED=1,WANDB_BASE_URL="https://stability.wandb.io",WANDB_DIR=/fsx/proj-nmmo/tmp/wandb,WANDB_CONFIG_DIR=/fsx/proj-nmmo/tmp/wandb -source /fsx/home-daveey/miniconda/etc/profile.d/conda.sh && \ -conda activate nmmo && \ +source /fsx/proj-nmmo/venv/bin/activate && \ ulimit -c unlimited && \ ulimit -s unlimited && \ ulimit -a diff --git a/scripts/train_baseline.sh b/scripts/train_baseline.sh index 32464ac..a7d2a56 100755 --- a/scripts/train_baseline.sh +++ b/scripts/train_baseline.sh @@ -1,14 +1,5 @@ #!/bin/bash -python -u -O -m train \ ---wandb.entity=daveey \ ---wandb.project=nmmo \ ---train.runs_dir=/fsx/home-daveey/runs/ \ ---train.num_steps=10000000000 \ ---env.num_maps=100 \ ---rollout.num_buffers=2 \ ---rollout.num_envs=12 \ ---env.num_npcs=266 \ ---ppo.training_batch_size=128 \ ---rollout.batch_size=131072 \ +python -u -m train \ +--runs-dir=/fsx/proj-nmmo/runs/ \ "${@}" diff --git a/train.py b/train.py index 682df2d..08f08c0 100644 --- a/train.py +++ b/train.py @@ -31,7 +31,7 @@ def setup_env(args): def make_policy(envs): learner_policy = policy.Baseline( - envs, + envs._driver_env, input_size=args.input_size, hidden_size=args.hidden_size, task_size=args.task_size @@ -114,6 +114,7 @@ def curriculum_generation_track(trainer, args, use_elm=True): # Use the train_task_spec to train agents task_encoder.get_task_embedding(curriculum, save_to_file=CUSTOM_CURRICULUM_FILE) task_encoder.close() + trainer.data.sort_keys = [] reinforcement_learning_track(trainer, args) if __name__ == "__main__":