diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..38df9a4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+models/*
+downloads/*
+data/*
+
+.idea
+.vscode
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/notebooks/eval-a2a.ipynb b/notebooks/eval-a2a.ipynb
new file mode 100644
index 0000000..e69de29
diff --git a/notebooks/eval-a2h.ipynb b/notebooks/eval-a2h.ipynb
new file mode 100644
index 0000000..e69de29
diff --git a/notebooks/eval-a2h_p.ipynb b/notebooks/eval-a2h_p.ipynb
new file mode 100644
index 0000000..427ef0f
--- /dev/null
+++ b/notebooks/eval-a2h_p.ipynb
@@ -0,0 +1,99 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-01-17T13:18:30.407261700Z",
+ "start_time": "2024-01-17T13:18:30.398739100Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pickle\n",
+ "import json"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "DATA_PATH = \"../data/\"\n",
+ "MODEL_PATH = \"../models/\"\n",
+ "RUN_TYPE = \"ppo_runs/\"\n",
+ "# Specify the path to your pickle file\n",
+ "pickle_file_name = '../data/ppo_runs/ppo_sp_models_performance'\n",
+ "pickle_file_path = pickle_file_name + '.pickle'\n",
+ "\n",
+ "# Open the file in binary mode and load the pickled data\n",
+ "with open(pickle_file_path, 'rb') as file:\n",
+ " data = pickle.load(file)\n",
+ " # 转为格式化后的json\n",
+ " json_data = json.dumps(data, indent=2)\n",
+ " # 保存到文件\n",
+ " with open(pickle_file_name + '.json', 'w') as json_file:\n",
+ " json_file.write(json_data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "pickle_file_path = MODEL_PATH + 'simple/' + 'FCP/' + 'dummy_env.pickle'\n",
+ "\n",
+ "with open(pickle_file_path, 'rb') as file:\n",
+ " data = pickle.load(file)\n",
+ " # 转为格式化后的json\n",
+ " json_data = json.dumps(data, indent=2)\n",
+ " # 保存到文件\n",
+ " with open(MODEL_PATH + 'simple/' + 'FCP/' + 'dummy_env.json', 'w') as json_file:\n",
+ " json_file.write(json_data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "dict = {'simple': {'PPO_SP+PPO_SP': [308.0, 272.0, 304.0, 340.0, 256.0], 'PPO_SP+BC_test_0': [80.0, 28.0, 152.0, 100.0, 120.0], 'PPO_SP+BC_test_1': [132.0, 80.0, 152.0, 132.0, 156.0]}, 'unident_s': {'PPO_SP+PPO_SP': [176.0, 200.0, 212.0, 188.0, 140.0], 'PPO_SP+BC_test_0': [32.0, 20.0, 28.0, 52.0, 60.0], 'PPO_SP+BC_test_1': [140.0, 112.0, 44.0, 96.0, 92.0]}, 'random1': {'PPO_SP+PPO_SP': [280.0, 288.0, 296.0, 264.0, 256.0], 'PPO_SP+BC_test_0': [28.0, 108.0, 72.0, 36.0, 40.0], 'PPO_SP+BC_test_1': [36.0, 72.0, 108.0, 24.0, 52.0]}, 'random0': {'PPO_SP+PPO_SP': [184.0, 184.0, 152.0, 184.0, 148.0], 'PPO_SP+BC_test_0': [36.0, 8.0, 20.0, 24.0, 20.0], 'PPO_SP+BC_test_1': [68.0, 12.0, 24.0, 36.0, 44.0]}, 'random3': {'PPO_SP+PPO_SP': [164.0, 140.0, 156.0, 132.0, 120.0], 'PPO_SP+BC_test_0': [28.0, 44.0, 36.0, 56.0, 36.0], 'PPO_SP+BC_test_1': [32.0, 56.0, 32.0, 40.0, 44.0]}}\n",
+ "\n",
+ "json_data = json.dumps(dict, indent=2)\n",
+ "# 保存到文件\n",
+ "with open('our.json', 'w') as json_file:\n",
+ " json_file.write(json_data)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "pytorch2-cuda",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/slidev/slides.md b/slidev/slides.md
new file mode 100644
index 0000000..3dd787c
--- /dev/null
+++ b/slidev/slides.md
@@ -0,0 +1,443 @@
+---
+theme: academic
+layout: cover
+class: text-white
+coverAuthor: 胡逸同
+coverAuthorUrl: https://yitong-hu.metattri.com
+coverBackgroundUrl: https://lsky.metattri.com/i/2024/01/19/65aa2a5e7ddab.gif
+coverBackgroundSource: Overcooked-AI
+coverBackgroundSourceUrl: https://github.com/HumanCompatibleAI/overcooked_ai
+hideInToc: true
+fonts:
+ local: Montserrat, Roboto Mono, Roboto Slab # local fonts are used for legal reasons for deployment to https://slidev-theme-academic.alexeble.de and only set up for the example project, remove this line for your project to automatically have fonts imported from Google
+themeConfig:
+ paginationX: r
+ paginationY: t
+ paginationPagesDisabled: [1]
+title: Zero-Shot Coordination in Overcooked-AI
+info: |
+ # slidev-theme-academic
+
+ Created and maintained by [Alexander Eble](https://www.alexeble.de).
+
+ - [GitHub](https://github.com/alexanderdavide/slidev-theme-academic)
+ - [npm](https://www.npmjs.com/package/slidev-theme-academic)
+
+ slidev-theme-academic is licensed under [MIT](https://github.com/alexanderdavide/slidev-theme-academic/blob/master/LICENSE).
+
+
+---
+
+# Zero-Shot Coordination in Overcooked-AI
+
+
+
+---
+hideInToc: true
+---
+
+# Table of Contents
+
+
+
+---
+layout: center
+class: "text-center"
+---
+
+# Overcooked-AI
+
+---
+
+## Overview
+
+[Overcooked-AI](https://github.com/HumanCompatibleAI/overcooked_ai) is a benchmark environment developed by the [CHAI](https://humancompatible.ai/about/) team at UC Berkeley. Built around the game [Overcooked](http://www.ghosttowngames.com/overcooked/), it evaluates how well algorithms perform on **fully cooperative human-AI** tasks.
+
+In Overcooked-AI, two players must **cooperate** to complete the sequence `fetch ingredients - carry them - put them in the pot - plate the soup - serve it` and earn a shared team score. Agents have to learn map navigation, object interaction, and serving while staying coordinated with their partner, making this a common-payoff game.
+
+**Environment**:
+
+- 2 agents; agent pair = $[A_0,A_1], A_i \in [AI, Human]$
+- 5 layouts, each with different terrain and object placement
+- Interactable objects = [onion, dish, pot, counter, serving area]; the environment spawns onions and dishes indefinitely
+
+![layout](https://lsky.metattri.com/i/2024/01/19/65aa2a5e7ddab.gif)
+
+---
+layout: figure-side
+figureCaption: Human (Green Hat) vs. AI (Blue Hat) in Coordination Ring
+figureUrl: https://lsky.metattri.com/i/2024/01/24/65b0ca8512b73.gif
+---
+
+**Agents**:
+
+- Action space = [move up/down/left/right, stay, interact]
+  > Interact: the effect depends on the object, e.g., putting an onion into the pot, scooping the soup into a dish, placing a filled dish in the serving area, or parking an onion/dish on a counter
+- The environment is fully observable (MDP), and can be generalized to partial observability (POMDP)
+
+**Task**:
+
+- `put 3 onions in the pot - cook for 20 timesteps - pour the soup into a dish - place the dish in the serving area`
+- **Only served dishes score**, and there is a time limit
+- There is only a team score, no individual scores
+
+---
+
+**Layouts**:
+
+![layout](https://lsky.metattri.com/i/2024/01/19/65aa2a5e7ddab.gif)
+
+Different layouts require different coordination strategies. From left to right:
+
+1. Cramped Room: poses low-level coordination challenges; the tight space makes it easy for the agents to collide.
+2. Asymmetric Advantages: tests whether players can choose high-level strategies that play to their own strengths.
+3. Coordination Ring: players must coordinate to move between the bottom-left and top-right corners of the layout.
+4. Forced Coordination: removes collision-coordination issues and forces players to develop high-level joint strategies, since neither player can serve a dish alone.
+5. Counter Circuit: involves an implicit coordination strategy of passing onions across the counter to the pot instead of carrying them around.
+
+---
+
+**The game is modeled as a multi-agent MDP** [^UtilityLearningHumans2019] (a minimal Python sketch follows the definition):
+
+A multi-agent MDP is defined by a tuple $\langle S, \alpha, \{A_{i \in \alpha}\}, \mathcal{T}, R \rangle$:
+
+- $S$ is a finite set of states, and $R : S \to \mathbb{R}$ is a real-valued reward function.
+- $\alpha$ is a finite set of agents.
+- $A_i$ is the finite set of actions available to agent $i$.
+- $\mathcal{T} : S \times A_1 \times \cdots \times A_n \times S \to [0, 1]$ is a transition function that determines the next state given all of the agents’ actions.
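+
+The sketch below is only an illustrative rendering of this tuple; the field names and types are our own assumptions, not code from the paper or the Overcooked-AI repository.
+
+```python
+from dataclasses import dataclass
+from typing import Callable, Dict, FrozenSet
+
+@dataclass(frozen=True)
+class MultiAgentMDP:
+    states: FrozenSet[str]              # S: finite set of states
+    agents: FrozenSet[str]              # alpha: finite set of agents
+    actions: Dict[str, FrozenSet[str]]  # A_i: finite action set per agent
+    transition: Callable[..., float]    # T(s, a_1, ..., a_n, s') -> [0, 1]
+    reward: Callable[[str], float]      # R(s) -> real-valued reward
+```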
+
+[^UtilityLearningHumans2019]: Carroll, M., Shah, R., Ho, M. K., Griffiths, T., Seshia, S., Abbeel, P., & Dragan, A. (2019). On the Utility of Learning about Humans for Human-AI Coordination. Advances in Neural Information Processing Systems, 32. https://proceedings.neurips.cc/paper_files/paper/2019/hash/f5b1b89d98b7286673128a5fb112cb9a-Abstract.html
+
+---
+
+## Legacy Issues
+
+Early versions of the Overcooked-AI components had tangled dependencies and were not backward compatible.
+
+**As of 2023**, the CHAI team has merged these components into the [Overcooked-AI](https://github.com/HumanCompatibleAI/overcooked_ai) repository. Compared with the `neurips2019` version, the current suite is heavily improved: `human_aware_rl` adopts [Ray](https://docs.ray.io/en/latest/rllib/rllib-training.html) as its distributed training framework, `overcooked_ai_py` adds a new in-game action (cooking the ingredients), `overcooked_demo` can update `overcooked_ai_py` with one click and demo the game on the web, and the documentation and examples are much richer.
+
+However, all of the related work surveyed so far implements its algorithms on the `neurips2019` [^UtilityLearningHumans2019] version, which the current suite is not compatible with.
+
+**For now, the related work will therefore be reproduced on top of the `neurips2019` version.**
+
+[^UtilityLearningHumans2019]: Carroll, M., Shah, R., Ho, M. K., Griffiths, T., Seshia, S., Abbeel, P., & Dragan, A. (2019). On the Utility of Learning about Humans for Human-AI Coordination. Advances in Neural Information Processing Systems, 32. https://proceedings.neurips.cc/paper_files/paper/2019/hash/f5b1b89d98b7286673128a5fb112cb9a-Abstract.html
+
+---
+
+## Future Improvements
+
+Later on, so that zero-shot coordination (ZSC) researchers can enjoy the features of a modern benchmark environment, we could try to:
+
+1. convert the `neurips2019` models into a format compatible with the current version, or
+1. reproduce the related work on the current Overcooked-AI, or on Melting Pot [^MeltingPot2023] [^MeltingPotResearch2023]
+
+> Melting Pot is a more modern MARL benchmark environment from DeepMind. It also implements the Overcooked game and can be viewed as a superset of Overcooked-AI. Melting Pot focuses on multi-agent interaction in social situations and offers richer features, e.g., finer-grained environment configuration (an adjustable observation window), more interaction tasks (games), more players, and more sensible evaluation metrics, making it a promising new standard for ZSC research.
+
+**Future MARL work should consider using Melting Pot as the simulator.**
+
+[^MeltingPot2023]: Agapiou, J. P., Vezhnevets, A. S., Duéñez-Guzmán, E. A., Matyas, J., Mao, Y., Sunehag, P., Köster, R., Madhushani, U., Kopparapu, K., Comanescu, R., Strouse, D. J., Johanson, M. B., Singh, S., Haas, J., Mordatch, I., Mobbs, D., & Leibo, J. Z. (2023). Melting Pot 2.0 (arXiv:2211.13746). arXiv. https://doi.org/10.48550/arXiv.2211.13746
+
+[^MeltingPotResearch2023]: Hu Y. (2023). Melting Pot Research Report. https://slidev.metattri.com/
+
+---
+layout: center
+class: "text-center"
+---
+
+# Zero-Shot Coordination Baselines
+
+> Zero-shot coordination: cooperating with previously unseen partners (humans or AIs)
+>
+> Most of the work below uses Overcooked-AI to evaluate algorithm performance
+
+---
+
+## On the Utility of Learning about Humans for Human-AI Coordination (HARL) [^UtilityLearningHumans2019]
+
+In 2019, self-play (SP) and population-based training (PBT) were the two common MARL training strategies for producing agents that cooperate with humans.
+
+This paper argues that SP and PBT agents implicitly assume their partner is optimal or similar to themselves, whereas human behavior is neither optimal nor easy to predict; as a result, such agents cooperate better with themselves than with humans, and incorporating human data or human models into training should improve human-AI coordination. The paper therefore introduces the Overcooked-AI environment and proposes:
+
+- a behavior cloning model (BC) and a proxy human model H$_{Proxy}$
+  > Both are action classifiers (predictors) trained on human data. BC is the partner used while training the agent, whereas H$_{Proxy}$ serves as ground truth for evaluating the agent; their relationship is analogous to that of a training set and a test set
+- two families of agents that cooperate with humans
+  - without human data: Self-Play (SP), Population-Based Training (PBT), and planning methods
+  - with human data: PPO with a human model (PPO$_{BC}$), and planning methods
+
+[^UtilityLearningHumans2019]: Carroll, M., Shah, R., Ho, M. K., Griffiths, T., Seshia, S., Abbeel, P., & Dragan, A. (2019). On the Utility of Learning about Humans for Human-AI Coordination. Advances in Neural Information Processing Systems, 32. https://proceedings.neurips.cc/paper_files/paper/2019/hash/f5b1b89d98b7286673128a5fb112cb9a-Abstract.html
+
+---
+
+### Method
+
+**Self-Play**: play with self in each iteration, using PPO.
+
+**Population-Based Training (PBT)**: play with n agents in each iteration, using PPO.
+
+- Population size $n=3$ (in this paper); each agent has the same architecture as the SP agent, but its partners are the other agents rather than a copy of itself
+- The PBT algorithm can be summarized as: initialize n agents, train them in pairs, and mutate (evolve) the worst one; details below:
+
+  ```python
+  agents = [init_agent() for _ in range(n)]        # population of n PPO agents
+  performance = [0.0] * n
+  while not converged:
+      for i in range(n):
+          for j in range(i + 1, n):
+              train(agents[i], partner=agents[j])  # PPO update for agent_i; agent_j is embedded in the environment
+          performance[i] = evaluate(agents[i])     # evaluate agent_i after its round of pairings
+      worst = min(range(n), key=lambda k: performance[k])
+      agents[worst] = mutate(agents[worst])        # replace the worst-performing agent with a mutated copy
+  ```
+
+---
+
+**PPO$_{BC}$**: play with BC in each iteration, using PPO
+
+1. Train a behavior cloning (BC) model on human gameplay data
+   - a classification task
+   - trained with the cross-entropy loss
+2. Embed BC as part of the environment and train the agent with PPO as the policy-gradient algorithm (a minimal sketch follows)
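+
+The BC stage is ordinary supervised classification. The sketch below illustrates one gradient step with PyTorch on stand-in data; the observation size, network, and variable names are assumptions rather than the paper's implementation, and the PPO stage is only indicated in the closing comment.
+
+```python
+import torch
+import torch.nn as nn
+
+OBS_DIM, N_ACTIONS = 62, 6          # assumed featurized observation size and 6-action space
+
+bc = nn.Sequential(                 # behavior cloning model: observation -> action logits
+    nn.Linear(OBS_DIM, 64), nn.ReLU(),
+    nn.Linear(64, N_ACTIONS),
+)
+optimizer = torch.optim.Adam(bc.parameters(), lr=1e-3)
+loss_fn = nn.CrossEntropyLoss()
+
+# Stand-in for a batch of logged human (observation, action) pairs.
+obs = torch.randn(32, OBS_DIM)
+human_actions = torch.randint(0, N_ACTIONS, (32,))
+
+optimizer.zero_grad()
+loss = loss_fn(bc(obs), human_actions)   # cross-entropy on the human actions
+loss.backward()
+optimizer.step()
+
+# Stage 2 (not shown): freeze `bc`, embed it as the partner inside the environment,
+# and train the ego agent against it with PPO.
+```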
+
+---
+layout: figure
+figureUrl: https://lsky.metattri.com/i/2024/01/22/65ad7022db181.png
+---
+
+### Evaluation
+
+#### AI-H$_{Proxy}$ Play
+
+---
+layout: figure
+figureUrl: https://lsky.metattri.com/i/2024/01/22/65ad6faa1e503.png
+---
+
+#### AI-Human Play
+
+---
+
+## Fictitious Co-Play (FCP) [^strouseCollaboratingHumansHuman2021]
+
+**Motivation**:
+
+1. Self-play (SP) and population play (PP) produce agents that overfit to their training partners and generalize poorly to humans
+2. PPO$_{BC}$ from HARL (called BCP in this paper) requires collecting large amounts of human data, which is laborious and expensive
+3. Cooperating with a novel partner requires handling symmetry problems, e.g., which way do two players yield when they meet: both left or both right?
+4. Cooperating with humans requires quickly understanding and adapting to their individual strengths, weaknesses, and preferences
+5. A good agent should be able to cooperate with partners of every skill level, not only with optimal ones
+
+**Contribution**:
+
+- proposes Fictitious Co-Play (FCP) to train agents capable of zero-shot coordination with humans
+- shows that FCP agents outperform SP, PP, and BCP when coordinating zero-shot with a variety of agents
+- shows that FCP clearly beats the BCP SOTA in both task score and human preference
+
+[^strouseCollaboratingHumansHuman2021]: Strouse, D., McKee, K., Botvinick, M., Hughes, E., & Everett, R. (2021). Collaborating with Humans without Human Data. Advances in Neural Information Processing Systems, 34, 14502–14515. https://proceedings.neurips.cc/paper/2021/hash/797134c3e42371bb4979a462eb2f042a-Abstract.html
+
+---
+layout: figure-side
+figureUrl: https://lsky.metattri.com/i/2024/01/22/65ad7277e4d7d.png
+---
+
+### Method
+
+**Stage 1**: **independently** train n SP agents, saving checkpoints from each training stage into a pool (representing different skill levels)
+
+**Stage 2**: train the FCP agent by pairing it with agents from the pool (a structural sketch follows the quote below)
+
+To generalize FCP to visual observations, the paper does not use PPO; instead, all agents (stages 1 & 2) are built with the V-MPO algorithm on a ResNet-plus-LSTM architecture and trained in parallel across distributed environments.
+
+> "For our reinforcement learning agents, we use the V-MPO [65] algorithm along with a ResNet [26] plus LSTM [29] architecture which we found led to optimal behavior across all layouts. Agents are trained using a distributed set of environments running in parallel [17], each sampling two agents from the training population to play together every episode." (Strouse et al., 2021, p. 4)
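+
+A structural sketch of the two stages is shown below; the RL updates are elided and every function name is a hypothetical placeholder, not code from the paper:
+
+```python
+import random
+
+def train_selfplay(seed: int, n_checkpoints: int = 3) -> list:
+    """Stage 1: train one self-play agent, keeping checkpoints of increasing skill."""
+    checkpoints = []
+    for ckpt_id in range(n_checkpoints):
+        # ... self-play RL training would run here; only a placeholder is recorded ...
+        checkpoints.append({"seed": seed, "checkpoint": ckpt_id})
+    return checkpoints
+
+def train_fcp_agent(partner_pool: list, n_episodes: int = 10) -> str:
+    """Stage 2: train one agent against frozen partners sampled from the pool."""
+    for _ in range(n_episodes):
+        partner = random.choice(partner_pool)   # a partner of unknown skill level each episode
+        # ... one RL episode with `partner` embedded in the environment ...
+    return "fcp_agent"
+
+pool = [ckpt for seed in range(4) for ckpt in train_selfplay(seed)]   # 4 independent SP runs
+fcp_agent = train_fcp_agent(pool)
+```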
+
+---
+
+### Evaluation
+
+The paper pairs FCP and the baselines with three types of agents and compares the number of deliveries:
+
+- the proxy human H$_{Proxy}$
+- an SP agent (as a skillful partner)
+- a randomly initialized policy (as a low-skill partner)
+
+![image-20240122230242572](https://lsky.metattri.com/i/2024/01/22/65ae839583503.png)
+
+---
+
+#### Ablations
+
+- FCP: agents in the pool share the same architecture but use different seeds; Stage 2 uses past checkpoints.
+- FCP$_{-T}$: like FCP but without past (unconverged) checkpoints, to test how important the checkpoints generated along the way are.
+- FCP$_{+A}$: like FCP but with varied agent architectures, to test whether architectural variety brings better diversity.
+- FCP$_{-T,+A}$: like FCP$_{+A}$ but without past checkpoints, to test whether architectural variety can replace the checkpoints generated along the way.
+
+![image-20240122231418688](https://lsky.metattri.com/i/2024/01/22/65ae864c85547.png)
+
+---
+
+**However**:
+
+- FCP is not only time-consuming but also prone to researcher bias, which can negatively affect the behavior of the resulting agents.
+
+- For more complex games, FCP may need a much larger pool, which can be impractical.
+
+---
+
+## Trajectory Diversity (TrajeDi) [^lupuTrajectoryDiversityZeroShot2021]
+
+TBD
+
+MEP carries forward the ideas of TrajeDi and reaches a new SOTA.
+
+[^lupuTrajectoryDiversityZeroShot2021]: Lupu, A., Cui, B., Hu, H., & Foerster, J. (2021). Trajectory Diversity for Zero-Shot Coordination. Proceedings of the 38th International Conference on Machine Learning, 7204–7213. https://proceedings.mlr.press/v139/lupu21a.html
+
+---
+
+## Maximum Entropy PBT (MEP) [^zhaoMaximumEntropyPopulationBased2023]
+
+### TL;DR
+
+In competitive settings, SP and PBT work well, but when cooperating with humans they both produce overly specific policies.
+
+One remedy is to bring human data into training, but collecting that data is expensive.
+
+Another is to increase the diversity of the agents involved in training:
+
+- **diverse set of policies**: e.g., TrajeDi optimizes the JS divergence between trajectories to encourage diversity, while FCP relies on random seeds and different checkpoints;
+- **domain randomization**: some features of the environment are changed randomly during training to make the policy robust to that feature; this paper's method can be viewed as domain randomization. The paper also adopts maximum-entropy RL (MERL), which maximizes return + entropy rather than return alone, making the policy more **exploratory** and more **robust**.
+
+[^zhaoMaximumEntropyPopulationBased2023]: Zhao, R., Song, J., Yuan, Y., Hu, H., Gao, Y., Wu, Y., Sun, Z., & Yang, W. (2023). Maximum Entropy Population-Based Training for Zero-Shot Human-AI Coordination. Proceedings of the AAAI Conference on Artificial Intelligence, 37, 6145–6153. https://doi.org/10.1609/aaai.v37i5.25758
+
+---
+
+### Method
+
+Like FCP, MEP is a two-stage method: first train a maximum-entropy population, then use that population to train a robust agent.
+
+The paper borrows ideas from maximum-entropy RL to modify the training objective, built around two notions: **population diversity and population entropy**.
+
+**Population Diversity**: each agent's own policy should be exploratory, and every pair of agents' policies should differ as much as possible (a small numeric sketch follows the formulas).
+
+$$
+\mathrm{PD}\left(\left\{\pi^{(1)}, \pi^{(2)}, \ldots, \pi^{(n)}\right\}, s_{t}\right):=\frac{1}{n} \sum_{i=1}^{n} \mathcal{H}\left(\pi^{(i)}\left(\cdot \mid s_{t}\right)\right)
++\frac{1}{n^{2}} \sum_{i=1}^{n} \sum_{j=1}^{n} D_{\mathrm{KL}}\left(\pi^{(i)}\left(\cdot \mid s_{t}\right), \pi^{(j)}\left(\cdot \mid s_{t}\right)\right)
+$$
+
+where KL-divergence ($D_{\mathrm{KL}}$) and entropy ($\mathcal{H}$) are defined as follows:
+
+$$
+D_{\mathrm{KL}}\left(\pi^{(i)}\left(\cdot \mid s_{t}\right), \pi^{(j)}\left(\cdot \mid s_{t}\right)\right)=
+\sum_{a_{t} \in \mathcal{A}} \pi^{(i)}\left(a_{t} \mid s_{t}\right) \log \frac{\pi^{(i)}\left(a_{t} \mid s_{t}\right)}{\pi^{(j)}\left(a_{t} \mid s_{t}\right)}
+$$
+
+$$
+\mathcal{H}\left(\pi^{(i)}\left(\cdot \mid s_{t}\right)\right)=-\sum_{a_{t} \in \mathcal{A}} \pi^{(i)}\left(a_{t} \mid s_{t}\right) \log \pi^{(i)}\left(a_{t} \mid s_{t}\right)
+$$
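+
+A small numeric sketch of $\mathrm{PD}$ at a single state, assuming each row of `pi` is one agent's action distribution $\pi^{(i)}(\cdot \mid s_t)$ (the values are made up for illustration):
+
+```python
+import numpy as np
+
+def entropy(p):
+    return -np.sum(p * np.log(p + 1e-12))
+
+def kl(p, q):
+    return np.sum(p * np.log((p + 1e-12) / (q + 1e-12)))
+
+def population_diversity(pi):
+    n = len(pi)
+    mean_entropy = np.mean([entropy(p) for p in pi])                           # (1/n) sum_i H(pi_i)
+    mean_kl = np.mean([kl(pi[i], pi[j]) for i in range(n) for j in range(n)])  # (1/n^2) sum_ij KL
+    return mean_entropy + mean_kl
+
+pi = np.array([[0.7, 0.2, 0.1],
+               [0.1, 0.8, 0.1],
+               [0.3, 0.3, 0.4]])
+print(population_diversity(pi))
+```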
+
+---
+
+**Population Entropy**: because PD is expensive to compute and the KL divergence is unbounded (which can cause convergence issues), the paper proposes PE, the entropy of the population's mean policy, which has linear complexity and is bounded, as a surrogate loss for PD. The paper also proves that PE is a lower bound of PD, which justifies using it as the surrogate.
+
+$$
+\mathrm{PE}\left(\left\{\pi^{(1)}, \pi^{(2)}, \ldots, \pi^{(n)}\right\}, s_{t}\right): = \mathcal{H}\left(\bar{\pi}\left(\cdot \mid s_{t}\right)\right),
+\text { where } \bar{\pi}\left(a_{t} \mid s_{t}\right): = \frac{1}{n} \sum_{i = 1}^{n} \pi^{(i)}\left(a_{t} \mid s_{t}\right)
+$$
+
+To train strategies that both cooperate well and remain mutually distinct, the paper adds a PE term to the objective and introduces a hyperparameter $\alpha$ to control its weight, giving the **MEP training objective** (a numeric sketch follows):
+
+$$
+J(\bar{\pi})=\sum_t\mathbb{E}_{(s_t,a_t)\sim\bar{\pi}}\left[R(s_t,a_t)+\alpha\mathcal{H}(\bar{\pi}(\cdot|s_t))\right]
+$$
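+
+A matching sketch for $\mathrm{PE}$ and the entropy-shaped reward $R(s_t,a_t) + \alpha\,\mathcal{H}(\bar{\pi}(\cdot \mid s_t))$, again with made-up numbers:
+
+```python
+import numpy as np
+
+def population_entropy(pi):
+    mean_policy = np.mean(pi, axis=0)                    # \bar{pi}(. | s_t): mean over the n agents
+    return -np.sum(mean_policy * np.log(mean_policy + 1e-12))
+
+pi = np.array([[0.7, 0.2, 0.1],
+               [0.1, 0.8, 0.1],
+               [0.3, 0.3, 0.4]])                         # n agents' action distributions at s_t
+alpha = 0.01                                             # entropy weight (hyperparameter)
+shaped_reward = 1.0 + alpha * population_entropy(pi)     # R(s_t, a_t) + alpha * PE
+print(shaped_reward)
+```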
+
+---
+
+#### **Stage 1**: train a maximum entropy population
+
+1. Randomly sample one agent from the population
+2. Optimize that agent's policy
+3. Repeat steps 1-2 until $J(\bar{\pi})$ converges
+
+![image-20240122172348406](https://lsky.metattri.com/i/2024/01/22/65ae342bd4d9e.png)
+
+> $r(s_t, a_t)$ is collected by pairing the sampled agent with a copy of itself as its partner, which amounts to self-play (a structural sketch of this loop follows)
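+
+A structural sketch of the Stage 1 loop; all names here are placeholders, not the paper's code:
+
+```python
+import random
+
+population = [f"agent_{i}" for i in range(5)]   # n = 5 policies in the population
+for step in range(100):                         # stand-in for "until J(pi_bar) converges"
+    agent = random.choice(population)           # 1. uniformly sample one agent
+    # 2. run self-play episodes (the agent paired with a copy of itself), shape the
+    #    reward as r(s_t, a_t) + alpha * PE, and apply a policy-gradient update to `agent`
+```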
+
+---
+
+#### **Stage 2**: train a robust agent (MEP agent) paired with the ME population
+
+Rather than sampling partners uniformly from the ME population to pair with the MEP agent, the paper uses learning-progress-based prioritized sampling (LPPS). LPPS picks the partner with the largest learning progress, which keeps the MEP agent exploratory.
+
+Concretely, LPPS does not maximize the average performance over all partners in the population, because the MEP agent might then learn to cooperate only with the easiest partners and give up on the hard ones. Instead, the paper uses rank-based prioritized sampling so that the MEP agent preferentially trains with hard-to-cooperate partners:
+
+$$
+p(\pi^{(i)})=\frac{\operatorname{rank}\left(1/\mathbb{E}_\tau\left[\sum_tR(s_t,a_t^{(A)},a_t^{(i)})\right]\right)^\beta}{\sum_{j=1}^n\operatorname{rank}\left(1/\mathbb{E}_\tau\left[\sum_tR(s_t,a_t^{(A)},a_t^{(j)})\right]\right)^\beta}
+$$
+
+Prioritized sampling is a smooth approximation of maximizing the minimum (in the extreme case of training only with the hardest partner, it becomes exactly max-min). When the population is large enough, some partner's policy will be ε-close to the human policy, and the paper also proves several lower-bound properties for human-AI coordination (a small numeric sketch of the sampling probabilities follows).
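+
+A small numeric sketch of the rank-based sampling probabilities, with made-up returns and $\beta$; the rank convention used here (lower return → larger rank value → higher probability) is our reading of the formula above:
+
+```python
+import numpy as np
+
+returns = np.array([120.0, 40.0, 80.0, 200.0])   # E[sum_t R] for each partner pi^(i)
+beta = 1.0                                        # priority exponent
+
+# Rank 1/return so the hardest partner (smallest return) gets the largest rank value.
+ranks = np.argsort(np.argsort(1.0 / returns)) + 1
+probs = ranks ** beta / np.sum(ranks ** beta)
+print(probs)   # hardest-to-cooperate partners are sampled most often
+```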
+
+---
+layout: figure
+figureUrl: https://lsky.metattri.com/i/2024/01/22/65ae7af17a694.png
+---
+
+### Evaluation
+
+#### AI-H$_{Proxy}$ Play
+
+---
+
+## Hidden-Utility Self-Play (HSP) [^yuLearningZeroShotCooperation]
+
+TBD
+
+[^yuLearningZeroShotCooperation]: Yu, C., Gao, J., Liu, W., Xu, B., Tang, H., Yang, J., Wang, Y., & Wu, Y. (n.d.). Learning Zero-Shot Cooperation with Humans, Assuming Humans Are Biased.
+
+---
+
+## PECAN [^louPECANLeveragingPolicy2023]
+
+Policy Ensemble Context-Aware zero-shot human-AI coordinatioN
+
+
+
+![image-20240122033547773](https://lsky.metattri.com/i/2024/01/22/65ad721756cbf.png)
+
+[^louPECANLeveragingPolicy2023]: Lou, X., Guo, J., Zhang, J., Wang, J., Huang, K., & Du, Y. (2023). PECAN: Leveraging Policy Ensemble for Context-Aware Zero-Shot Human-AI Coordination. Proceedings of the 2023 International Conference on Autonomous Agents and Multiagent Systems, 679–688.
+
+---
+
+## Cooperative Open-ended LEarning (COLE) [^CooperativeOpenendedLearning2023]
+
+### Method
+
+![image-20240123004328668](https://lsky.metattri.com/i/2024/01/23/65ae9b324dde7.png)
+
+[^CooperativeOpenendedLearning2023]: Li, Y., Zhang, S., Sun, J., Du, Y., Wen, Y., Wang, X., & Pan, W. (2023). Cooperative Open-ended Learning Framework for Zero-Shot Coordination. Proceedings of the 40th International Conference on Machine Learning, 20470–20484. https://proceedings.mlr.press/v202/li23au.html
+
+---
+
+### Evaluation
+
+#### AI-H$_{Proxy}$ Play
+
+![image-20240122233445010](https://lsky.metattri.com/i/2024/01/22/65ae8b17798cb.png)
+
+---
+layout: figure
+figureUrl: https://lsky.metattri.com/i/2024/01/22/65ae8b2899e17.png
+---
+
+#### AI-AI Play
+
+---
+layout: end
+hideInToc: true
+---
+
+# Thank you!
+
+> 胡逸同,2024/01/25