Update RLlib examples to use PPO instead of PG.
Gamenot committed Jan 4, 2024
1 parent d7adfe2 commit 3c18a1d
Showing 5 changed files with 16 additions and 15 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -15,6 +15,7 @@ Copy and pasting the git commit messages is __NOT__ enough.
- The following modules have been renamed: `envision.types` -> `envision.etypes`, `smarts.core.utils.logging` -> `smarts.core.utils.core_logging`, `smarts.core.utils.math` -> `smarts.core.utils.core_math`, `smarts.sstudio.types` -> `smarts.sstudio.sstypes`. For compatibility reasons they can still be imported by their original module name.
- Exposed `traffic:traci_retries`/`SMARTS_TRAFFIC_TRACI_RETRIES` to control how many times the `SumoTrafficSimulation` will try to restart when using default configuration.
- `rllib` is now constrained as `<=2.9,>2.4`.
+ - The `examples/e12_rllib` training examples `{pg_example|pg_pbt_example}.py` have been changed to `{ppo_example|ppo_pbt_example}.py`. `Policy Gradients (PG)` has been dropped in favor of the more well documented `Proximal Policy Optimization (PPO)`.
### Deprecated
### Fixed
- `SumoTrafficSimulation` gives clearer reasons as to why it failed to connect to the TraCI server.
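
Editorial aside, not part of the commit diff: a short illustrative sketch of two CHANGELOG items above, the renamed modules (with their stated backward-compatible import paths) and the `SMARTS_TRAFFIC_TRACI_RETRIES` setting. The retry value used here is an arbitrary example, not a documented default.

# Illustrative only; not part of this commit.
import os

# New module names listed in the CHANGELOG entry above:
import smarts.core.utils.core_logging
import smarts.core.utils.core_math
import smarts.sstudio.sstypes

# The CHANGELOG states the original names still import for compatibility:
import smarts.core.utils.math  # resolves to the same module as core_math

# SMARTS_TRAFFIC_TRACI_RETRIES controls how many times SumoTrafficSimulation
# will try to restart under the default configuration; "5" is an arbitrary
# example value.
os.environ["SMARTS_TRAFFIC_TRACI_RETRIES"] = "5"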
8 changes: 4 additions & 4 deletions docs/ecosystem/rllib.rst
@@ -6,17 +6,17 @@ RLlib

**RLlib** is an open-source library for reinforcement learning that offers both high scalability and a unified API for a variety of applications. ``RLlib`` natively supports ``TensorFlow``, ``TensorFlow Eager``, and ``PyTorch``. Most of its internals are agnostic to such deep learning frameworks.

- SMARTS contains two examples using `Policy Gradients (PG) <https://docs.ray.io/en/latest/rllib-algorithms.html#policy-gradients-pg>`_.
+ SMARTS contains two examples using `Proximal Policy Optimization (PPO) <https://docs.ray.io/en/latest/rllib/rllib-algorithms.html#ppo>`_.

#. Policy gradient

- + script: :examples:`e12_rllib/pg_example.py`
+ + script: :examples:`e12_rllib/ppo_example.py`
+ Shows the basics of using RLlib with SMARTS through :class:`~smarts.env.rllib_hiway_env.RLlibHiWayEnv`.

#. Policy gradient with population based training

- + script: :examples:`e12_rllib/pg_pbt_example.py`
- + Combines Policy Gradients with `Population Based Training (PBT) <https://docs.ray.io/en/latest/tune/api/doc/ray.tune.schedulers.PopulationBasedTraining.html>`_ scheduling.
+ + script: :examples:`e12_rllib/ppo_pbt_example.py`
+ + Combines Proximal Policy Optimization with `Population Based Training (PBT) <https://docs.ray.io/en/latest/tune/api/doc/ray.tune.schedulers.PopulationBasedTraining.html>`_ scheduling.


Recommended reads
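Editorial aside, not part of the commit diff: a minimal, self-contained sketch of the `PPOConfig` pattern the renamed examples and the documentation above now use. It stands in a toy Gymnasium environment for `RLlibHiWayEnv` so it runs without SMARTS scenarios; the hyperparameter values are arbitrary, and Ray RLlib in the `<=2.9,>2.4` range pinned in the CHANGELOG is assumed.

# Minimal PPO sketch; RLlibHiWayEnv and its env_config are replaced by a toy
# environment so the snippet stands alone. Values are illustrative.
from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment(env="CartPole-v1")  # ppo_example.py passes RLlibHiWayEnv here
    .rollouts(num_rollout_workers=1)
    .training(lr=1e-3, train_batch_size=4000)
)
algo = config.build()
for _ in range(3):
    result = algo.train()
    print(result["episode_reward_mean"])
algo.stop()
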
3 changes: 3 additions & 3 deletions examples/e12_rllib/{pg_example.py → ppo_example.py}
@@ -7,7 +7,7 @@
try:
from ray.rllib.algorithms.algorithm import Algorithm, AlgorithmConfig
from ray.rllib.algorithms.callbacks import DefaultCallbacks
- from ray.rllib.algorithms.pg import PGConfig
+ from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.env.base_env import BaseEnv
from ray.rllib.evaluation.episode import Episode
from ray.rllib.evaluation.episode_v2 import EpisodeV2
@@ -106,7 +106,7 @@ def main(
smarts.core.seed(seed)
assert len(set(rllib_policies.keys()).difference(agent_specs)) == 0
algo_config: AlgorithmConfig = (
- PGConfig()
+ PPOConfig()
.environment(
env=RLlibHiWayEnv,
env_config={
@@ -129,7 +129,7 @@
enable_tf1_exec_eagerly=True,
)
.training(
- lr_schedule=[(0, 1e-3), (1e3, 5e-4), (1e5, 1e-4), (1e7, 5e-5), (1e8, 1e-5)],
+ lr=[[0, 1e-3], [1e3, 5e-4], [1e5, 1e-4], [1e7, 5e-5], [1e8, 1e-5]],
train_batch_size=train_batch_size,
)
.multi_agent(
2 changes: 2 additions & 2 deletions examples/e12_rllib/{pg_pbt_example.py → ppo_pbt_example.py}
@@ -14,7 +14,7 @@
from ray import tune
from ray.rllib.algorithms.algorithm import AlgorithmConfig
from ray.rllib.algorithms.callbacks import DefaultCallbacks
- from ray.rllib.algorithms.pg import PGConfig
+ from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.env.base_env import BaseEnv
from ray.rllib.evaluation.episode import Episode
from ray.rllib.evaluation.episode_v2 import EpisodeV2
@@ -147,7 +147,7 @@ def main(
smarts.core.seed(seed)
assert len(set(rllib_policies.keys()).difference(agent_specs)) == 0
algo_config: AlgorithmConfig = (
- PGConfig()
+ PPOConfig()
.environment(
env=RLlibHiWayEnv,
env_config={
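Editorial aside, not part of the commit diff: a compact sketch of pairing PPO with Ray Tune's `PopulationBasedTraining` scheduler, the combination the renamed `ppo_pbt_example.py` is described as demonstrating in the documentation above. The environment, metric, and mutation range are illustrative assumptions, not values taken from the example.

# Illustrative PPO + PBT pairing; hyperparameters and environment are assumptions.
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.schedulers import PopulationBasedTraining

pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="episode_reward_mean",
    mode="max",
    perturbation_interval=5,
    hyperparam_mutations={"lr": tune.loguniform(1e-5, 1e-3)},
)

tuner = tune.Tuner(
    "PPO",  # registered algorithm name; the SMARTS example builds its own PPOConfig
    param_space=PPOConfig().environment(env="CartPole-v1").to_dict(),
    tune_config=tune.TuneConfig(scheduler=pbt, num_samples=2),
)
results = tuner.fit()
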
12 changes: 6 additions & 6 deletions examples/tests/test_examples.py
@@ -73,10 +73,10 @@ def test_examples(example):
)


- def test_rllib_pg_example():
- from examples.e12_rllib import pg_example
+ def test_rllib_ppo_example():
+ from examples.e12_rllib import ppo_example

- main = pg_example.main
+ main = ppo_example.main
with tempfile.TemporaryDirectory() as result_dir:
main(
scenarios=["./scenarios/sumo/loop"],
@@ -95,10 +95,10 @@ def test_rllib_pg_example():
)


- def test_rllib_tune_pg_example():
- from examples.e12_rllib import pg_pbt_example
+ def test_rllib_tune_ppo_example():
+ from examples.e12_rllib import ppo_pbt_example

- main = pg_pbt_example.main
+ main = ppo_pbt_example.main
with tempfile.TemporaryDirectory() as result_dir, tempfile.TemporaryDirectory() as model_dir:
main(
scenarios=["./scenarios/sumo/loop"],
