
Add a check in PPOExperiment to avoid unintended behaviors. (#16)
* refactor interval ops

* remove pybind in cpp code

* minor fix

* minor fix

* fix ppo cuda graph
garrett4wade authored Jul 3, 2024
1 parent 50d508f commit 6caa522
Showing 4 changed files with 14 additions and 16 deletions.
realhf/api/core/model_api.py: 2 additions & 1 deletion
@@ -36,7 +36,8 @@ class GenerationHyperparameters:
     :type temperature: float
     :param num_samples: The number of samples to generate.
     :type num_samples: int
-    :param use_cuda_graph: Whether to use CUDA graph.
+    :param use_cuda_graph: Whether to use CUDA graph to reduce kernel launch overhead
+        during generation. Recommended for pure generation.
     :type use_cuda_graph: bool
     """

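The flag documented above is a plain dataclass field, so enabling graph capture is a matter of setting it at construction time. A minimal usage sketch, assuming `GenerationHyperparameters` accepts the fields named in its docstring as keyword arguments and leaving everything else at its defaults:

    from realhf.api.core.model_api import GenerationHyperparameters

    # Hypothetical usage sketch; field names are taken from the docstring above.
    gen_args = GenerationHyperparameters(
        temperature=1.0,
        num_samples=1,
        use_cuda_graph=True,  # capture decoding kernels once, replay them each step
    )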
realhf/api/quickstart/model.py: 1 addition & 12 deletions
@@ -7,7 +7,7 @@
 logger = logging.getLogger("Quickstart Model Config")


-@dataclasses.dataclass
+@dataclasses.dataclass(unsafe_hash=True)
 class ParallelismConfig:
     """Model 3D parallelism configuration.
@@ -46,17 +46,6 @@ def __str__(self):
         )


-def parallelism_config_equal(
-    parallel1: ParallelismConfig, parallel2: ParallelismConfig
-) -> bool:
-    # NOTE: Implementing __eq__ in dataclass will cause error in hydra and omegaconf
-    return (
-        parallel1.model_parallel_size == parallel2.model_parallel_size
-        and parallel1.pipeline_parallel_size == parallel2.pipeline_parallel_size
-        and parallel1.data_parallel_size == parallel2.data_parallel_size
-    )
-
-
 @dataclasses.dataclass
 class LoRAConfig:
     dim: int = 32
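For context on why the standalone helper could be dropped: `@dataclasses.dataclass` already generates a field-wise `__eq__` (its `eq` parameter defaults to `True`), and `unsafe_hash=True` additionally generates a matching `__hash__`, which `eq=True` alone would set to `None`. A minimal sketch with a hypothetical stand-in class, independent of the ReaL codebase:

    import dataclasses

    # Hypothetical stand-in for ParallelismConfig; not the real class.
    @dataclasses.dataclass(unsafe_hash=True)
    class Parallel3D:
        model_parallel_size: int = 1
        pipeline_parallel_size: int = 1
        data_parallel_size: int = 1

    a = Parallel3D(model_parallel_size=2, pipeline_parallel_size=4)
    b = Parallel3D(model_parallel_size=2, pipeline_parallel_size=4)

    # eq=True (the default) generates a field-wise __eq__, which makes the
    # removed parallelism_config_equal helper redundant:
    assert a == b
    # unsafe_hash=True also generates __hash__; with eq=True alone,
    # __hash__ would be None and instances would be unhashable:
    assert hash(a) == hash(b)
    assert len({a, b}) == 1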
realhf/experiments/common/ppo_exp.py: 10 additions
@@ -261,6 +261,16 @@ def __post_init__(self):
             value_norm_eps=self.ppo.value_norm_eps,
         )

+        if self.ppo.gen.use_cuda_graph and (
+            self.actor_train.parallel != self.actor_gen.parallel
+        ):
+            raise ValueError(
+                "CUDA graph cannot be used with parameter reallocation "
+                "because CUDA graph requires pinned parameter memory. "
+                "Either set use_cuda_graph=False or set identical parallel "
+                "strategies for actor_train and actor_gen."
+            )
+
     @property
     def models(self) -> Dict[str, ModelTrainEvalConfig]:
         # role to config
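The guard runs in `__post_init__`, so a bad configuration is rejected before any worker launches. A minimal sketch of the same logic using hypothetical, stripped-down configs (only the fields the check touches are modeled; the real experiment config carries many more):

    import dataclasses

    # Hypothetical stand-in; mirrors only the parallelism fields.
    @dataclasses.dataclass(unsafe_hash=True)
    class Parallel:
        model_parallel_size: int = 1
        pipeline_parallel_size: int = 1
        data_parallel_size: int = 1

    def check_cuda_graph_compat(use_cuda_graph: bool, train: Parallel, gen: Parallel) -> None:
        # Mirrors the guard above: CUDA graph capture pins parameter memory,
        # which rules out reallocating parameters between distinct train and
        # generate parallel strategies.
        if use_cuda_graph and train != gen:
            raise ValueError(
                "CUDA graph cannot be used with parameter reallocation. "
                "Either set use_cuda_graph=False or use identical parallel "
                "strategies for actor_train and actor_gen."
            )

    check_cuda_graph_compat(True, Parallel(2, 1, 4), Parallel(2, 1, 4))   # OK: identical
    check_cuda_graph_compat(False, Parallel(2, 1, 4), Parallel(1, 1, 8))  # OK: no CUDA graph
    try:
        check_cuda_graph_compat(True, Parallel(2, 1, 4), Parallel(1, 1, 8))
    except ValueError as e:
        print(e)  # rejected: CUDA graph + differing strategies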
realhf/experiments/common/utils.py: 1 addition & 3 deletions
@@ -117,7 +117,6 @@ def make_model_config(cfg: ModelTrainEvalConfig):


 def resolve_rpc_hooks(rpc_allocs: List[RPCAllocation]):
-    from realhf.api.quickstart.model import parallelism_config_equal

     role_cnt = collections.defaultdict(int)
     for rpc_alloc in rpc_allocs:
@@ -130,8 +129,7 @@ def resolve_rpc_hooks(rpc_allocs: List[RPCAllocation]):
             if rpc.name == other.rpc.name:
                 continue
             if rpc.model_name.role == other.rpc.model_name.role and not (
-                parallelism_config_equal(parallel, other.parallel)
-                and device_mesh == other.device_mesh
+                parallel == other.parallel and device_mesh == other.device_mesh
             ):
                 other.rpc.model_name = ModelName(
                     rpc.model_name.role, role_cnt[rpc.model_name.role] + 1
