Skip to content

Commit

Permalink
Merge branch 'main' of github.com:garrett4wade/distributed_llm into fix-contiguous-param
Browse files Browse the repository at this point in the history
  • Loading branch information
garrett4wade committed Jul 1, 2024
2 parents 4aaf456 + ccfe95c commit d3bf732
Show file tree
Hide file tree
Showing 6 changed files with 2 additions and 22 deletions.
4 changes: 0 additions & 4 deletions realhf/api/core/data_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ def load_shuffle_split_dataset(
util: DatasetUtility,
dataset_path: str,
dataset_builder: Optional[Callable[[], List[Dict[str, str]]]] = None,
max_num_sequences: Optional[int] = None,
):
if dataset_path is not None:
if dataset_path.endswith(".jsonl"):
Expand All @@ -80,9 +79,6 @@ def load_shuffle_split_dataset(
assert dataset_builder is not None
data = dataset_builder()

if max_num_sequences is not None:
data = data[:max_num_sequences]

datasize_per_rank = len(data) // util.world_size
shuffle_indices = get_shuffle_indices(
util.seed, datasize_per_rank * util.world_size
Expand Down
3 changes: 0 additions & 3 deletions realhf/apps/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,6 @@ def main_start(args, recover_count: int = 0):
"will be saved to temporary directory of the system. "
"To change the fileroot, set the fileroot option of your choice in your CLUSTER_SPEC_PATH."
)
use_cuda_graph = os.environ.get("USE_CUDA_GRAPH", "0") == "1"

BASE_ENVIRONS = {
"PYTHONPATH": "/realhf",
Expand All @@ -152,12 +151,10 @@ def main_start(args, recover_count: int = 0):
"RECOVER_RUN": "1" if is_recover_run else "0",
"SAVE_RECOVER_STATES": "1" if save_recover_states else "0",
"CLUSTER_SPEC_PATH": cluster_spec_path if cluster_spec_path else "",
"USE_CUDA_GRAPH": "1" if use_cuda_graph else "0",
}

os.environ["IS_REMOTE"] = "0" if not force_allocation_use_cache else "1"
os.environ["REAL_PACKAGE_PATH"] = repo_path
os.environ["USE_CUDA_GRAPH"] = "1" if use_cuda_graph else "0"

# setup experiments
if args.allocation_mode == "search":
Expand Down
5 changes: 0 additions & 5 deletions realhf/experiments/common/ppo_exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,6 @@ class PPOHyperparameters:
:param value_norm_eps: Epsilon factor in the
denominator of exponential moving average.
:type value_norm_eps: float
:param use_cuda_graph: Whether to use CUDA graph in PPO actor generation.
:type use_cuda_graph: bool
"""

gen: GenerationHyperparameters = dataclasses.field(
Expand All @@ -117,7 +115,6 @@ class PPOHyperparameters:
)
value_norm_beta: float = 0.99995
value_norm_eps: float = 1e-5
use_cuda_graph: bool = False


@dataclasses.dataclass
Expand Down Expand Up @@ -248,8 +245,6 @@ def __post_init__(self):
raise NotImplementedError("SFT LoRA is not supported yet.")
if self.is_rew_lora or self.rew_lora_path is not None:
raise NotImplementedError("Rew LoRA is not supported yet.")
if self.ppo.use_cuda_graph:
os.environ["USE_CUDA_GRAPH"] = "1"

self.ppo_kwargs = dict(
n_minibatches=self.ppo.ppo_n_minibatches,
Expand Down
6 changes: 1 addition & 5 deletions realhf/impl/dataset/prompt_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ def __init__(
dataset_path: Optional[str] = None,
dataset_builder: Optional[Callable[[], List[Dict]]] = None,
pad_to_max_length: bool = False,
max_num_sequences: Optional[int] = None,
):
"""A dataset with prompts. Usually used for PPO.
Expand All @@ -30,14 +29,11 @@ def __init__(
dataset_builder (Optional[Callable[[], List[Dict]]], optional): Alternative to dataset_path.
A callable that returns a list of dictionary. Defaults to None.
pad_to_max_length (bool): Whether to pad the prompts to max_length. Defaults to False.
max_num_sequences (int): Max number of sequences the dataset contains. Only used in tests.
"""
self._util = util
self.max_length = max_length

data = data_api.load_shuffle_split_dataset(
util, dataset_path, dataset_builder, max_num_sequences=max_num_sequences
)
data = data_api.load_shuffle_split_dataset(util, dataset_path, dataset_builder)

prompts_str = [x["prompt"] for x in data]
util.tokenizer.padding_side = "left"
Expand Down
5 changes: 0 additions & 5 deletions realhf/impl/model/backend/pipe_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1143,11 +1143,6 @@ def terminate_condition():
terminate_condition=terminate_condition,
)

use_cuda_graph = os.environ.get("USE_CUDA_GRAPH", "0") == "1"
if use_cuda_graph:
dist.barrier(group=constants.parallelism_group())
torch.cuda.synchronize()

if not constants.is_last_pipe_stage():
return None

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def init_custom_ar() -> None:
" capability. "
)
return

_CA_HANDLE = CustomAllreduce(mp_rank, mp_world_size, full_nvlink=full_nvlink)


Expand Down

0 comments on commit d3bf732

Please sign in to comment.