diff --git a/src/fairchem/core/_cli_hydra.py b/src/fairchem/core/_cli_hydra.py
index 6ca8b7d6f..c4daa022d 100644
--- a/src/fairchem/core/_cli_hydra.py
+++ b/src/fairchem/core/_cli_hydra.py
@@ -63,7 +63,6 @@ def map_cli_args_to_dist_config(cli_args: argparse.Namespace) -> dict:
         "submit": cli_args.submit,
         "summit": None,
         "cpu": cli_args.cpu,
-        "use_cuda_visibile_devices": True,
     }
 
 
diff --git a/src/fairchem/core/common/distutils.py b/src/fairchem/core/common/distutils.py
index 604f969a8..a79faa5a4 100644
--- a/src/fairchem/core/common/distutils.py
+++ b/src/fairchem/core/common/distutils.py
@@ -69,20 +69,12 @@ def setup(config) -> None:
                 f"Init: {config['init_method']}, {config['world_size']}, {config['rank']}"
             )
 
-            # ensures GPU0 does not have extra context/higher peak memory
+            assign_device_for_local_rank(config["cpu"], config["local_rank"])
+
             logging.info(
-                f"local rank: {config['local_rank']}, visible devices: {os.environ['CUDA_VISIBLE_DEVICES']}"
+                f"local rank: {config['local_rank']}, rank: {config['rank']}, visible devices: {os.environ['CUDA_VISIBLE_DEVICES']}"
             )
 
-            # In the new hydra runners, we setup the device for each rank as either cuda:0 or cpu
-            # after this point, the local rank should either be using "cpu" or "cuda"
-            if config.get("use_cuda_visibile_devices"):
-                assign_device_for_local_rank(config["cpu"], config["local_rank"])
-            else:
-                # in the old code, all ranks can see all devices but need to be assigned a device equal to their local rank
-                # this is dangerous and should be deprecated
-                torch.cuda.set_device(config["local_rank"])
-
             dist.init_process_group(
                 backend="nccl",
                 init_method=config["init_method"],
@@ -121,8 +113,7 @@ def setup(config) -> None:
             ), "Can only setup master address and port at this point for a single rank, otherwise we assume the processes and the comm addr/port have already been setup"
             setup_env_local()
         config["local_rank"] = int(os.environ.get("LOCAL_RANK"))
-        if config.get("use_cuda_visibile_devices"):
-            assign_device_for_local_rank(config["cpu"], config["local_rank"])
+        assign_device_for_local_rank(config["cpu"], config["local_rank"])
         dist.init_process_group(
             backend=config["distributed_backend"],
             rank=int(os.environ.get("RANK")),
diff --git a/src/fairchem/core/trainers/base_trainer.py b/src/fairchem/core/trainers/base_trainer.py
index 90cdce0e5..a746043f2 100644
--- a/src/fairchem/core/trainers/base_trainer.py
+++ b/src/fairchem/core/trainers/base_trainer.py
@@ -78,9 +78,7 @@ def __init__(
         loss_functions: dict[str, str | float],
         evaluation_metrics: dict[str, str],
         identifier: str,
-        # TODO: dealing with local rank is dangerous
-        # T201111838 remove this and use CUDA_VISIBILE_DEVICES instead so trainers don't need to know about which devie to use
-        local_rank: int,
+        local_rank: int,  # DEPRECATED, DO NOT USE
         timestamp_id: str | None = None,
         run_dir: str | None = None,
         is_debug: bool = False,
@@ -104,8 +102,7 @@ def __init__(
         self.ema = None
 
         if torch.cuda.is_available() and not self.cpu:
-            logging.info(f"local rank base: {local_rank}")
-            self.device = torch.device(f"cuda:{local_rank}")
+            self.device = torch.device("cuda")
         else:
             self.device = torch.device("cpu")
             self.cpu = True  # handle case when `--cpu` isn't specified
diff --git a/src/fairchem/core/trainers/ocp_trainer.py b/src/fairchem/core/trainers/ocp_trainer.py
index 9a13faed6..3b54a069d 100644
--- a/src/fairchem/core/trainers/ocp_trainer.py
+++ b/src/fairchem/core/trainers/ocp_trainer.py
@@ -79,9 +79,7 @@ def __init__(
         loss_functions: dict[str, str | float],
         evaluation_metrics: dict[str, str],
         identifier: str,
-        # TODO: dealing with local rank is dangerous
-        # T201111838 remove this and use CUDA_VISIBILE_DEVICES instead so trainers don't need to know about which devie to use
-        local_rank: int,
+        local_rank: int,  # DEPRECATED, DO NOT USE
        timestamp_id: str | None = None,
        run_dir: str | None = None,
        is_debug: bool = False,
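
Note for reviewers: the removed comments describe the device-assignment scheme that this patch now applies unconditionally, i.e. each rank restricts GPU visibility via CUDA_VISIBLE_DEVICES and then addresses its GPU simply as "cuda". The snippet below is a minimal, hypothetical sketch of that pattern for context only; the name assign_device_for_local_rank_sketch and its body are illustrative assumptions, not fairchem's actual assign_device_for_local_rank helper.

import os

import torch


def assign_device_for_local_rank_sketch(cpu: bool, local_rank: int) -> None:
    # Illustrative sketch only -- not fairchem's implementation.
    # Must run before any CUDA context is created (i.e. before the first
    # torch.cuda call), otherwise CUDA_VISIBLE_DEVICES has no effect.
    if cpu:
        return
    # Each rank now sees exactly one GPU and addresses it as "cuda" (cuda:0),
    # so no rank creates extra context or peak memory on GPU 0.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank)


if __name__ == "__main__":
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    assign_device_for_local_rank_sketch(cpu=False, local_rank=local_rank)
    # Downstream code no longer needs local_rank to pick a device,
    # which is why the trainers above can use torch.device("cuda") directly.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"local rank {local_rank} -> {device}")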