diff --git a/src/fairchem/core/_cli_hydra.py b/src/fairchem/core/_cli_hydra.py
index 6ca8b7d6f..c4daa022d 100644
--- a/src/fairchem/core/_cli_hydra.py
+++ b/src/fairchem/core/_cli_hydra.py
@@ -63,7 +63,6 @@ def map_cli_args_to_dist_config(cli_args: argparse.Namespace) -> dict:
         "submit": cli_args.submit,
         "summit": None,
         "cpu": cli_args.cpu,
-        "use_cuda_visibile_devices": True,
     }
 
 
diff --git a/src/fairchem/core/common/distutils.py b/src/fairchem/core/common/distutils.py
index 604f969a8..a79faa5a4 100644
--- a/src/fairchem/core/common/distutils.py
+++ b/src/fairchem/core/common/distutils.py
@@ -69,20 +69,12 @@ def setup(config) -> None:
                 f"Init: {config['init_method']}, {config['world_size']}, {config['rank']}"
             )
 
-            # ensures GPU0 does not have extra context/higher peak memory
+            assign_device_for_local_rank(config["cpu"], config["local_rank"])
+
             logging.info(
-                f"local rank: {config['local_rank']}, visible devices: {os.environ['CUDA_VISIBLE_DEVICES']}"
+                f"local rank: {config['local_rank']}, rank: {config['rank']}, visible devices: {os.environ['CUDA_VISIBLE_DEVICES']}"
             )
 
-            # In the new hydra runners, we setup the device for each rank as either cuda:0 or cpu
-            # after this point, the local rank should either be using "cpu" or "cuda"
-            if config.get("use_cuda_visibile_devices"):
-                assign_device_for_local_rank(config["cpu"], config["local_rank"])
-            else:
-                # in the old code, all ranks can see all devices but need to be assigned a device equal to their local rank
-                # this is dangerous and should be deprecated
-                torch.cuda.set_device(config["local_rank"])
-
             dist.init_process_group(
                 backend="nccl",
                 init_method=config["init_method"],
@@ -121,8 +113,7 @@ def setup(config) -> None:
             ), "Can only setup master address and port at this point for a single rank, otherwise we assume the processes and the comm addr/port have already been setup"
             setup_env_local()
         config["local_rank"] = int(os.environ.get("LOCAL_RANK"))
-        if config.get("use_cuda_visibile_devices"):
-            assign_device_for_local_rank(config["cpu"], config["local_rank"])
+        assign_device_for_local_rank(config["cpu"], config["local_rank"])
         dist.init_process_group(
             backend=config["distributed_backend"],
             rank=int(os.environ.get("RANK")),
diff --git a/src/fairchem/core/trainers/base_trainer.py b/src/fairchem/core/trainers/base_trainer.py
index 90cdce0e5..a746043f2 100644
--- a/src/fairchem/core/trainers/base_trainer.py
+++ b/src/fairchem/core/trainers/base_trainer.py
@@ -78,9 +78,7 @@ def __init__(
         loss_functions: dict[str, str | float],
         evaluation_metrics: dict[str, str],
         identifier: str,
-        # TODO: dealing with local rank is dangerous
-        # T201111838 remove this and use CUDA_VISIBILE_DEVICES instead so trainers don't need to know about which devie to use
-        local_rank: int,
+        local_rank: int,  # DEPRECATED, DO NOT USE
         timestamp_id: str | None = None,
         run_dir: str | None = None,
         is_debug: bool = False,
@@ -104,8 +102,7 @@ def __init__(
         self.ema = None
 
         if torch.cuda.is_available() and not self.cpu:
-            logging.info(f"local rank base: {local_rank}")
-            self.device = torch.device(f"cuda:{local_rank}")
+            self.device = torch.device("cuda")
         else:
             self.device = torch.device("cpu")
             self.cpu = True  # handle case when `--cpu` isn't specified
diff --git a/src/fairchem/core/trainers/ocp_trainer.py b/src/fairchem/core/trainers/ocp_trainer.py
index 9a13faed6..3b54a069d 100644
--- a/src/fairchem/core/trainers/ocp_trainer.py
+++ b/src/fairchem/core/trainers/ocp_trainer.py
@@ -79,9 +79,7 @@ def __init__(
         loss_functions: dict[str, str | float],
         evaluation_metrics: dict[str, str],
         identifier: str,
-        # TODO: dealing with local rank is dangerous
-        # T201111838 remove this and use CUDA_VISIBILE_DEVICES instead so trainers don't need to know about which devie to use
-        local_rank: int,
+        local_rank: int,  # DEPRECATED, DO NOT USE
        timestamp_id: str | None = None,
        run_dir: str | None = None,
        is_debug: bool = False,
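
Note for reviewers: the removed comments describe the device-assignment scheme that this patch now applies unconditionally, i.e. each rank restricts GPU visibility via CUDA_VISIBLE_DEVICES and then addresses its GPU simply as "cuda". The snippet below is a minimal, hypothetical sketch of that pattern for context only; the name assign_device_for_local_rank_sketch and its body are illustrative assumptions, not fairchem's actual assign_device_for_local_rank helper.

import os

import torch


def assign_device_for_local_rank_sketch(cpu: bool, local_rank: int) -> None:
    # Illustrative sketch only -- not fairchem's implementation.
    # Must run before any CUDA context is created (i.e. before the first
    # torch.cuda call), otherwise CUDA_VISIBLE_DEVICES has no effect.
    if cpu:
        return
    # Each rank now sees exactly one GPU and addresses it as "cuda" (cuda:0),
    # so no rank creates extra context or peak memory on GPU 0.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank)


if __name__ == "__main__":
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    assign_device_for_local_rank_sketch(cpu=False, local_rank=local_rank)
    # Downstream code no longer needs local_rank to pick a device,
    # which is why the trainers above can use torch.device("cuda") directly.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"local rank {local_rank} -> {device}")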