diff --git a/docs/tutorials/advanced/embedding_monkeypatch.py b/docs/tutorials/advanced/embedding_monkeypatch.py
index b2983c29ac..e16028abd0 100644
--- a/docs/tutorials/advanced/embedding_monkeypatch.py
+++ b/docs/tutorials/advanced/embedding_monkeypatch.py
@@ -93,7 +93,7 @@ def newforward(self, data):
         x_E = self.out_mlp_E(torch.cat(xs_E, dim=-1))
         if self.direct_forces:
             x_F = self.out_mlp_F(torch.cat(xs_F, dim=-1))
-        with torch.cuda.amp.autocast(False):
+        with torch.autocast("cuda", enabled=False):
             E_t = self.out_energy(x_E.float())
             if self.direct_forces:
                 F_st = self.out_forces(x_F.float())
@@ -185,7 +185,7 @@ def embed(self, atoms):
         self.trainer.ema.copy_to()

     with (
-        torch.cuda.amp.autocast(enabled=self.trainer.scaler is not None),
+        torch.autocast("cuda", enabled=self.trainer.scaler is not None),
         torch.no_grad(),
     ):
         out = self.trainer.model(batch_list)
diff --git a/src/fairchem/core/models/equiformer_v2/equiformer_v2_deprecated.py b/src/fairchem/core/models/equiformer_v2/equiformer_v2_deprecated.py
index 1da2ed3adb..5af270045e 100644
--- a/src/fairchem/core/models/equiformer_v2/equiformer_v2_deprecated.py
+++ b/src/fairchem/core/models/equiformer_v2/equiformer_v2_deprecated.py
@@ -591,7 +591,7 @@ def forward(self, data):
             # We can also write this as
             # \hat{E_DFT} = E_std * (\hat{E} + E_ref / E_std) + E_mean,
             # which is why we save E_ref / E_std as the linear reference.
-            with torch.cuda.amp.autocast(False):
+            with torch.autocast("cuda", enabled=False):
                 energy = energy.to(self.energy_lin_ref.dtype).index_add(
                     0,
                     graph.batch_full,
diff --git a/src/fairchem/core/models/equiformer_v2/layer_norm.py b/src/fairchem/core/models/equiformer_v2/layer_norm.py
index 8edcfd62fa..e23f573c37 100755
--- a/src/fairchem/core/models/equiformer_v2/layer_norm.py
+++ b/src/fairchem/core/models/equiformer_v2/layer_norm.py
@@ -72,7 +72,7 @@ def __init__(
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}(lmax={self.lmax}, num_channels={self.num_channels}, eps={self.eps})"

-    @torch.cuda.amp.autocast(enabled=False)
+    @torch.autocast("cuda", enabled=False)
     def forward(self, node_input):
         """
         Assume input is of shape [N, sphere_basis, C]
@@ -172,7 +172,7 @@ def __init__(
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}(lmax={self.lmax}, num_channels={self.num_channels}, eps={self.eps}, std_balance_degrees={self.std_balance_degrees})"

-    @torch.cuda.amp.autocast(enabled=False)
+    @torch.autocast("cuda", enabled=False)
     def forward(self, node_input):
         """
         Assume input is of shape [N, sphere_basis, C]
@@ -260,7 +260,7 @@ def __init__(
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}(lmax={self.lmax}, num_channels={self.num_channels}, eps={self.eps})"

-    @torch.cuda.amp.autocast(enabled=False)
+    @torch.autocast("cuda", enabled=False)
     def forward(self, node_input):
         """
         Assume input is of shape [N, sphere_basis, C]
@@ -354,7 +354,7 @@ def __init__(
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}(lmax={self.lmax}, num_channels={self.num_channels}, eps={self.eps}, centering={self.centering}, std_balance_degrees={self.std_balance_degrees})"

-    @torch.cuda.amp.autocast(enabled=False)
+    @torch.autocast("cuda", enabled=False)
     def forward(self, node_input):
         """
         Assume input is of shape [N, sphere_basis, C]
diff --git a/src/fairchem/core/models/equiformer_v2/trainers/dens_trainer.py b/src/fairchem/core/models/equiformer_v2/trainers/dens_trainer.py
index 11735d7bb9..8bacc49133 100644
--- a/src/fairchem/core/models/equiformer_v2/trainers/dens_trainer.py
+++ b/src/fairchem/core/models/equiformer_v2/trainers/dens_trainer.py
@@ -392,7 +392,7 @@ def train(self, disable_eval_tqdm=False):
                 )

                 # Forward, loss, backward. #TODO update this with new signatures
-                with torch.cuda.amp.autocast(enabled=self.scaler is not None):
+                with torch.autocast("cuda", enabled=self.scaler is not None):
                     out = self._forward(batch)
                     loss = self._compute_loss(out, batch)

@@ -767,7 +767,7 @@ def predict(
             desc=f"device {rank}",
             disable=disable_tqdm,
         ):
-            with torch.cuda.amp.autocast(enabled=self.scaler is not None):
+            with torch.autocast("cuda", enabled=self.scaler is not None):
                 out = self._forward(batch)

             for key in out:
diff --git a/src/fairchem/core/models/gemnet_oc/gemnet_oc.py b/src/fairchem/core/models/gemnet_oc/gemnet_oc.py
index c982b7d43a..d29a7ed614 100644
--- a/src/fairchem/core/models/gemnet_oc/gemnet_oc.py
+++ b/src/fairchem/core/models/gemnet_oc/gemnet_oc.py
@@ -1267,7 +1267,7 @@ def forward(self, data):
         x_E = self.out_mlp_E(torch.cat(xs_E, dim=-1))
         if self.direct_forces:
             x_F = self.out_mlp_F(torch.cat(xs_F, dim=-1))
-        with torch.cuda.amp.autocast(False):
+        with torch.autocast("cuda", enabled=False):
             E_t = self.out_energy(x_E.float())
             if self.direct_forces:
                 F_st = self.out_forces(x_F.float())
@@ -1465,7 +1465,7 @@ def forward(
     ) -> dict[str, torch.Tensor]:
         # Global output block for final predictions
         x_E = self.out_mlp_E(torch.cat(emb["xs_E"], dim=-1))
-        with torch.cuda.amp.autocast(False):
+        with torch.autocast("cuda", enabled=False):
             E_t = self.out_energy(x_E.float())

         nMolecules = torch.max(data.batch) + 1
@@ -1530,7 +1530,7 @@ def forward(
         self, data: Batch, emb: dict[str, torch.Tensor]
     ) -> dict[str, torch.Tensor]:
         if self.direct_forces:
-            with torch.cuda.amp.autocast(False):
+            with torch.autocast("cuda", enabled=False):
                 x_F = self.out_mlp_F(torch.cat(emb["xs_F"], dim=-1).float())
                 F_st = self.out_forces(x_F)

diff --git a/src/fairchem/core/models/gemnet_oc/layers/force_scaler.py b/src/fairchem/core/models/gemnet_oc/layers/force_scaler.py
index fe5ae1810a..d41e144657 100644
--- a/src/fairchem/core/models/gemnet_oc/layers/force_scaler.py
+++ b/src/fairchem/core/models/gemnet_oc/layers/force_scaler.py
@@ -15,7 +15,7 @@ class ForceScaler:
     """
     Scales up the energy and then scales down the forces
    to prevent NaNs and infs in calculations using AMP.
-    Inspired by torch.cuda.amp.GradScaler.
+    Inspired by torch.GradScaler("cuda", args...).
     """

     def __init__(
diff --git a/src/fairchem/core/modules/scaling/fit.py b/src/fairchem/core/modules/scaling/fit.py
index 462088318e..e3eea3d018 100644
--- a/src/fairchem/core/modules/scaling/fit.py
+++ b/src/fairchem/core/modules/scaling/fit.py
@@ -32,7 +32,7 @@ def _prefilled_input(prompt: str, prefill: str = "") -> str:

 def _train_batch(trainer: BaseTrainer, batch) -> None:
     with torch.no_grad():
-        with torch.cuda.amp.autocast(enabled=trainer.scaler is not None):
+        with torch.autocast("cuda", enabled=trainer.scaler is not None):
             out = trainer._forward(batch)
             loss = trainer._compute_loss(out, batch)
             del out, loss
diff --git a/src/fairchem/core/trainers/base_trainer.py b/src/fairchem/core/trainers/base_trainer.py
index 90cdce0e58..b51e9507ab 100644
--- a/src/fairchem/core/trainers/base_trainer.py
+++ b/src/fairchem/core/trainers/base_trainer.py
@@ -153,7 +153,7 @@ def __init__(
             "gp_gpus": gp_gpus,
         }
         # AMP Scaler
-        self.scaler = torch.cuda.amp.GradScaler() if amp and not self.cpu else None
+        self.scaler = torch.GradScaler("cuda") if amp and not self.cpu else None

         # Fill in SLURM information in config, if applicable
         if "SLURM_JOB_ID" in os.environ and "folder" in self.config["slurm"]:
@@ -883,7 +883,7 @@ def validate(self, split: str = "val", disable_tqdm: bool = False):
             disable=disable_tqdm,
         ):
             # Forward.
-            with torch.cuda.amp.autocast(enabled=self.scaler is not None):
+            with torch.autocast("cuda", enabled=self.scaler is not None):
                 batch.to(self.device)
                 out = self._forward(batch)
                 loss = self._compute_loss(out, batch)
diff --git a/src/fairchem/core/trainers/ocp_trainer.py b/src/fairchem/core/trainers/ocp_trainer.py
index a8976773c6..e9f2b01d50 100644
--- a/src/fairchem/core/trainers/ocp_trainer.py
+++ b/src/fairchem/core/trainers/ocp_trainer.py
@@ -161,7 +161,7 @@ def train(self, disable_eval_tqdm: bool = False) -> None:
                 # Get a batch.
                 batch = next(train_loader_iter)
                 # Forward, loss, backward.
-                with torch.cuda.amp.autocast(enabled=self.scaler is not None):
+                with torch.autocast("cuda", enabled=self.scaler is not None):
                     out = self._forward(batch)
                     loss = self._compute_loss(out, batch)

@@ -468,7 +468,7 @@ def predict(
             desc=f"device {rank}",
             disable=disable_tqdm,
         ):
-            with torch.cuda.amp.autocast(enabled=self.scaler is not None):
+            with torch.autocast("cuda", enabled=self.scaler is not None):
                 out = self._forward(batch)

                 for target_key in self.config["outputs"]: