microsoft · XuehaiPan · Dec 20, 2023 · Dec 20, 2023 · Jan 2, 2024 · Jan 12, 2024
diff --git a/.gitignore b/.gitignore
@@ -53,3 +53,7 @@ docs/code-docs/build
 ## Testing data
 # Saved checkpoints for testing
 tests/unit/saved_checkpoint/
+
+# Virtual environments
+venv/
+.venv/
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
@@ -15,29 +15,12 @@
 except ImportError:
     pass
 
-# Delay import pynvml to avoid import error when CUDA is not available
-pynvml = None
-
 
 class CUDA_Accelerator(DeepSpeedAccelerator):
 
     def __init__(self):
         self._name = 'cuda'
         self._communication_backend_name = 'nccl'
-        if pynvml is None:
-            self._init_pynvml()
-
-    def _init_pynvml(self):
-        global pynvml
-        try:
-            import pynvml
-        except ImportError:
-            return
-        try:
-            pynvml.nvmlInit()
-        except pynvml.NVMLError:
-            pynvml = None
-            return
 
     def is_synchronized_device(self):
         return False
@@ -153,30 +136,8 @@ def max_memory_reserved(self, device_index=None):
     def total_memory(self, device_index=None):
         return torch.cuda.get_device_properties(device_index).total_memory
 
-    def _get_nvml_gpu_id(self, torch_gpu_id):
-        """
-        credit: https://discuss.pytorch.org/t/making-pynvml-match-torch-device-ids-cuda-visible-devices/103020
-
-        Remap torch device id to nvml device id, respecting CUDA_VISIBLE_DEVICES.
-
-        If the latter isn't set return the same id
-        """
-        # if CUDA_VISIBLE_DEVICES is used automagically remap the id since pynvml ignores this env var
-        if "CUDA_VISIBLE_DEVICES" in os.environ:
-            ids = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")))
-            return ids[torch_gpu_id]  # remap
-        else:
-            return torch_gpu_id
-
     def available_memory(self, device_index=None):
-        if pynvml:
-            if device_index is None:
-                device_index = self.current_device()
-            handle = pynvml.nvmlDeviceGetHandleByIndex(self._get_nvml_gpu_id(device_index))
-            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-            return info.free
-        else:
-            return self.total_memory(device_index) - self.memory_allocated(device_index)
+        return torch.cuda.mem_get_info(device_index)[0]
 
     # Data types
     def is_bf16_supported(self):

@@ -5,6 +5,5 @@ packaging>=20.0
 psutil
 py-cpuinfo
 pydantic
-pynvml
 torch
 tqdm
@@ -5,6 +5,5 @@ packaging>=20.0
 psutil
 py-cpuinfo
 pydantic
-pynvml
 torch
 tqdm