diff --git a/.gitignore b/.gitignore
index e284c4fd35a1..1fa194e690f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,3 +53,7 @@ docs/code-docs/build
 ## Testing data
 # Saved checkpoints for testing
 tests/unit/saved_checkpoint/
+
+# Virtual environments
+venv/
+.venv/
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
index 2d74daecf3df..4e6bc8629295 100644
--- a/accelerator/cuda_accelerator.py
+++ b/accelerator/cuda_accelerator.py
@@ -15,29 +15,12 @@
 except ImportError:
     pass
 
-# Delay import pynvml to avoid import error when CUDA is not available
-pynvml = None
-
 
 class CUDA_Accelerator(DeepSpeedAccelerator):
 
     def __init__(self):
         self._name = 'cuda'
         self._communication_backend_name = 'nccl'
-        if pynvml is None:
-            self._init_pynvml()
-
-    def _init_pynvml(self):
-        global pynvml
-        try:
-            import pynvml
-        except ImportError:
-            return
-        try:
-            pynvml.nvmlInit()
-        except pynvml.NVMLError:
-            pynvml = None
-            return
 
     def is_synchronized_device(self):
         return False
@@ -153,30 +136,8 @@ def max_memory_reserved(self, device_index=None):
     def total_memory(self, device_index=None):
         return torch.cuda.get_device_properties(device_index).total_memory
 
-    def _get_nvml_gpu_id(self, torch_gpu_id):
-        """
-        credit: https://discuss.pytorch.org/t/making-pynvml-match-torch-device-ids-cuda-visible-devices/103020
-
-        Remap torch device id to nvml device id, respecting CUDA_VISIBLE_DEVICES.
-
-        If the latter isn't set return the same id
-        """
-        # if CUDA_VISIBLE_DEVICES is used automagically remap the id since pynvml ignores this env var
-        if "CUDA_VISIBLE_DEVICES" in os.environ:
-            ids = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")))
-            return ids[torch_gpu_id]  # remap
-        else:
-            return torch_gpu_id
-
     def available_memory(self, device_index=None):
-        if pynvml:
-            if device_index is None:
-                device_index = self.current_device()
-            handle = pynvml.nvmlDeviceGetHandleByIndex(self._get_nvml_gpu_id(device_index))
-            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-            return info.free
-        else:
-            return self.total_memory(device_index) - self.memory_allocated(device_index)
+        return torch.cuda.mem_get_info(device_index)[0]
 
     # Data types
     def is_bf16_supported(self):
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 80c9f9b3287a..6840d6dbcc98 100755
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -5,6 +5,5 @@ packaging>=20.0
 psutil
 py-cpuinfo
 pydantic
-pynvml
 torch
 tqdm
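
Note on the replacement (a minimal sketch, not part of the patch): torch.cuda.mem_get_info(device) returns a (free, total) tuple in bytes straight from the CUDA driver, and it addresses devices by torch ordinal, so it already honors CUDA_VISIBLE_DEVICES; that is what makes the removed _get_nvml_gpu_id remapping unnecessary. The snippet below illustrates what the new available_memory path reports, assuming a CUDA-enabled PyTorch build.

    import torch

    # mem_get_info wraps cudaMemGetInfo and returns (free_bytes, total_bytes)
    # for the given torch device ordinal; index [0] is the free memory that
    # the old pynvml path obtained via nvmlDeviceGetMemoryInfo(handle).free.
    if torch.cuda.is_available():
        device_index = torch.cuda.current_device()
        free_bytes, total_bytes = torch.cuda.mem_get_info(device_index)
        print(f"free: {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")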