Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Retrieve CUDA available memory via torch.cuda.mem_get_info() #4847

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,7 @@ docs/code-docs/build
## Testing data
# Saved checkpoints for testing
tests/unit/saved_checkpoint/

# Virtual environments
venv/
.venv/
41 changes: 1 addition & 40 deletions accelerator/cuda_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,29 +15,12 @@
except ImportError:
pass

# Delay import pynvml to avoid import error when CUDA is not available
pynvml = None


class CUDA_Accelerator(DeepSpeedAccelerator):

def __init__(self):
self._name = 'cuda'
self._communication_backend_name = 'nccl'
if pynvml is None:
self._init_pynvml()

def _init_pynvml(self):
global pynvml
try:
import pynvml
except ImportError:
return
try:
pynvml.nvmlInit()
except pynvml.NVMLError:
pynvml = None
return

def is_synchronized_device(self):
return False
Expand Down Expand Up @@ -153,30 +136,8 @@ def max_memory_reserved(self, device_index=None):
def total_memory(self, device_index=None):
return torch.cuda.get_device_properties(device_index).total_memory

def _get_nvml_gpu_id(self, torch_gpu_id):
"""
credit: https://discuss.pytorch.org/t/making-pynvml-match-torch-device-ids-cuda-visible-devices/103020

Remap torch device id to nvml device id, respecting CUDA_VISIBLE_DEVICES.

If CUDA_VISIBLE_DEVICES isn't set, return the same id.
"""
# If CUDA_VISIBLE_DEVICES is set, automatically remap the id, since pynvml ignores this env var.
if "CUDA_VISIBLE_DEVICES" in os.environ:
ids = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")))
return ids[torch_gpu_id] # remap
else:
return torch_gpu_id

def available_memory(self, device_index=None):
if pynvml:
if device_index is None:
device_index = self.current_device()
handle = pynvml.nvmlDeviceGetHandleByIndex(self._get_nvml_gpu_id(device_index))
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
return info.free
else:
return self.total_memory(device_index) - self.memory_allocated(device_index)
return torch.cuda.mem_get_info(device_index)[0]

# Data types
def is_bf16_supported(self):
Expand Down
1 change: 0 additions & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,5 @@ packaging>=20.0
psutil
py-cpuinfo
pydantic
pynvml
torch
tqdm