Skip to content

Commit

Permalink
Contemplate amd-smi and friends not being executable
Browse files Browse the repository at this point in the history
This should fix #5111 by handling the case where `amd-smi` (or `rocm-smi`, or `nvidia-smi`, or `docker`) is found but is not marked executable for the current user.

It is unclear whether this is a default configuration for any of these tools, but a sysadmin could certainly set things up this way, possibly for a good reason.
  • Loading branch information
adamnovak authored Oct 1, 2024
1 parent 4f120ac commit 20c32f2
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions src/toil/lib/accelerators.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def have_working_nvidia_smi() -> bool:
"""
try:
subprocess.check_call(['nvidia-smi'])
- except (FileNotFoundError, subprocess.CalledProcessError):
+ except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
return False
return True

Expand Down Expand Up @@ -78,7 +78,7 @@ def have_working_nvidia_docker_runtime() -> bool:
try:
# The runtime injects nvidia-smi; it doesn't seem to have to be in the image we use here
subprocess.check_call(['docker', 'run', '--rm', '--runtime', 'nvidia', '--gpus', 'all', 'ubuntu:20.04', 'nvidia-smi'])
- except (FileNotFoundError, subprocess.CalledProcessError):
+ except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
return False
return True

Expand Down Expand Up @@ -121,7 +121,7 @@ def count_amd_gpus() -> int:
out = subprocess.check_output((["amd-smi", "static"]))
gpu_count = len([line for line in out.decode("utf-8").split("\n") if line.startswith("gpu")])
return gpu_count
- except (FileNotFoundError, subprocess.SubprocessError):
+ except (FileNotFoundError, PermissionError, subprocess.SubprocessError):
# if the amd-smi command fails, try rocm-smi
# if a different exception is raised, something other than the subprocess call is wrong
pass
Expand All @@ -131,7 +131,7 @@ def count_amd_gpus() -> int:
out = subprocess.check_output(["rocm-smi"])
gpu_count = len([line for line in out.decode("utf-8").split("\n") if len(line)> 0 and line[0] in string.digits])
return gpu_count
- except (FileNotFoundError, subprocess.SubprocessError):
+ except (FileNotFoundError, PermissionError, subprocess.SubprocessError):
pass
return 0

Expand Down

0 comments on commit 20c32f2

Please sign in to comment.