Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Install CUDA and CUDA-Samples via the bot #381

Closed
wants to merge 13 commits into from
Prev Previous commit
Next Next commit
Try to simplify the Lmod rc plugin, address review comments, make sure we have EULA acceptance for CUDA
  • Loading branch information
ocaisa committed Nov 30, 2023
commit ddb20df873436f0cddd6452169e031fbc13f3191
88 changes: 28 additions & 60 deletions create_lmodrc.py
Original file line number Diff line number Diff line change
@@ -29,100 +29,68 @@
return content
end

-- Adapted from https://stackoverflow.com/a/40195356
--- Test whether something (file or directory) is present at `file`.
-- The probe renames the path onto itself and inspects the outcome.
-- @param file path to probe
-- @return a truthy value when the path is present (including the case where
--         the rename is refused with error code 13, "permission denied"),
--         otherwise the failed result of os.rename followed by its message
function exists(file)
    local renamed, message, errno = os.rename(file, file)
    if renamed then
        return renamed, message
    end
    -- Error code 13 means the entry is there but we may not touch it.
    if errno == 13 then
        return true
    end
    return renamed, message
end

--- Hide GPU-flagged modules when the host_injections tree is absent.
-- The directory probed is EESSI_SOFTWARE_PATH with 'versions' swapped for
-- 'host_injections' (an empty string when the variable is unset).
-- @param modT table describing the module; `modT.sn` is read and
--             `modT.isVisible` is set to false when the module is hidden
local function visible_hook(modT)
    local moduleTable = require("FrameStk"):singleton():mt()
    local softwarePath = os.getenv('EESSI_SOFTWARE_PATH') or ""
    local cudaDir = string.gsub(softwarePath, 'versions', 'host_injections')
    if exists(cudaDir) then
        -- Host injections are available: leave visibility untouched.
        return
    end
    if moduleTable:haveProperty(modT.sn,"arch","gpu") then
        modT.isVisible = false
    end
end

--- Extract the leading "major.minor" pair of a version string as numbers.
-- @param version version string such as "12.1" or "12.1.1" (may be nil)
-- @return major, minor as numbers, or nil, nil when nothing can be parsed
local function parse_major_minor(version)
    if type(version) ~= "string" then
        return nil, nil
    end
    local major, minor = string.match(version, "(%d+)%.(%d+)")
    if not major then
        return nil, nil
    end
    -- Convert to numbers so that e.g. 9 < 12 compares numerically, not lexicographically.
    return tonumber(major), tonumber(minor)
end

--- Load hook that refuses to load CUDA (or CUDA-enabled) modules when the host is not set up for them.
-- Three checks are performed, each reporting through LmodError:
--   1. when the module is CUDA itself, the SDK must be installed under host_injections;
--   2. when the module carries the "arch"/"gpu" property, a libcuda.so must be reachable;
--   3. when the host-injected driver is present, its CUDA version must not be older than
--      the version requested through the EESSICUDAVERSION environment variable.
-- NOTE(review): messages use "\\n" on purpose; this Lua code appears to be embedded in a
-- Python string, which turns it into "\n" when the file is generated - confirm before changing.
-- @param t table passed by the "load" hook; only `t.modFullName` is read
local function cuda_enabled_load_hook(t)
    local frameStk = require("FrameStk"):singleton()
    local mt = frameStk:mt()
    local simpleName = string.match(t.modFullName, "(.-)/")
    -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
    -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
    -- to load the CUDA module and print an informative message on how to set up GPU support for EESSI
    if simpleName == 'CUDA' then
        -- get the full host_injections path
        local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
        -- build final path where the CUDA software should be installed
        local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
        local cudaDirExists = isDir(cudaEasyBuildDir)
        if not cudaDirExists then
            local advice = "While the module file exists, the actual software is not shipped with EESSI.\\n"
            advice = advice .. "In order to be able to use the CUDA module, please follow the instructions \\n"
            advice = advice .. "available under https://www.eessi.io/docs/gpu/\\n"
            LmodError("You requested to load ", simpleName, "\\n", advice)
        end
    end
    -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
    -- otherwise, refuse to load the requested module and print error message
    local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
    if haveGpu then
        local arch = os.getenv("EESSI_CPU_FAMILY") or ""
        local cudaVersionFile = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
        local cudaDriverFile = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
        local cudaDriverExists = isFile(cudaDriverFile)
        local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
        if not (cudaDriverExists or singularityCudaExists) then
            local advice = "which relies on the CUDA runtime environment and driver libraries.\\n"
            advice = advice .. "In order to be able to use the module, please follow the instructions\\n"
            advice = advice .. "available under https://www.eessi.io/docs/gpu/\\n"
            LmodError("You requested to load ", simpleName, "\\n", advice)
        elseif cudaDriverExists then
            -- CUDA driver exists, now we check its version to see if an update is needed
            local cudaVersion = read_file(cudaVersionFile)
            local cudaVersion_req = os.getenv("EESSICUDAVERSION")
            -- driver CUDA versions don't give a patch version for CUDA, so only major.minor is compared
            local major, minor = parse_major_minor(cudaVersion)
            local major_req, minor_req = parse_major_minor(cudaVersion_req)
            -- Without both versions there is nothing to compare; skip the check instead of
            -- failing on a nil value.
            if major and major_req then
                local driver_libs_need_update = major < major_req
                    or (major == major_req and minor < minor_req)
                if driver_libs_need_update then
                    local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ".\\n"
                    advice = advice .. "Please update your CUDA driver libraries and then follow the instructions \\n"
                    advice = advice .. "under https://www.eessi.io/docs/gpu/ to let EESSI know about the update.\\n"
                    LmodError("Your driver CUDA version is ", cudaVersion, "\\n", advice)
                end
            end
        end
    end
end

-- Register cuda_enabled_load_hook under the "load" hook name.
hook.register("load", cuda_enabled_load_hook)
-- Register visible_hook under the "isVisibleHook" hook name.
hook.register("isVisibleHook", visible_hook)
"""

def error(msg):
4 changes: 4 additions & 0 deletions eb_hooks.py
Original file line number Diff line number Diff line change
@@ -377,6 +377,10 @@ def post_sanitycheck_cuda(self, *args, **kwargs):
# if it is not in the whitelist, delete the file and create a symlink to host_injections
source = os.path.join(root, filename)
target = source.replace("versions", "host_injections")
ocaisa marked this conversation as resolved.
Show resolved Hide resolved
# Make sure source and target are not the same
if source == target:
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you are"
"using this hook for an EESSI installation?")
os.remove(source)
# Using os.symlink requires the existence of the target directory, so we use os.system
system_command="ln -s '%s' '%s'" % (target, source)
5 changes: 4 additions & 1 deletion eessi-2023.06-eb-4.8.1-system.yml
Original file line number Diff line number Diff line change
@@ -7,4 +7,7 @@ easyconfigs:
- EasyBuild-4.8.2.eb:
options:
from-pr: 19105
- CUDA-12.1.1.eb
- CUDA-12.1.1.eb:
options:
include-easyblocks-from-pr: 3045
accept-eula-for: CUDA
Loading