Skip to content

Commit

Permalink
adding unstaged changes
Browse files Browse the repository at this point in the history
  • Loading branch information
truib committed May 22, 2024
1 parent 705ca71 commit 433d588
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 140 deletions.
6 changes: 4 additions & 2 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,10 @@ ${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX}
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
# Allow skipping CUDA SDK install in e.g. CI environments
if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuDNN_host_injections.sh -c 12.1.1 -d 8.9.2.26
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \
-e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml \
-t /tmp/temp \
--accept-cuda-eula
else
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed"
fi
Expand Down
72 changes: 24 additions & 48 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,45 +105,46 @@
end
local function eessi_cuda_enabled_load_hook(t)
local function eessi_cuda_and_libraries_enabled_load_hook(t)
local frameStk = require("FrameStk"):singleton()
local mt = frameStk:mt()
local simpleName = string.match(t.modFullName, "(.-)/")
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
-- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
-- to load the CUDA module and print an informative message on how to set up GPU support for NESSI
local packagesList = { ["CUDA"] = true, ["cuDNN"] = true, ["cuTENSOR"] = true }
-- If we try to load any of the modules in packagesList, we check if the
-- full package was installed on the host in host_injections.
-- This is required for end users to build additional software that depends
-- on the package. If the full SDK isn't present, refuse
-- to load the module and print an informative message on how to set up GPU support for NESSI
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
if simpleName == 'CUDA' then
if packagesList[simpleName] then
-- simpleName is a module in packagesList
-- get the full host_injections path
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
-- build final path where the CUDA software should be installed
local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local cudaDirExists = isDir(cudaEasyBuildDir)
if not cudaDirExists then
-- build final path where the software should be installed
local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local packageDirExists = isDir(packageEasyBuildDir)
if not packageDirExists then
local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where NESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where NESSI "
advice = advice .. "can find it.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
end
end
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the NESSI linker,
-- when loading CUDA (and cu*) enabled modules check if the necessary driver libraries are accessible to the NESSI linker,
-- otherwise, refuse to load the requested module and print error message
local checkGpu = mt:haveProperty(simpleName,"arch","gpu")
local overrideGpuCheck = os.getenv("EESSI_OVERRIDE_GPU_CHECK")
if checkGpu and (overrideGpuCheck == nil) then
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
if haveGpu then
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
local cudaVersionFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cvmfs_repo = os.getenv("EESSI_CVMFS_REPO") or ""
local cudaVersionFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cudaDriverExists = isFile(cudaDriverFile)
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
if not (cudaDriverExists or singularityCudaExists) then
local advice = "which relies on the CUDA runtime environment and driver libraries. "
advice = advice .. "In order to be able to use the module, you will need "
advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system. You can "
advice = advice .. "override this check by setting the environment variable EESSI_OVERRIDE_GPU_CHECK but "
advice = advice .. "the loaded application will not be able to execute on your system.\\n"
advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
else
Expand Down Expand Up @@ -174,38 +175,13 @@
end
end
-- Load hook guarding the cuDNN module: refuse to load it (via LmodError) unless the
-- matching installation directory exists under host_injections.
-- t: table passed to the hook; only t.modFullName is read here.
local function eessi_cudnn_enabled_load_hook(t)
    -- NOTE(review): frameStk and mt are obtained but never read in this hook.
    local frameStk = require("FrameStk"):singleton()
    local mt = frameStk:mt()
    -- everything before the first "/" of the full module name (nil when there is no "/")
    local simpleName = string.match(t.modFullName, "(.-)/")
    -- this hook only concerns the cuDNN module itself
    if simpleName ~= 'cuDNN' then
        return
    end
    -- derive the host_injections path from the software path
    local softwarePath = os.getenv('EESSI_SOFTWARE_PATH') or ""
    local hostInjections = string.gsub(softwarePath, 'versions', 'host_injections')
    -- directory where the full cuDNN package is expected to be installed
    local cudnnEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
    if isDir(cudnnEasyBuildDir) then
        return
    end
    -- the full package is missing: explain why and point to the documentation
    local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
    local advice = table.concat({
        "but while the module file exists, the actual software is not entirely shipped with NESSI ",
        "due to licencing. You will need to install a full copy of the cuDNN package where NESSI ",
        "can find it.\\n",
        refer_to_docs,
    })
    LmodError("\\nYou requested to load ", simpleName, " ", advice)
end
-- Combine both functions into a single one, as we can only register one function as load hook in lmod
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
function eessi_load_hook(t)
-- Only apply CUDA and cuDNN hooks if the loaded module is in the NESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA or cuDNN module from a local software stack
-- Only apply CUDA and libraries hook if the loaded module is in the NESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA or library module from a local software stack
if from_eessi_prefix(t) then
eessi_cuda_enabled_load_hook(t)
eessi_cudnn_enabled_load_hook(t)
eessi_cuda_and_libraries_enabled_load_hook(t)
end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ easyconfigs:
options:
from-pr: 20299
- EESSI-extend-2023.06-easybuild.eb
# comment to trigger rebuild
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
- cuTENSOR-2.0.1.2-CUDA-12.1.1.eb
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,4 @@ easyconfigs:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/19451;
options:
from-pr: 19451
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
- OSU-Micro-Benchmarks-7.2-gompi-2023a-CUDA-12.1.1.eb
174 changes: 87 additions & 87 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,47 @@ def post_sanitycheck_hook(self, *args, **kwargs):
POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs)


def replace_non_distributable_files_with_symlinks(self, package, allowlist):
    """
    Replace files that cannot be distributed with symlinks into host_injections.

    Walks self.installdir. Every regular file (symlinks are left alone) is kept when its
    name stub (the text before the first '.') is in the allowlist. For packages flagged as
    extension-based below, a file is also kept when its first extension (e.g. '.h') is in
    the allowlist. Any other file is removed and replaced by a symlink pointing at the same
    path with 'versions' replaced by 'host_injections'.

    :param self: object providing 'installdir' and 'log'; callers must pass the instance
                 itself as first argument (not just its name)
    :param package: name of the package being processed; must be a key of extension_based
    :param allowlist: collection of name stubs (and, for extension-based packages,
                      extensions including the leading '.') that may be shipped
    :raises EasyBuildError: if the package is unknown, or if the symlink target would be
                            identical to the file being replaced
    """
    # maps package name -> whether file extensions are also matched against the allowlist
    extension_based = {"CUDA": False, "cuDNN": True, "cuTENSOR": True}
    # only packages listed above are supported (the check must reject *unknown* packages)
    if package not in extension_based:
        raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", package)
    match_extension = extension_based[package]

    # iterate over all files in the package installation directory
    for dir_path, _, files in os.walk(self.installdir):
        for filename in files:
            full_path = os.path.join(dir_path, filename)
            # we only really care about real files, i.e. not symlinks
            if os.path.islink(full_path):
                continue

            # check if the current file name stub (or extension) is part of the allowlist
            name_parts = filename.split('.')
            basename = name_parts[0]
            extension = None
            if match_extension and len(name_parts) > 1:
                extension = '.' + name_parts[1]

            if basename in allowlist:
                self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
            elif extension is not None and extension in allowlist:
                self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path)
            else:
                print_name = filename if match_extension else basename
                self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
                               print_name, full_path)
                # if it is not in the allowlist, delete the file and create a symlink to host_injections
                host_inj_path = full_path.replace('versions', 'host_injections')
                # make sure source and target of symlink are not the same
                if full_path == host_inj_path:
                    raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
                                         "are using this hook for an EESSI installation?",
                                         full_path, host_inj_path)
                remove_file(full_path)
                symlink(host_inj_path, full_path)


def post_sanitycheck_cuda(self, *args, **kwargs):
"""
Remove files from CUDA installation that we are not allowed to ship,
Expand Down Expand Up @@ -662,33 +703,14 @@ def post_sanitycheck_cuda(self, *args, **kwargs):
if 'libcudart' not in allowlist:
raise EasyBuildError("Did not find 'libcudart' in allowlist: %s" % allowlist)

# iterate over all files in the CUDA installation directory
for dir_path, _, files in os.walk(self.installdir):
for filename in files:
full_path = os.path.join(dir_path, filename)
# we only really care about real files, i.e. not symlinks
if not os.path.islink(full_path):
# check if the current file name stub is part of the allowlist
basename = filename.split('.')[0]
if basename in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
else:
self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
basename, full_path)
# if it is not in the allowlist, delete the file and create a symlink to host_injections
host_inj_path = full_path.replace('versions', 'host_injections')
# make sure source and target of symlink are not the same
if full_path == host_inj_path:
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
"are using this hook for an EESSI installation?",
full_path, host_inj_path)
remove_file(full_path)
symlink(host_inj_path, full_path)
# replace files that are not distributable with symlinks into
# host_injections
replace_non_distributable_files_with_symlinks(self.name, allowlist)
else:
raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!")


def post_sanitycheck_cuDNN(self, *args, **kwargs):
def post_sanitycheck_cudnn(self, *args, **kwargs):
"""
Remove files from cuDNN installation that we are not allowed to ship,
and replace them with a symlink to a corresponding installation under host_injections.
Expand All @@ -714,79 +736,57 @@ def post_sanitycheck_cuDNN(self, *args, **kwargs):
allowlist = sorted(set(allowlist))
self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist))

# iterate over all files in the CUDA installation directory
for dir_path, _, files in os.walk(self.installdir):
for filename in files:
full_path = os.path.join(dir_path, filename)
# we only really care about real files, i.e. not symlinks
if not os.path.islink(full_path):
# check if the current file is part of the allowlist
basename = filename.split('.')[0]
if '.' in filename:
extension = '.' + filename.split('.')[1]
if basename in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
elif '.' in filename and extension in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path)
else:
self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
filename, full_path)
# if it is not in the allowlist, delete the file and create a symlink to host_injections
host_inj_path = full_path.replace('versions', 'host_injections')
# make sure source and target of symlink are not the same
if full_path == host_inj_path:
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
"are using this hook for a NESSI installation?",
full_path, host_inj_path)
remove_file(full_path)
symlink(host_inj_path, full_path)
# replace files that are not distributable with symlinks into
# host_injections
replace_non_distributable_files_with_symlinks(self.name, allowlist)
else:
raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")


def inject_gpu_property(ec):
    """
    Add the 'gpu' Lmod property and EESSI<PACKAGE>VERSION environment variables via the
    modluafooter easyconfig parameter, and demote the corresponding GPU packages from
    dependencies to build dependencies.

    :param ec: easyconfig-like object providing asdict(), item assignment and a 'log'
    :return: the same ec object that was passed in
    """
    ec_dict = ec.asdict()
    # check if CUDA, cuDNN, you-name-it is in the dependencies, if so
    # - drop dependency to build dependency
    # - add 'gpu' Lmod property
    # - add envvar with package version
    packages_list = ("CUDA", "cuDNN", "cuTENSOR")
    packages_version = {}

    for package in packages_list:
        # iterate over a snapshot: matching entries are removed from the list inside the loop,
        # and removing from a list while iterating over it skips elements
        for dep in list(ec_dict['dependencies']):
            # exact name match, so that e.g. 'CUDA-Samples' is not mistaken for 'CUDA'
            if dep[0] == package:
                # make package a build dependency only (rpathing saves us from link errors)
                ec.log.info("Dropping dependency on %s to build dependency" % package)
                ec_dict['dependencies'].remove(dep)
                if dep not in ec_dict['builddependencies']:
                    ec_dict['builddependencies'].append(dep)
                # take note of version for creating the modluafooter
                packages_version[package] = dep[1]

    # only touch modluafooter if at least one of the packages was a dependency
    if packages_version:
        ec.log.info("Injecting gpu as Lmod arch property and envvars for dependencies with their version")
        key = 'modluafooter'
        values = ['add_property("arch","gpu")']
        for package, version in packages_version.items():
            envvar = "EESSI%sVERSION" % package.upper()
            values.append('setenv("%s","%s")' % (envvar, version))

        existing = ec_dict.get(key)
        if not existing:
            # no (or empty) footer yet: avoid a leading blank line / 'in None' TypeError
            ec[key] = '\n'.join(values)
        else:
            # append only the statements that are not yet part of the existing footer
            new_value = existing
            for value in values:
                if value not in new_value:
                    new_value = '\n'.join([new_value, value])
            ec[key] = new_value

    return ec


Expand Down Expand Up @@ -843,5 +843,5 @@ def inject_gpu_property(ec):

POST_SANITYCHECK_HOOKS = {
'CUDA': post_sanitycheck_cuda,
'cuDNN': post_sanitycheck_cuDNN,
'cuDNN': post_sanitycheck_cudnn,
}
Loading

0 comments on commit 433d588

Please sign in to comment.