From 433d588bbf4a150a17aa7cc4e6e9e47badc64dbf Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 22 May 2024 20:24:55 +0200 Subject: [PATCH] adding unstaged changes --- EESSI-install-software.sh | 6 +- create_lmodsitepackage.py | 72 +++----- .../eessi-2023.06-eb-4.9.1-001-system.yml | 3 +- .../2023.06/eessi-2023.06-eb-4.9.1-2023a.yml | 1 - eb_hooks.py | 174 +++++++++--------- install_scripts.sh | 6 +- 6 files changed, 122 insertions(+), 140 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index d840910516..6c59f46570 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -204,8 +204,10 @@ ${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX} # TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install # Allow skipping CUDA SDK install in e.g. CI environments if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuDNN_host_injections.sh -c 12.1.1 -d 8.9.2.26 + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \ + -e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml \ + -t /tmp/temp \ + --accept-cuda-eula else echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed" fi diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index bafbb63414..d6137eb901 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -105,45 +105,46 @@ end - -local function eessi_cuda_enabled_load_hook(t) +local function eessi_cuda_and_libraries_enabled_load_hook(t) local frameStk = require("FrameStk"):singleton() local mt = frameStk:mt() local simpleName = string.match(t.modFullName, "(.-)/") - -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections. - -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse - -- to load the CUDA module and print an informative message on how to set up GPU support for NESSI + local packagesList = { ["CUDA"] = true, ["cuDNN"] = true, ["cuTENSOR"] = true } + -- If we try to load any of the modules in packagesList, we check if the + -- full package was installed on the host in host_injections. + -- This is required for end users to build additional software that depends + -- on the package. If the full SDK isn't present, refuse + -- to load the module and print an informative message on how to set up GPU support for NESSI local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n" - if simpleName == 'CUDA' then + if packagesList[simpleName] then + -- simpleName is a module in packagesList -- get the full host_injections path local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') - -- build final path where the CUDA software should be installed - local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" - local cudaDirExists = isDir(cudaEasyBuildDir) - if not cudaDirExists then + -- build final path where the software should be installed + local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" + local packageDirExists = isDir(packageEasyBuildDir) + if not packageDirExists then local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI " - advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where NESSI " + advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where NESSI " advice = advice .. "can find it.\\n" advice = advice .. refer_to_docs LmodError("\\nYou requested to load ", simpleName, " ", advice) end end - -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the NESSI linker, + -- when loading CUDA (and cu*) enabled modules check if the necessary driver libraries are accessible to the NESSI linker, -- otherwise, refuse to load the requested module and print error message - local checkGpu = mt:haveProperty(simpleName,"arch","gpu") - local overrideGpuCheck = os.getenv("EESSI_OVERRIDE_GPU_CHECK") - if checkGpu and (overrideGpuCheck == nil) then + local haveGpu = mt:haveProperty(simpleName,"arch","gpu") + if haveGpu then local arch = os.getenv("EESSI_CPU_FAMILY") or "" - local cudaVersionFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" - local cudaDriverFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" + local cvmfs_repo = os.getenv("EESSI_CVMFS_REPO") or "" + local cudaVersionFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" + local cudaDriverFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" local cudaDriverExists = isFile(cudaDriverFile) local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") if not (cudaDriverExists or singularityCudaExists) then local advice = "which relies on the CUDA runtime environment and driver libraries. " advice = advice .. "In order to be able to use the module, you will need " - advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system. You can " - advice = advice .. "override this check by setting the environment variable EESSI_OVERRIDE_GPU_CHECK but " - advice = advice .. "the loaded application will not be able to execute on your system.\\n" + advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system.\\n" advice = advice .. refer_to_docs LmodError("\\nYou requested to load ", simpleName, " ", advice) else @@ -174,38 +175,13 @@ end end -local function eessi_cudnn_enabled_load_hook(t) - local frameStk = require("FrameStk"):singleton() - local mt = frameStk:mt() - local simpleName = string.match(t.modFullName, "(.-)/") - -- If we try to load cuDNN itself, check if the full cuDNN package was installed on the host in host_injections. - -- This is required for end users to build additional cuDNN dependent software. If the full SDK isn't present, refuse - -- to load the cuDNN module and print an informative message on how to set up GPU support for NESSI - local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n" - if simpleName == 'cuDNN' then - -- get the full host_injections path - local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') - -- build final path where the cuDNN software should be installed - local cudnnEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" - local cudnnDirExists = isDir(cudnnEasyBuildDir) - if not cudnnDirExists then - local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI " - advice = advice .. "due to licencing. You will need to install a full copy of the cuDNN package where NESSI " - advice = advice .. "can find it.\\n" - advice = advice .. refer_to_docs - LmodError("\\nYou requested to load ", simpleName, " ", advice) - end - end -end - -- Combine both functions into a single one, as we can only register one function as load hook in lmod -- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed function eessi_load_hook(t) - -- Only apply CUDA and cuDNN hooks if the loaded module is in the NESSI prefix - -- This avoids getting an Lmod Error when trying to load a CUDA or cuDNN module from a local software stack + -- Only apply CUDA and libraries hook if the loaded module is in the NESSI prefix + -- This avoids getting an Lmod Error when trying to load a CUDA or library module from a local software stack if from_eessi_prefix(t) then - eessi_cuda_enabled_load_hook(t) - eessi_cudnn_enabled_load_hook(t) + eessi_cuda_and_libraries_enabled_load_hook(t) end end diff --git a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-001-system.yml b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-001-system.yml index 63f8804820..aa5a07d02a 100644 --- a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-001-system.yml +++ b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-001-system.yml @@ -3,4 +3,5 @@ easyconfigs: options: from-pr: 20299 - EESSI-extend-2023.06-easybuild.eb -# comment to trigger rebuild + - cuDNN-8.9.2.26-CUDA-12.1.1.eb + - cuTENSOR-2.0.1.2-CUDA-12.1.1.eb diff --git a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml index 5163806807..276bfa49f7 100644 --- a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml +++ b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml @@ -34,5 +34,4 @@ easyconfigs: # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19451; options: from-pr: 19451 - - cuDNN-8.9.2.26-CUDA-12.1.1.eb - OSU-Micro-Benchmarks-7.2-gompi-2023a-CUDA-12.1.1.eb diff --git a/eb_hooks.py b/eb_hooks.py index cf1c911b23..7edd67b1fc 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -623,6 +623,47 @@ def post_sanitycheck_hook(self, *args, **kwargs): POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs) +def replace_non_distributable_files_with_symlinks(self, package, allowlist): + """ + Replace files that cannot be distributed with symlinks into host_injections + """ + extension_based = { "CUDA": False, "cuDNN": True, "cuTENSOR": True } + if package in extension_based: + raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", package) + + # iterate over all files in the package installation directory + for dir_path, _, files in os.walk(self.installdir): + for filename in files: + full_path = os.path.join(dir_path, filename) + # we only really care about real files, i.e. not symlinks + if not os.path.islink(full_path): + # check if the current file name stub is part of the allowlist + basename = filename.split('.')[0] + if extension_based[package]: + if '.' in filename: + extension = '.' + filename.split('.')[1] + if basename in allowlist: + self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) + elif extension_based[package] and '.' in filename and extension in allowlist: + self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path) + else: + if extension_based[package]: + print_name = filename + else: + print_name = basename + self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s", + print_name, full_path) + # if it is not in the allowlist, delete the file and create a symlink to host_injections + host_inj_path = full_path.replace('versions', 'host_injections') + # make sure source and target of symlink are not the same + if full_path == host_inj_path: + raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " + "are using this hook for an EESSI installation?", + full_path, host_inj_path) + remove_file(full_path) + symlink(host_inj_path, full_path) + + def post_sanitycheck_cuda(self, *args, **kwargs): """ Remove files from CUDA installation that we are not allowed to ship, @@ -662,33 +703,14 @@ def post_sanitycheck_cuda(self, *args, **kwargs): if 'libcudart' not in allowlist: raise EasyBuildError("Did not find 'libcudart' in allowlist: %s" % allowlist) - # iterate over all files in the CUDA installation directory - for dir_path, _, files in os.walk(self.installdir): - for filename in files: - full_path = os.path.join(dir_path, filename) - # we only really care about real files, i.e. not symlinks - if not os.path.islink(full_path): - # check if the current file name stub is part of the allowlist - basename = filename.split('.')[0] - if basename in allowlist: - self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) - else: - self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s", - basename, full_path) - # if it is not in the allowlist, delete the file and create a symlink to host_injections - host_inj_path = full_path.replace('versions', 'host_injections') - # make sure source and target of symlink are not the same - if full_path == host_inj_path: - raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " - "are using this hook for an EESSI installation?", - full_path, host_inj_path) - remove_file(full_path) - symlink(host_inj_path, full_path) + # replace files that are not distributable with symlinks into + # host_injections + replace_non_distributable_files_with_symlinks(self.name, allowlist) else: raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!") -def post_sanitycheck_cuDNN(self, *args, **kwargs): +def post_sanitycheck_cudnn(self, *args, **kwargs): """ Remove files from cuDNN installation that we are not allowed to ship, and replace them with a symlink to a corresponding installation under host_injections. @@ -714,79 +736,57 @@ def post_sanitycheck_cuDNN(self, *args, **kwargs): allowlist = sorted(set(allowlist)) self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist)) - # iterate over all files in the CUDA installation directory - for dir_path, _, files in os.walk(self.installdir): - for filename in files: - full_path = os.path.join(dir_path, filename) - # we only really care about real files, i.e. not symlinks - if not os.path.islink(full_path): - # check if the current file is part of the allowlist - basename = filename.split('.')[0] - if '.' in filename: - extension = '.' + filename.split('.')[1] - if basename in allowlist: - self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) - elif '.' in filename and extension in allowlist: - self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path) - else: - self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s", - filename, full_path) - # if it is not in the allowlist, delete the file and create a symlink to host_injections - host_inj_path = full_path.replace('versions', 'host_injections') - # make sure source and target of symlink are not the same - if full_path == host_inj_path: - raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " - "are using this hook for a NESSI installation?", - full_path, host_inj_path) - remove_file(full_path) - symlink(host_inj_path, full_path) + # replace files that are not distributable with symlinks into + # host_injections + replace_non_distributable_files_with_symlinks(self.name, allowlist) else: raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!") def inject_gpu_property(ec): """ - Add 'gpu' property, via modluafooter easyconfig parameter + Add 'gpu' property EESSIVERSION envvars and drop dependencies to + build dependencies, via modluafooter easyconfig parameter """ ec_dict = ec.asdict() - # Check if CUDA is in the dependencies, if so add the 'gpu' Lmod property - if ('CUDA' in [dep[0] for dep in iter(ec_dict['dependencies'])]): - ec.log.info("Injecting gpu as Lmod arch property and envvar with CUDA version") + # check if CUDA, cuDNN, you-name-it is in the dependencies, if so + # - drop dependency to build dependency + # - add 'gpu' Lmod property + # - add envvar with package version + packages_list = ( "CUDA", "cuDNN", "cuTENSOR" ) + packages_version = { } + add_gpu_property = '' + + for package in packages_list: + # Check if package is in the dependencies, if so drop dependency to build + # dependency and set variable for later adding the 'gpu' Lmod property + if (package in [dep[0] for dep in iter(ec_dict['dependencies'])]): + add_gpu_property = 'add_property("arch","gpu")' + for dep in iter(ec_dict['dependencies']): + if package in dep[0]: + # make package a build dependency only (rpathing saves us from link errors) + ec.log.info("Dropping dependency on %s to build dependency" % package) + ec_dict['dependencies'].remove(dep) + if dep not in ec_dict['builddependencies']: + ec_dict['builddependencies'].append(dep) + # take note of version for creating the modluafooter + packages_version[package] = dep[1] + if add_gpu_property: + ec.log.info("Injecting gpu as Lmod arch property and envvars for dependencies with their version") key = 'modluafooter' - value = 'add_property("arch","gpu")' - cuda_version = 0 - for dep in iter(ec_dict['dependencies']): - # Make CUDA a build dependency only (rpathing saves us from link errors) - if 'CUDA' in dep[0]: - cuda_version = dep[1] - ec_dict['dependencies'].remove(dep) - if dep not in ec_dict['builddependencies']: - ec_dict['builddependencies'].append(dep) - value = '\n'.join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) - if key in ec_dict: - if not value in ec_dict[key]: - ec[key] = '\n'.join([ec_dict[key], value]) + values = [add_gpu_property] + for package, version in packages_version.items(): + envvar = "EESSI%sVERSION" % package.upper() + values.append('setenv("%s","%s")' % (envvar, version)) + if not key in ec_dict: + ec[key] = '\n'.join(values) else: - ec[key] = value - # Check if cuDNN is in the dependencies, if so add the 'gpu' Lmod property - if ('cuDNN' in [dep[0] for dep in iter(ec_dict['dependencies'])]): - ec.log.info("Injecting gpu as Lmod arch property and envvar with cuDNN version") - key = 'modluafooter' - value = 'add_property("arch","gpu")' - cudnn_version = 0 - for dep in iter(ec_dict['dependencies']): - # Make cuDNN a build dependency only (rpathing saves us from link errors) - if 'cuDNN' in dep[0]: - cudnn_version = dep[1] - ec_dict['dependencies'].remove(dep) - if dep not in ec_dict['builddependencies']: - ec_dict['builddependencies'].append(dep) - value = '\n'.join([value, 'setenv("EESSICUDNNVERSION","%s")' % cudnn_version]) - if key in ec_dict: - if not value in ec_dict[key]: - ec[key] = '\n'.join([ec_dict[key], value]) - else: - ec[key] = value + new_value = ec_dict[key] + for value in values: + if not value in new_value: + new_value = '\n'.join([new_value, value]) + ec[key] = new_value + return ec @@ -843,5 +843,5 @@ def inject_gpu_property(ec): POST_SANITYCHECK_HOOKS = { 'CUDA': post_sanitycheck_cuda, - 'cuDNN': post_sanitycheck_cuDNN, + 'cuDNN': post_sanitycheck_cudnn, } diff --git a/install_scripts.sh b/install_scripts.sh index 8bbcb6a7bf..17712a0ae7 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -110,7 +110,11 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@ # Copy files for the scripts/gpu_support/nvidia directory nvidia_files=( - install_cuda_host_injections.sh install_cuDNN_host_injections.sh link_nvidia_host_libraries.sh + eessi-2023.06-cuda-and-libraries.yml + install_cuda_and_libraries.sh + install_cuda_host_injections.sh + install_cuDNN_host_injections.sh + link_nvidia_host_libraries.sh ) copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"