Skip to content

Commit

Permalink
various updates to take suggestions into account
Browse files Browse the repository at this point in the history
- `EESSI-install-software.sh`
  - use `scripts/gpu_support/nvidia/install_cuda_and_libraries.sh` with
    `scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml`
- `create_lmodsitepackage.py`
  - consolidate `eessi_{cuda,cudnn}_enabled_load_hook` functions in a single one
    (`eessi_cuda_and_libraries_enabled_load_hook`)
  - the remaining hook is structured so that new modules (e.g., cuTENSOR) can be added easily
- `eb_hooks.py`
  - move the code that iterates over all files and replaces non-distributable
    ones with symlinks into `host_injections` into a common function
    (`replace_non_distributable_files_with_symlinks`)
- `install_scripts.sh`
  - add files to copy to CVMFS (see `nvidia_files`)
- `scripts/gpu_support/nvidia/install_cuda_and_libraries.sh`
  - improved creation of tmp directory
  • Loading branch information
truib committed May 23, 2024
1 parent 12fcec5 commit 74a9a55
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 304 deletions.
6 changes: 4 additions & 2 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -233,8 +233,10 @@ else
fi

if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh -c 12.1.1 -d 8.9.2.26
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \
-e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml \
-t /tmp/temp \
--accept-cuda-eula
else
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
fi
Expand Down
59 changes: 20 additions & 39 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,35 +107,41 @@
end
local function eessi_cuda_enabled_load_hook(t)
local function eessi_cuda_and_libraries_enabled_load_hook(t)
local frameStk = require("FrameStk"):singleton()
local mt = frameStk:mt()
local simpleName = string.match(t.modFullName, "(.-)/")
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
-- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
-- to load the CUDA module and print an informative message on how to set up GPU support for EESSI
local packagesList = { ["CUDA"] = true, ["cuDNN"] = true }
-- If we try to load any of the modules in packagesList, we check if the
-- full package was installed on the host in host_injections.
-- This is required for end users to build additional software that depends
-- on the package. If the full SDK isn't present, refuse
-- to load the module and print an informative message on how to set up GPU support for EESSI
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
if simpleName == 'CUDA' then
if packagesList[simpleName] then
-- simpleName is a module in packagesList
-- get the full host_injections path
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
-- build final path where the CUDA software should be installed
local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local cudaDirExists = isDir(cudaEasyBuildDir)
if not cudaDirExists then
-- build final path where the software should be installed
local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local packageDirExists = isDir(packageEasyBuildDir)
if not packageDirExists then
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI "
advice = advice .. "can find it.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
end
end
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
-- when loading CUDA (and cu*) enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
-- otherwise, refuse to load the requested module and print error message
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
if haveGpu then
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cvmfs_repo = os.getenv("EESSI_CVMFS_REPO") or ""
local cudaVersionFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cudaDriverExists = isFile(cudaDriverFile)
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
if not (cudaDriverExists or singularityCudaExists) then
Expand Down Expand Up @@ -172,38 +178,13 @@
end
end
-- Load hook for the cuDNN module itself.
-- t: table describing the module being loaded; only t.modFullName is read here.
-- If the module being loaded is cuDNN, check that the full cuDNN package was installed on the host
-- in host_injections. This is required for end users to build additional cuDNN dependent software.
-- If it isn't present, refuse to load the cuDNN module (via LmodError) and print an informative
-- message on how to set up GPU support for EESSI.
-- NOTE(review): the doubled backslashes in the strings below suggest this Lua code is embedded in
-- a Python string literal; keep them as they are.
local function eessi_cudnn_enabled_load_hook(t)
    -- everything before the first '/' in the full module name (nil if there is no '/')
    local simpleName = string.match(t.modFullName, "(.-)/")
    if simpleName ~= 'cuDNN' then
        -- not the cuDNN module itself, nothing to check
        return
    end

    local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
    -- get the full host_injections path
    local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
    -- build final path where the cuDNN software should be installed
    local cudnnEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
    if not isDir(cudnnEasyBuildDir) then
        local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
        advice = advice .. "due to licencing. You will need to install a full copy of the cuDNN package where EESSI "
        advice = advice .. "can find it.\\n"
        advice = advice .. refer_to_docs
        LmodError("\\nYou requested to load ", simpleName, " ", advice)
    end
end
-- Combine both functions into a single one, as we can only register one function as load hook in lmod
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
function eessi_load_hook(t)
-- Only apply CUDA and cuDNN hooks if the loaded module is in the EESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA and cuDNN module from a local software stack
if from_eessi_prefix(t) then
eessi_cuda_enabled_load_hook(t)
eessi_cudnn_enabled_load_hook(t)
eessi_cuda_and_libraries_enabled_load_hook(t)
end
end
Expand Down
98 changes: 48 additions & 50 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,47 @@ def post_sanitycheck_hook(self, *args, **kwargs):
POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs)


def replace_non_distributable_files_with_symlinks(log, install_dir, package, allowlist):
    """
    Replace files that cannot be distributed with symlinks into host_injections

    Walks install_dir and, for every regular file (symlinks are skipped) that is
    not covered by the allowlist, removes the file and creates a symlink at the
    same location that points to the corresponding path under host_injections.

    :param log: logger object; only its debug() method is used
    :param install_dir: installation directory of the package to process
    :param package: name of the package, must be one of the supported ones
                    ("CUDA" or "cuDNN"), it determines how files are matched
    :param allowlist: collection of file name stubs (part of the file name before
                      the first '.') and, for extension-based packages, extensions
                      (like '.so') of files that may be kept
    :raises EasyBuildError: if the package is not supported, or if the symlink
                            target would be identical to the file being replaced
    """
    # True: a file may also be kept because its (first) extension is in the allowlist
    # False: only the file name stub is matched against the allowlist
    extension_based = {"CUDA": False, "cuDNN": True}
    if package not in extension_based:
        raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", package)
    match_extension = extension_based[package]

    # iterate over all files in the package installation directory
    for dir_path, _, files in os.walk(install_dir):
        for filename in files:
            full_path = os.path.join(dir_path, filename)
            # we only really care about real files, i.e. not symlinks
            if os.path.islink(full_path):
                continue

            # check if the current file name stub (or extension) is part of the allowlist
            parts = filename.split('.')
            basename = parts[0]
            extension = None
            if match_extension and len(parts) > 1:
                extension = '.' + parts[1]

            if basename in allowlist:
                log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
            elif extension is not None and extension in allowlist:
                log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path)
            else:
                print_name = filename if match_extension else basename
                log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
                          print_name, full_path)
                # if it is not in the allowlist, delete the file and create a symlink to host_injections;
                # only the first occurrence of 'versions' is replaced, so that file or directory names
                # that happen to contain 'versions' (e.g. 'conversions.h') are left intact
                host_inj_path = full_path.replace('versions', 'host_injections', 1)
                # make sure source and target of symlink are not the same
                if full_path == host_inj_path:
                    raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
                                         "are using this hook for an EESSI installation?",
                                         full_path, host_inj_path)
                remove_file(full_path)
                symlink(host_inj_path, full_path)


def post_sanitycheck_cuda(self, *args, **kwargs):
"""
Remove files from CUDA installation that we are not allowed to ship,
Expand Down Expand Up @@ -606,28 +647,9 @@ def post_sanitycheck_cuda(self, *args, **kwargs):
if 'libcudart' not in allowlist:
raise EasyBuildError("Did not find 'libcudart' in allowlist: %s" % allowlist)

# iterate over all files in the CUDA installation directory
for dir_path, _, files in os.walk(self.installdir):
for filename in files:
full_path = os.path.join(dir_path, filename)
# we only really care about real files, i.e. not symlinks
if not os.path.islink(full_path):
# check if the current file name stub is part of the allowlist
basename = filename.split('.')[0]
if basename in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
else:
self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
basename, full_path)
# if it is not in the allowlist, delete the file and create a symlink to host_injections
host_inj_path = full_path.replace('versions', 'host_injections')
# make sure source and target of symlink are not the same
if full_path == host_inj_path:
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
"are using this hook for an EESSI installation?",
full_path, host_inj_path)
remove_file(full_path)
symlink(host_inj_path, full_path)
# replace files that are not distributable with symlinks into
# host_injections
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
else:
raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!")

Expand All @@ -643,8 +665,7 @@ def post_sanitycheck_cudnn(self, *args, **kwargs):

allowlist = ['LICENSE']

# read cuDNN LICENSE, construct allowlist based on section 2. Distribution
# that specifies list of files that can be shipped
# read cuDNN LICENSE, construct allowlist based on section "2. Distribution" that specifies list of files that can be shipped
license_path = os.path.join(self.installdir, 'LICENSE')
search_string = "2. Distribution. The following portions of the SDK are distributable under the Agreement:"
with open(license_path) as infile:
Expand All @@ -660,32 +681,9 @@ def post_sanitycheck_cudnn(self, *args, **kwargs):
allowlist = sorted(set(allowlist))
self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist))

# iterate over all files in the cuDNN installation directory
for dir_path, _, files in os.walk(self.installdir):
for filename in files:
full_path = os.path.join(dir_path, filename)
# we only really care about real files, i.e. not symlinks
if not os.path.islink(full_path):
# check if the current file is part of the allowlist
basename = filename.split('.')[0]
if '.' in filename:
extension = '.' + filename.split('.')[1]
if basename in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
elif '.' in filename and extension in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path)
else:
self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
filename, full_path)
# if it is not in the allowlist, delete the file and create a symlink to host_injections
host_inj_path = full_path.replace('versions', 'host_injections')
# make sure source and target of symlink are not the same
if full_path == host_inj_path:
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
"are using this hook for a EESSI installation?",
full_path, host_inj_path)
remove_file(full_path)
symlink(host_inj_path, full_path)
# replace files that are not distributable with symlinks into
# host_injections
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
else:
raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")

Expand Down
6 changes: 5 additions & 1 deletion install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,11 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@

# Copy files for the scripts/gpu_support/nvidia directory
nvidia_files=(
install_cuda_host_injections.sh install_cudnn_host_injections.sh link_nvidia_host_libraries.sh
eessi-2023.06-cuda-and-libraries.yml
install_cuda_and_libraries.sh
install_cuda_host_injections.sh
install_cudnn_host_injections.sh
link_nvidia_host_libraries.sh
)
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"

Expand Down
6 changes: 4 additions & 2 deletions scripts/gpu_support/nvidia/install_cuda_and_libraries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,13 @@ export EESSI_SITE_INSTALL=${EESSI_SOFTWARE_PATH/versions/host_injections}
if [[ -z "${TEMP_DIR}" ]]; then
tmpdir=$(mktemp -d)
else
tmpdir="${TEMP_DIR}"/temp
if ! mkdir "$tmpdir" ; then
mkdir -p ${TEMP_DIR}
tmpdir=$(mktemp -d --tmpdir=${TEMP_DIR} cuda_n_co.XXX)
if [[ ! -d "$tmpdir" ]] ; then
fatal_error "Could not create directory ${tmpdir}"
fi
fi
echo "Created temporary directory '${tmpdir}'"

# workaround for EasyBuild not being found when loading "extend" module
module load EasyBuild/4.9.1
Expand Down
Loading

0 comments on commit 74a9a55

Please sign in to comment.