Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{2023.06}[system] cuDNN/8.9.2.26-CUDA-12.1.1 #581

5 changes: 3 additions & 2 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ pr_diff=$(ls [0-9]*.diff | head -1)
# for now, this just reinstalls all scripts. Not the most elegant, but works
${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX}

# Install full CUDA SDK in host_injections
# Install full CUDA SDK and cu* libraries in host_injections
# Hardcode this for now, see if it works
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
# Allow skipping CUDA SDK install in e.g. CI environments
Expand All @@ -234,8 +234,9 @@ fi

# Install the CUDA SDK and cuDNN under host_injections unless skipping was requested
if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
    ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula
    # The cuDNN installer only understands --help, -c, -d and -t: any other option
    # (such as --accept-cuda-eula) makes it abort with "Unknown option", and it needs
    # the cuDNN version (-d) to construct the name of the easyconfig to install.
    ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh -c 12.1.1 -d 8.9.2.26
else
    echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
    echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
fi

# Install drivers in host_injections
Expand Down
29 changes: 27 additions & 2 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,38 @@
end
end

-- Load hook that guards the cuDNN module itself.
-- When the module being loaded is named cuDNN, verify that the matching installation exists under
-- host_injections (the path derived from EESSI_SOFTWARE_PATH with 'versions' replaced by 'host_injections').
-- The full package is needed by end users to build additional cuDNN dependent software; if it is missing,
-- refuse to load the module via LmodError and point to the EESSI GPU documentation.
-- @param t table describing the module being loaded; only t.modFullName ("<name>/<version>") is read
-- NOTE(review): the doubled backslashes in the messages are kept exactly as they were; presumably this
-- Lua text is emitted from a Python string template where they collapse to a newline escape - confirm.
local function eessi_cudnn_enabled_load_hook(t)
    -- name part in front of the first "/" (nil when modFullName contains no "/")
    local simpleName = string.match(t.modFullName, "(.-)/")
    -- every module other than cuDNN itself is left alone
    if simpleName ~= 'cuDNN' then
        return
    end

    -- get the full host_injections path
    local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
    -- build final path where the cuDNN software should be installed
    local cudnnEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
    if isDir(cudnnEasyBuildDir) then
        return
    end

    -- the host installation is missing: explain why and how to fix it, then abort the load
    local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
    local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
    advice = advice .. "due to licencing. You will need to install a full copy of the cuDNN package where EESSI "
    advice = advice .. "can find it.\\n"
    advice = advice .. refer_to_docs
    LmodError("\\nYou requested to load ", simpleName, " ", advice)
end

-- Lmod lets us register only one function as load hook, so this wrapper dispatches to the individual hooks.
-- It is intentionally non-local, so it can be imported and extended by other lmodrc files if needed.
function eessi_load_hook(t)
    -- Modules from outside the EESSI prefix are left alone: this avoids getting an Lmod Error when
    -- trying to load a CUDA or cuDNN module from a local software stack
    if not from_eessi_prefix(t) then
        return
    end
    eessi_cuda_enabled_load_hook(t)
    eessi_cudnn_enabled_load_hook(t)
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ easyconfigs:
options:
from-pr: 20299
- EESSI-extend-2023.06-easybuild.eb
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
79 changes: 79 additions & 0 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,64 @@ def post_sanitycheck_cuda(self, *args, **kwargs):
raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!")



def post_sanitycheck_cudnn(self, *args, **kwargs):
"""
Remove files from cuDNN installation that we are not allowed to ship,
and replace them with a symlink to a corresponding installation under host_injections.

The allowlist of files that may stay is 'LICENSE' plus every word starting with a dot
(i.e. a file extension) found on the LICENSE line that starts with the
"2. Distribution." sentence. A regular (non-symlink) file is kept when either the part
of its name before the first dot, or '.' + its second dot-separated component, is in
the allowlist; any other file is removed and replaced by a symlink to the same path
with 'versions' replaced by 'host_injections'.

Raises EasyBuildError if that replacement leaves the path unchanged (source and target
of the symlink would be identical).
"""
if self.name == 'cuDNN':
print_msg("Replacing files in cuDNN installation that we can not ship with symlinks to host_injections...")

# the LICENSE file itself is always kept
allowlist = ['LICENSE']

# read cuDNN LICENSE, construct allowlist based on section 2. Distribution
# that specifies list of files that can be shipped
license_path = os.path.join(self.installdir, 'LICENSE')
search_string = "2. Distribution. The following portions of the SDK are distributable under the Agreement:"
with open(license_path) as infile:
for line in infile:
# NOTE(review): the match is done on the stripped line, but the slice below uses the
# unstripped line; with leading whitespace the slice offset would be off - confirm
if line.strip().startswith(search_string):
# remove search string, split into words, remove trailing
# dots '.' and only retain words starting with a dot '.'
distributable = line[len(search_string):]
for word in distributable.split():
if word[0] == '.':
allowlist.append(word.rstrip('.'))

# de-duplicate and sort so the logged allowlist is stable
allowlist = sorted(set(allowlist))
self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist))

# iterate over all files in the cuDNN installation directory
for dir_path, _, files in os.walk(self.installdir):
for filename in files:
full_path = os.path.join(dir_path, filename)
# we only really care about real files, i.e. not symlinks
if not os.path.islink(full_path):
# check if the current file is part of the allowlist
basename = filename.split('.')[0]
# 'extension' is only (re)assigned for names containing a dot; the elif below
# repeats the same test so a stale value from a previous file is never used
if '.' in filename:
extension = '.' + filename.split('.')[1]
if basename in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
elif '.' in filename and extension in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path)
else:
self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
filename, full_path)
# if it is not in the allowlist, delete the file and create a symlink to host_injections
# NOTE(review): str.replace substitutes every occurrence of 'versions' in the path,
# not only the first one - confirm this is intended
host_inj_path = full_path.replace('versions', 'host_injections')
# make sure source and target of symlink are not the same
if full_path == host_inj_path:
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
"are using this hook for a NESSI installation?",
full_path, host_inj_path)
remove_file(full_path)
# presumably creates full_path as a symlink pointing at host_inj_path - verify helper's argument order
symlink(host_inj_path, full_path)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this identical to what is done for CUDA? We should probably just create a function that takes the installdir and allowlist as arguments and does this.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point.

Copy link
Collaborator Author

@trz42 trz42 May 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually there are subtle differences. For CUDA, the EULA/README lists files you can distribute. For cuDNN the LICENSE lists what type of files you can distribute. These differences require small modifications. For example, in the hook for CUDA we have:

                    basename = filename.split('.')[0]
                    if basename in allowlist:
                        self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
                    else:
                        self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
                                       basename, full_path)

For cuDNN, we have

                    basename = filename.split('.')[0]
                    if '.' in filename:
                        extension = '.' + filename.split('.')[1]
                    if basename in allowlist:
                        self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
                    elif '.' in filename and extension in allowlist:
                        self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path)
                    else:
                        self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
                                       filename, full_path)

Anyhow, the differences are relatively small, so a function would require a parameter that allows it to distinguish between CUDA and cuDNN (and in the future maybe other packages such as cuTENSOR).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the extension part, perhaps we should split on all . and look for the last non-numeric entry (which should be the extension)? I can imagine there could be files like libcuda.so.520.12.1

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Scratch that, you already have a good solution, you are taking the second entry which is virtually guaranteed to be the extension

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a function that implements the suggestion in 74a9a55

else:
raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")


def inject_gpu_property(ec):
"""
Add 'gpu' property, via modluafooter easyconfig parameter
Expand All @@ -656,6 +714,26 @@ def inject_gpu_property(ec):
ec[key] = '\n'.join([ec_dict[key], value])
else:
ec[key] = value

# Check if cuDNN is in the dependencies, if so add the 'gpu' Lmod property
if ('cuDNN' in [dep[0] for dep in iter(ec_dict['dependencies'])]):
ec.log.info("Injecting gpu as Lmod arch property and envvar with cuDNN version")
key = 'modluafooter'
value = 'add_property("arch","gpu")'
cudnn_version = 0
for dep in iter(ec_dict['dependencies']):
# Make cuDNN a build dependency only (rpathing saves us from link errors)
if 'cuDNN' in dep[0]:
cudnn_version = dep[1]
ec_dict['dependencies'].remove(dep)
if dep not in ec_dict['builddependencies']:
ec_dict['builddependencies'].append(dep)
value = '\n'.join([value, 'setenv("EESSICUDNNVERSION","%s")' % cudnn_version])
if key in ec_dict:
if not value in ec_dict[key]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check is probably no longer good enough, we're looking for the exact string, but that is not likely to exist (even though the add_property("arch","gpu") most likely does exist since the applications also should have a CUDA dep). What we really need to do is

  • Grab what is there already
  • Split it on \n
  • Add any missing elements
  • Put it back together again and replace it

Either this, or only the modify/add the modluafooter once in the entire function

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean the if not value in ec_dict[key] is not good enough?

Copy link
Member

@ocaisa ocaisa May 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, because the value is a composite string of property and the setenv, and the property will already (very likely) exist from the CUDA part of this hook

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The module file for cuDNN contains the following

-- Built with EasyBuild version 4.9.1

add_property("arch","gpu")
setenv("EESSICUDAVERSION","12.1.1")

For something that builds on top of cuDNN, we would have the above and something like

setenv("EESSICUDNNVERSION","8.9.2.26")

Copy link
Member

@ocaisa ocaisa May 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As currently implemented, for something that builds on top of cuDNN I believe you will have

-- Built with EasyBuild version 4.9.1

add_property("arch","gpu")
setenv("EESSICUDAVERSION","12.1.1")
add_property("arch","gpu")
setenv("EESSICUDNNVERSION","8.9.2.26")

as it will see if the entire string add_property("arch","gpu")\nsetenv("EESSICUDNNVERSION","8.9.2.26") is in the footer

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ack. Working on something to implement the desired footer (and avoiding duplication of code).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the function. It still produces the same footer for cuDNN. I guess a real test would be a build that uses cuDNN. @ocaisa can you check if the function looks better now?

ec[key] = '\n'.join([ec_dict[key], value])
else:
ec[key] = value
return ec


Expand Down Expand Up @@ -709,4 +787,5 @@ def inject_gpu_property(ec):

POST_SANITYCHECK_HOOKS = {
'CUDA': post_sanitycheck_cuda,
'cuDNN': post_sanitycheck_cudnn,
}
2 changes: 1 addition & 1 deletion install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@

# Copy files for the scripts/gpu_support/nvidia directory
nvidia_files=(
install_cuda_host_injections.sh link_nvidia_host_libraries.sh
install_cuda_host_injections.sh install_cudnn_host_injections.sh link_nvidia_host_libraries.sh
)
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"

Expand Down
210 changes: 210 additions & 0 deletions scripts/gpu_support/nvidia/install_cudnn_host_injections.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#!/usr/bin/env bash

# This script can be used to install cuDNN under the `.../host_injections` directory.
# This provides the parts of the cuDNN installation that cannot be redistributed as
# part of EESSI due to license limitations. While GPU-based software from EESSI will
# _run_ without these, installing additional software that depends on cuDNN requires
# the cuDNN installation(s) under `host_injections` to be present.
#
# The `host_injections` directory is a variant symlink that by default points to
# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see
# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the
# installation to be successful, this directory needs to be writeable by the user
# executing this script.

# Initialise our bash functions
# (utils.sh lives two directories above this script; quoting keeps this working
# when the checkout path contains whitespace)
TOPDIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
source "$TOPDIR"/../../utils.sh

# Print the usage summary of this script (one line per entry) to stdout
show_help() {
    printf '%s\n' \
        "Usage: $0 [OPTIONS]" \
        "Options:" \
        " --help Display this help message" \
        " -c, --cuda-version CUDA_VERSION Specify a version of CUDA to be used" \
        " when installing cuDNN (must" \
        " have a corresponding easyconfig in the" \
        " EasyBuild release)" \
        " -d, --cudnn-version CUDNN_VERSION Specify a version of cuDNN to install (must" \
        " have a corresponding easyconfig in the" \
        " EasyBuild release)" \
        " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" \
        " storage during the cuDNN install" \
        " (must have >10GB available)"
}

# Defaults for the values that can be set on the command line
cuda_version=""
cudnn_version=""

# Walk over the command line; every option except --help takes exactly one value
while [[ $# -gt 0 ]]; do
    case "$1" in
        --help)
            show_help
            exit 0
            ;;
        -c|--cuda-version|-d|--cudnn-version|-t|--temp-dir)
            # all value-taking options share the same "missing value" handling
            if [ -z "$2" ]; then
                echo "Error: Argument required for $1"
                show_help
                exit 1
            fi
            case "$1" in
                -c|--cuda-version)
                    cuda_version="$2"
                    ;;
                -d|--cudnn-version)
                    cudnn_version="$2"
                    ;;
                -t|--temp-dir)
                    CUDA_TEMP_DIR="$2"
                    ;;
            esac
            shift 2
            ;;
        *)
            show_help
            fatal_error "Error: Unknown option: $1"
            ;;
    esac
done

# Make sure EESSI is initialised
check_eessi_initialised

# Make sure the CUDA version supplied is a semantic version
is_semantic_version() {
    # Exit status 0 when $1 consists of exactly three dot-separated groups of digits
    # (e.g. 12.1.1), exit status 1 otherwise
    local pattern='^[0-9]+\.[0-9]+\.[0-9]+$'
    [[ $1 =~ $pattern ]]
}
if ! is_semantic_version "$cuda_version"; then
    show_help
    error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n"
    error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n"
    error="${error}version to provide is probably one of those available under\n"
    error="${error}$EESSI_SOFTWARE_PATH/software/cuDNN\n"
    fatal_error "${error}"
fi

# The cuDNN version becomes part of the easyconfig name that is constructed further
# down in this script, so an empty value can never resolve to an existing easyconfig:
# fail early with a clear message instead of a confusing search failure later on.
if [[ -z "${cudnn_version}" ]]; then
    show_help
    error="\nYou must provide a version for cuDNN (e.g., 8.9.2.26) via the -d/--cudnn-version\n"
    error="${error}command line option.\n"
    fatal_error "${error}"
fi

# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
cudnn_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections}

# Only install cuDNN if specified version is not found.
# (existence of easybuild subdir implies a successful install)
# When a cuDNN version was given, look for exactly that installation; otherwise fall
# back to any cuDNN installation for the requested CUDA version. The candidates are
# tested one by one because a glob that matches several directories cannot be passed
# to a single `[ -d ... ]` test.
if [[ -n "${cudnn_version}" ]]; then
    cudnn_candidates=( "${cudnn_install_parent}/software/cuDNN/${cudnn_version}-CUDA-${cuda_version}/easybuild" )
else
    cudnn_candidates=( "${cudnn_install_parent}"/software/cuDNN/*-CUDA-"${cuda_version}"/easybuild )
fi
cudnn_found=""
for cudnn_candidate in "${cudnn_candidates[@]}"; do
    # an unmatched glob stays a literal string, which simply fails the -d test
    if [ -d "${cudnn_candidate}" ]; then
        cudnn_found="${cudnn_candidate}"
        break
    fi
done

if [[ -n "${cudnn_found}" ]]; then
    echo_green "cuDNN software found! No need to install cuDNN again."
else
    # We need to be able write to the installation space so let's make sure we can
    if ! create_directory_structure "${cudnn_install_parent}"/software/cuDNN ; then
        fatal_error "No write permissions to directory ${cudnn_install_parent}/software/cuDNN"
    fi

    # we need a directory we can use for temporary storage
    if [[ -z "${CUDA_TEMP_DIR}" ]]; then
        tmpdir=$(mktemp -d)
    else
        tmpdir="${CUDA_TEMP_DIR}"/temp
        if ! mkdir "$tmpdir" ; then
            fatal_error "Could not create directory ${tmpdir}"
        fi
    fi

    # Space requirements are compared against the numbers reported by `df --output=avail`
    required_space_in_tmpdir=50000
    # Let's see if we have sources and build locations defined if not, we use the temporary space
    if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then
        export EASYBUILD_BUILDPATH=${tmpdir}/build
        required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
    fi
    if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then
        export EASYBUILD_SOURCEPATH=${tmpdir}/sources
        required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
    fi

    # The install is pretty fat, you need lots of space for download/unpack/install,
    # need to do a space check before we proceed
    avail_space=$(df --output=avail "${cudnn_install_parent}"/ | tail -n 1 | awk '{print $1}')
    if (( avail_space < 5000000 )); then
        fatal_error "Need at least 5GB disk space to install cuDNN under ${cudnn_install_parent}, exiting now..."
    fi
    avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}')
    if (( avail_space < required_space_in_tmpdir )); then
        error="Need at least ${required_space_in_tmpdir}KB of disk space under ${tmpdir}.\n"
        error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check.\n"
        error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH "
        error="${error}to reduce this requirement. Exiting now..."
        fatal_error "${error}"
    fi

    if ! command -v "eb" &>/dev/null; then
        echo_yellow "Attempting to load an EasyBuild module to do actual install"
        # There are some scenarios where this may fail
        if ! module load EasyBuild; then
            error="'eb' command not found in your environment and\n"
            error="${error}  module load EasyBuild\n"
            error="${error}failed for some reason.\n"
            error="${error}Please re-run this script with the 'eb' command available."
            fatal_error "${error}"
        fi
    fi

    cudnn_easyconfig="cuDNN-${cudnn_version}-CUDA-${cuda_version}.eb"

    # Check the easyconfig file is available in the release
    # (eb search always returns 0, so we need a grep to ensure a usable exit code)
    # The search patterns are quoted so the shell never expands them against files in
    # the current directory.
    if ! eb --search "^${cudnn_easyconfig}" | grep cuDNN > /dev/null 2>&1; then
        eb_version=$(eb --version)
        available_cudnn_easyconfigs=$(eb --search "^cuDNN-*.eb" | grep cuDNN)

        error="The easyconfig ${cudnn_easyconfig} was not found in EasyBuild version:\n"
        error="${error}  ${eb_version}\n"
        error="${error}You either need to give a different version of cuDNN and/or CUDA to install _or_ \n"
        error="${error}use a different version of EasyBuild for the installation.\n"
        error="${error}\nThe cuDNN easyconfigs available with the current eb command are:\n"
        error="${error}${available_cudnn_easyconfigs}"
        fatal_error "${error}"
    fi

    # We need the --rebuild option, as the cuDNN module may or may not be on the
    # `MODULEPATH` yet. Even if it is, we still want to redo this installation
    # since it will provide the symlinked targets for the parts of the cuDNN
    # installation in the `.../versions/...` prefix
    # We install the module in our `tmpdir` since we do not need the modulefile,
    # we only care about providing the targets for the symlinks.
    extra_args="--rebuild --installpath-modules=${tmpdir}"

    # NOTE(review): no EULA-related option is passed to eb here, although the caller of
    # the CUDA installer passes --accept-cuda-eula - confirm whether the cuDNN
    # easyconfig needs an explicit EULA acceptance to install.

    # We don't want hooks used in this install, we need a vanilla cuDNN installation
    touch "$tmpdir"/none.py
    # shellcheck disable=SC2086  # Intended splitting of extra_args
    eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cudnn_install_parent}"/ "${cudnn_easyconfig}"
    ret=$?
    if [ $ret -ne 0 ]; then
        # keep a copy of the EasyBuild log in the current directory for inspection
        eb_last_log=$(unset EB_VERBOSE; eb --last-log)
        cp -a "${eb_last_log}" .
        fatal_error "cuDNN installation failed, please check EasyBuild logs $(basename "${eb_last_log}")..."
    else
        echo_green "cuDNN installation at ${cudnn_install_parent}/software/cuDNN/${cudnn_version}-CUDA-${cuda_version} succeeded!"
    fi
    # clean up tmpdir
    rm -rf "${tmpdir}"
fi
Loading