[Manual] Devito on TURSA [A100 GPUs].
Account registration and management are done through SAFE: see the DiRAC login page and log in to the SAFE system.
# After completing the registration
# `ssh` into your login node (password only; no SSH keys are used)
ssh <USERNAME>@tursa.dirac.ed.ac.uk
# To quickly list the available versions of any software, remember you can do:
module avail -t 2>&1 | grep -i <keyword>
# e.g.
module avail -t 2>&1 | grep -i nvidia
# We need to build our own Python on Tursa, since the system default is 3.6 (a build sketch follows).
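A minimal sketch of building CPython 3.12.6 from source in the home directory (the download URL and build flags are assumptions; adjust to your setup):
wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz
tar xf Python-3.12.6.tgz
cd Python-3.12.6
./configure --enable-optimizations
make -j 16   # produces an in-tree `python` binary, which is what gets added to PATH below
cd ..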
# Then add to PATH
cd Python-3.12.6/
export PATH=${PWD}:$PATH
cd ../devito/
# To build mpi4py
module load gcc/9.3.0
module load nvhpc/23.5-nompi
module load openmpi/4.1.5-cuda12.3
module list
# Install mpi4py. It was compiled with the mpicc shown below:
bash-4.4$ which mpicc
/mnt/lustre/tursafs1/apps/basestack/cuda-12.3/openmpi/4.1.5-cuda12.3-slurm/bin/mpicc
CXX=$(which nvc++) CC=$(which nvc) python -m pip install --force-reinstall --no-cache-dir mpi4py
# and with the following modules loaded:
bash-4.4$ module list
Currently Loaded Modulefiles:
1) /mnt/lustre/tursafs1/home/y07/shared/tursa-modules/setup-env 4) ucx/1.15.0-cuda12.3
2) gcc/9.3.0 5) openmpi/4.1.5-cuda12.3
3) nvhpc/23.5-nompi
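A quick sanity check that the freshly built mpi4py imports and reports the expected MPI library (a sketch):
python -c "from mpi4py import MPI; print(MPI.Get_library_version())"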
# MPICC=/mnt/lustre/tursafs1/apps/basestack/cuda-12.3/openmpi/4.1.5-cuda12.3-slurm/bin/mpicc CC=nvc python -m pip install --force-reinstall --no-cache-dir mpi4py
# CXX=$(which nvc++) CC=$(which nvc) python -m pip install --force-reinstall --no-cache-dir mpi4py
# The pip command above is what worked.
# For running (as opposed to building mpi4py), we have to remove the openmpi and gcc modules and use the MPI bundled with NVHPC instead, as sketched below.
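One way to drop them, assuming the module names loaded above (ucx may be removed automatically together with openmpi):
module unload openmpi/4.1.5-cuda12.3
module unload ucx/1.15.0-cuda12.3
module unload gcc/9.3.0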
export PATH=/home/y07/shared/utils/core/nvhpc/23.5/Linux_x86_64/23.5/comm_libs/mpi/bin:$PATH
bash-4.4$ module list
Currently Loaded Modulefiles:
1) /mnt/lustre/tursafs1/home/y07/shared/tursa-modules/setup-env 2) nvhpc/23.5-nompi
srun --nodes=1 --ntasks-per-node=2 --cpus-per-task=16 python examples/seismic/acoustic/acoustic_example.py -d 124 124 124 --tn 1024 -so 8
bash-4.4$ mpicxx --version
nvc++ 23.5-0 64-bit target on x86-64 Linux -tp zen2
NVIDIA Compilers and Tools
Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Requesting an interactive job
salloc --nodes=1 --ntasks-per-node=32 --cpus-per-task=1 --time=01:00:00 --partition=gpu-a100-80 --gres=gpu:2 --qos=dev --account=<code> --gpu-freq=1410
salloc --nodes=2 --cpus-per-task=1 --time=01:00:00 --partition=gpu-a100-80 --gres=gpu:4 --qos=dev --account=<code> --job-name=dev_job --gpu-freq=1410
module load gcc/9.3.0
module load nvhpc/23.5-nompi
module load openmpi/4.1.5-cuda12.3
module list
# Then, as before, put the NVHPC-bundled MPI first in PATH:
export PATH=/home/y07/shared/utils/core/nvhpc/23.5/Linux_x86_64/23.5/comm_libs/mpi/bin:$PATH
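Once the allocation is granted, a quick check that the GPUs are visible from a compute node (a sketch; srun inherits the allocation created by salloc):
srun --ntasks=1 nvidia-smi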
#!/bin/bash
# Slurm job options
#SBATCH --job-name=GPU-1-job
#SBATCH --time=01:00:00
#SBATCH --partition=gpu-a100-80
#SBATCH --qos=standard
# Replace [budget code] below with your budget code (e.g. t01)
#SBATCH --account=dp346
# Request the right number of full nodes (48 cores per node for the A100-80 GPU nodes)
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=48
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:1
#SBATCH -o /home/dp346/dp346/dc-bisb2/gpu-jobs/output-1-gpu.%j.out # STDOUT
# Add our Python to PATH
cd /home/dp346/dp346/dc-bisb2/Python-3.12.6/
export PATH=${PWD}:$PATH
cd /home/dp346/dp346/dc-bisb2/devito
# Load the needed modules. WARNING: building mpi4py requires additional modules (see above); only nvhpc is needed at runtime.
module load nvhpc/23.5-nompi
export PATH=/home/y07/shared/utils/core/nvhpc/23.5/Linux_x86_64/23.5/comm_libs/mpi/bin:$PATH
mpicxx --version
module list
# Use a custom TMPDIR
export TMPDIR=/home/dp346/dp346/dc-bisb2/devito_temp
# Devito environment
export DEVITO_MPI=1
export DEVITO_LANGUAGE=openacc
export DEVITO_LOGGING=DEBUG
export DEVITO_PROFILING=advanced2
export DEVITO_PLATFORM=nvidiaX
export DEVITO_COMPILER=nvc
# We have reserved the full nodes, now distribute the processes as
# required: 4 MPI processes per node, stride of 12 cores between
# MPI processes
#
# Note the use of the gpu_launch.sh wrapper script for GPU and NIC pinning.
# Caveat: this wrapper seems to cause trouble, at least with OpenACC.
export DEVITO_SAFE_HALO=1
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/acoustic/acoustic_example.py -d 1158 1158 1158 --tn 1024 -so 8
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/acoustic/acoustic_example.py -d 1158 1158 1158 --tn 1024 -so 12
export DEVITO_SAFE_HALO=2
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/elastic/elastic_example.py -d 832 832 832 --tn 1024 -so 8
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/elastic/elastic_example.py -d 832 832 832 --tn 1024 -so 12
export DEVITO_SAFE_HALO=1
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/tti/tti_example.py -d 896 896 896 --tn 1024 -so 8
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/tti/tti_example.py -d 896 896 896 --tn 1024 -so 12
export DEVITO_SAFE_HALO=2
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/viscoelastic/viscoelastic_example.py -d 704 704 704 --tn 1024 -so 8
srun --nodes=1 --ntasks-per-node=4 --cpus-per-task=12 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh python examples/seismic/viscoelastic/viscoelastic_example.py -d 704 704 704 --tn 1024 -so 12
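Save the batch script above to a file and submit it with sbatch (the filename here is just an example):
sbatch gpu-1-job.slurm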
watch -n 10 'squeue --me'
watch -n 10 'squeue | grep gpu-a100'
watch -n 0.1 'nvidia-smi'
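Optionally, a more readable queue view built from standard squeue format codes (job id, name, state, elapsed time, node count, reason/nodelist):
watch -n 10 'squeue --me -o "%.10i %.12j %.8T %.10M %.6D %R"'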
ncu --version
# NVIDIA (R) Nsight Compute Command Line Profiler
# Copyright (c) 2018-2023 NVIDIA Corporation
# Version 2023.1.1.0 (build 32678585) (public-release)
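To see which other sections can be passed to --section, Nsight Compute can list them:
ncu --list-sections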
srun --nodes=1 --ntasks-per-node=2 --cpus-per-task=8 \
--hint=nomultithread --distribution=block:block \
gpu_launch.sh ncu --section "SpeedOfLight" python examples/seismic/acoustic/acoustic_example.py -d 280 158 158 --tn 4 -so 8
Notes:
TURSA BUG: when setting the GPU frequency you will see an error in the job output saying that frequency control is disabled. This message is incorrect; it is due to an issue with how Slurm sets the GPU frequency and can be safely ignored.