From 4a6f0365c919c0da99ca030358967e500b04bf88 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Thu, 24 Jun 2021 19:12:37 +0200
Subject: [PATCH 01/16] Modified nexus/lib/physical_system.py to read poscar
 files with a different name, specifying "format"

---
 nexus/lib/physical_system.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/nexus/lib/physical_system.py b/nexus/lib/physical_system.py
index 295dfbb93d..93f7b65916 100644
--- a/nexus/lib/physical_system.py
+++ b/nexus/lib/physical_system.py
@@ -662,8 +662,12 @@ def generate_physical_system(**kwargs):
         is_str = isinstance(s,str)
         if is_str:
             if os.path.exists(s):
-                if 'elem' in kwargs:
+                if 'elem' in kwargs and 'format' in kwargs:
+                    s = read_structure(s,elem=kwargs['elem'],format=kwargs['format'])
+                elif 'elem' in kwargs:
                     s = read_structure(s,elem=kwargs['elem'])
+                elif 'format' in kwargs:
+                    s = read_structure(s,format=kwargs['format'])
                 else:
                     s = read_structure(s)
                 #end if
@@ -699,6 +703,7 @@ def generate_physical_system(**kwargs):
     del kwargs['net_charge']
     del kwargs['tiled_spin']
     del kwargs['extensive']
+    del kwargs['format']
     if 'particles' in kwargs:
         particles = kwargs['particles']
         del kwargs['particles']

From 8fc2c095f5eec45264cf63212d3abc786ae604a2 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Tue, 5 Apr 2022 18:14:18 +0100
Subject: [PATCH 02/16] machine archer2

---
 nexus/lib/machines.py | 91 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/nexus/lib/machines.py b/nexus/lib/machines.py
index 1f7061aa01..c03b2b6181 100644
--- a/nexus/lib/machines.py
+++ b/nexus/lib/machines.py
@@ -3225,6 +3225,96 @@ def write_job_header(self,job):
 #end class Andes
 
 
+## Added 05/04/2022 by A Zen
+class Archer2(Supercomputer):
+
+    name = 'archer2'
+    requires_account = True
+    batch_capable    = True
+    #executable_subfile = True
+    prefixed_output   = True
+    outfile_extension = '.output'
+    errfile_extension = '.error'
+
+    def post_process_job(self,job):
+        job.run_options.add(
+            N='-N {}'.format(job.nodes),
+            n='-n {}'.format(job.processes),
+            )
+        if job.threads>1:
+            job.run_options.add(
+                c = '-c {}'.format(job.threads),
+                )
+            if 'cpu_bind' not in job.run_options:
+                if job.processes_per_node==self.cores_per_node:
+                    cpu_bind = '--cpu-bind=threads'
+                else:
+                    cpu_bind = '--cpu-bind=cores'
+                #end if
+                job.run_options.add(
+                    cpu_bind = cpu_bind
+                    )
+            #end if
+        #end if
+    #end def post_process_job
+
+    def write_job_header(self,job):
+        if job.qos is None:
+            job.qos='standard'
+        #end if
+        base_partition = None
+        if job.qos == 'long':
+            max_time = 48
+            max_partition = 64
+        elif 'short' in job.qos:
+            max_time = 20.0/60.0
+            max_partition = 32
+        else:
+            max_time = 24
+            max_partition = 1024
+        #end if
+        job.total_hours = job.days*24 + job.hours + job.minutes/60.0 + job.seconds/3600.0
+        if job.total_hours > max_time:
+            self.warn('!!! ATTENTION !!!\n the maximum runtime on {0} should not be more than {1}\n you requested: {2}'.format(job.queue,max_time,job.total_hours))
+            job.hours   = max_time
+            job.minutes =0
+            job.seconds =0
+        #end if
+        if job.nodes > max_partition:
+            self.warn('!!! ATTENTION !!!\n the maximum nodes on {0} should not be more than {1}\n you requested: {2}'.format(job.queue,max_partition,job.nodes))
+            job.nodes = max_partition
+        #end if
+
+        c='#!/bin/bash\n'
+        c+='#SBATCH --job-name '+str(job.name)+'\n'
+        c+='#SBATCH --account='+str(job.account)+'\n'
+        c+='#SBATCH -N '+str(job.nodes)+'\n'
+        c += '#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
+        c += '#SBATCH --cpus-per-task={0}\n'.format(job.threads)
+        c+='#SBATCH -t {0}:{1}:{2}\n'.format(str(job.hours+24*job.days).zfill(2),str(job.minutes).zfill(2),str(job.seconds).zfill(2))
+        c+='#SBATCH -o {0}\n'.format(job.outfile)
+        c+='#SBATCH -e {0}\n'.format(job.errfile)
+        c+='#SBATCH --partition=standard\n'
+        c+='#SBATCH --qos={0}\n'.format(job.qos)
+        if job.email is not None:
+            c+='#SBATCH --mail-user {}\n'.format(job.email)
+            c+='#SBATCH --mail-type ALL\n'
+            #c+='#SBATCH --mail-type FAIL\n'
+        #end if
+        c+='\n'
+        c+='cd $SLURM_SUBMIT_DIR\n'
+        c+='\n'
+        c+='echo JobID : $SLURM_JOBID \n'
+        c+='echo Number of nodes requested: $SLURM_JOB_NUM_NODES \n'
+        c+='echo List of nodes assigned to the job: $SLURM_NODELIST \n'
+        c+='\n'
+        c+='export OMP_NUM_THREADS={0}\n'.format(job.threads)
+        c+='\n'
+        return c
+    #end def write_job_header
+#end class Archer2
+
+
 class Tomcat3(Supercomputer):
     name = 'tomcat3'
     requires_account = False
@@ -3297,6 +3387,7 @@ def write_job_header(self,job):
 Andes( 704, 2, 16, 256, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
 Tomcat3( 8, 1, 64, 192, 1000, 'mpirun', 'sbatch', 'sacct', 'scancel')
 SuperMUC_NG( 6336, 1, 48, 96, 1000,'mpiexec', 'sbatch', 'sacct', 'scancel')
+Archer2( 5860, 2, 128, 256, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
 
 
 #machine accessor functions

From d4b04f2a1bf01ccf9c0fa544277591f5334fb8aa Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Tue, 5 Apr 2022 18:15:36 +0100
Subject: [PATCH 03/16] config for archer2

---
 config/build_archer2.sh | 51 +++++++++++++++++++++++++++++++++++++++++
 config/load_archer2.sh  | 19 +++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100755 config/build_archer2.sh
 create mode 100755 config/load_archer2.sh

diff --git a/config/build_archer2.sh b/config/build_archer2.sh
new file mode 100755
index 0000000000..e27dd983a7
--- /dev/null
+++ b/config/build_archer2.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+BUILD_MODULES=config/load_archer2.sh
+
+#module purge
+#echo "Purging current module set"
+echo "Sourcing file: $BUILD_MODULES to build QMCPACK"
+
+. $BUILD_MODULES
+
+module list
+
+echo "Either source $BUILD_MODULES or load these same modules to run QMCPACK"
+
+#export BLAS_LIBS="-L$OLCF_OPENBLAS_ROOT/lib -lopenblas"
+#export LAPACK_LIBS="$BLAS_LIBS $OLCF_NETLIB_LAPACK_ROOT/lib64/liblapack.a"
+
+declare -A builds=( ["cpu"]="-DBUILD_PPCONVERT=1" \
+                    ["complex_cpu"]="-DQMC_COMPLEX=1" \
+#                    ["legacy_gpu"]="-DQMC_CUDA=1 " \
+#                    ["complex_legacy_gpu"]="-DQMC_CUDA=1 -DQMC_COMPLEX=1 " \
+                    )
+
+mkdir bin
+
+for build in "${!builds[@]}"
+do
+  echo "building: $build with ${builds[$build]}"
+  rm bin/qmcpack_${build}
+  mkdir build_${build}
+  cd build_${build}
+  cmake -DCMAKE_C_COMPILER="cc" \
+        -DCMAKE_CXX_COMPILER="CC" \
+        -DCMAKE_SYSTEM_NAME=CrayLinuxEnvironment \
+        -D LIBXML2_LIBRARY="$LIBXML2_ROOT/lib/libxml2.so" \
+        -D LIBXML2_INCLUDE_DIR="$LIBXML2_ROOT//include/libxml2" \
+        -DBUILD_LMYENGINE_INTERFACE=0 \
+        ${builds[$build]} \
+        ..
+  make -j 20
+  if [ $? -eq 0 ]; then
+    build_dir=$(pwd)
+    if [ -e ${build_dir}/bin/qmcpack_complex ]; then
+      ln -sf ${build_dir}/bin/qmcpack_complex ${build_dir}/../bin/qmcpack_${build}
+    else
+      ln -sf ${build_dir}/bin/qmcpack ${build_dir}/../bin/qmcpack_${build}
+    fi
+  fi
+  cd ..
+done
+
diff --git a/config/load_archer2.sh b/config/load_archer2.sh
new file mode 100755
index 0000000000..5d0d09d4c4
--- /dev/null
+++ b/config/load_archer2.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+echo "Loading QMCPACK dependency modules for archer2"
+echo "https://docs.archer2.ac.uk/"
+echo
+module swap PrgEnv-cray PrgEnv-gnu/8.1.0
+#module load openmpi/4.0.4
+#module load openblas/0.3.12
+#module load netlib-lapack
+#module load netlib-scalapack
+#module load cray-hdf5
+module load cray-hdf5-parallel
+module load cray-fftw
+export FFTW_ROOT=$FFTW_DIR/..
+#export FFTW_HOME=$FFTW_DIR/..
+module load libxml2
+module load cmake
+module load boost
+module load cray-python
+

From 1b550d0fd8aca685ee71014ac908ad2c8cbd3e86 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Mon, 11 Apr 2022 16:24:41 +0100
Subject: [PATCH 04/16] corrected error in number of cpus per node

---
 nexus/lib/machines.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nexus/lib/machines.py b/nexus/lib/machines.py
index c03b2b6181..94107db712 100644
--- a/nexus/lib/machines.py
+++ b/nexus/lib/machines.py
@@ -3387,7 +3387,7 @@ def write_job_header(self,job):
 Andes( 704, 2, 16, 256, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
 Tomcat3( 8, 1, 64, 192, 1000, 'mpirun', 'sbatch', 'sacct', 'scancel')
 SuperMUC_NG( 6336, 1, 48, 96, 1000,'mpiexec', 'sbatch', 'sacct', 'scancel')
-Archer2( 5860, 2, 128, 256, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
+Archer2( 5860, 1, 128, 256, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
 
 
 #machine accessor functions

From b5ada6c30ef46f48ad856a5ec23b9766f7ed2f8e Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Fri, 22 Apr 2022 12:27:44 +0100
Subject: [PATCH 05/16] config archer2 updates, correct mpi libs

---
 config/load_archer2.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/config/load_archer2.sh b/config/load_archer2.sh
index 5d0d09d4c4..7aa6299842 100755
--- a/config/load_archer2.sh
+++ b/config/load_archer2.sh
@@ -2,7 +2,11 @@
 echo "Loading QMCPACK dependency modules for archer2"
 echo "https://docs.archer2.ac.uk/"
 echo
-module swap PrgEnv-cray PrgEnv-gnu/8.1.0
+#module swap PrgEnv-cray PrgEnv-gnu
+module purge
+module load PrgEnv-gnu
+module load craype-network-ucx
+module load cray-mpich-ucx
 #module load openmpi/4.0.4
 #module load openblas/0.3.12
 #module load netlib-lapack

From 7a20892d1de7aa2c3318c8ef135fb7eb3bd9b365 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Tue, 26 Apr 2022 09:54:07 +0100
Subject: [PATCH 06/16] config for archer2
 https://docs.archer2.ac.uk/user-guide/hardware/

---
 config/load_archer2.sh | 13 +++----------
 nexus/lib/machines.py  | 23 ++++++++++++++---------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/config/load_archer2.sh b/config/load_archer2.sh
index 7aa6299842..45827ea431 100755
--- a/config/load_archer2.sh
+++ b/config/load_archer2.sh
@@ -2,22 +2,15 @@
 echo "Loading QMCPACK dependency modules for archer2"
 echo "https://docs.archer2.ac.uk/"
 echo
-#module swap PrgEnv-cray PrgEnv-gnu
-module purge
+module restore
 module load PrgEnv-gnu
-module load craype-network-ucx
-module load cray-mpich-ucx
-#module load openmpi/4.0.4
-#module load openblas/0.3.12
-#module load netlib-lapack
-#module load netlib-scalapack
-#module load cray-hdf5
 module load cray-hdf5-parallel
 module load cray-fftw
 export FFTW_ROOT=$FFTW_DIR/..
-#export FFTW_HOME=$FFTW_DIR/..
 module load libxml2
 module load cmake
 module load boost
 module load cray-python
+#module load craype-network-ucx
+#module load cray-mpich-ucx
 
diff --git a/nexus/lib/machines.py b/nexus/lib/machines.py
index 94107db712..eb2be16305 100644
--- a/nexus/lib/machines.py
+++ b/nexus/lib/machines.py
@@ -3227,6 +3227,7 @@ def write_job_header(self,job):
 
 ## Added 05/04/2022 by A Zen
 class Archer2(Supercomputer):
+    # https://docs.archer2.ac.uk/user-guide/hardware/
 
     name = 'archer2'
     requires_account = True
@@ -3238,6 +3239,7 @@ class Archer2(Supercomputer):
 
     def post_process_job(self,job):
         job.run_options.add(
+            distribution='--distribution=block:block',
             N='-N {}'.format(job.nodes),
             n='-n {}'.format(job.processes),
             )
@@ -3246,13 +3248,16 @@ class Archer2(Supercomputer):
                 c = '-c {}'.format(job.threads),
                 )
             if 'cpu_bind' not in job.run_options:
-                if job.processes_per_node==self.cores_per_node:
-                    cpu_bind = '--cpu-bind=threads'
-                else:
-                    cpu_bind = '--cpu-bind=cores'
-                #end if
+#                if job.processes_per_node==self.cores_per_node:
+#                    cpu_bind = '--cpu-bind=threads'
+#                else:
+#                    cpu_bind = '--cpu-bind=cores'
+#                #end if
+#                job.run_options.add(
+#                    cpu_bind = cpu_bind
+#                )
                 job.run_options.add(
-                    cpu_bind = cpu_bind
+                    hint='--hint=nomultithread',
                     )
             #end if
         #end if
@@ -3289,8 +3294,8 @@ class Archer2(Supercomputer):
         c+='#SBATCH --job-name '+str(job.name)+'\n'
         c+='#SBATCH --account='+str(job.account)+'\n'
         c+='#SBATCH -N '+str(job.nodes)+'\n'
-        c += '#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
-        c += '#SBATCH --cpus-per-task={0}\n'.format(job.threads)
+        c+='#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
+        c+='#SBATCH --cpus-per-task={0}\n'.format(job.threads)
         c+='#SBATCH -t {0}:{1}:{2}\n'.format(str(job.hours+24*job.days).zfill(2),str(job.minutes).zfill(2),str(job.seconds).zfill(2))
         c+='#SBATCH -o {0}\n'.format(job.outfile)
         c+='#SBATCH -e {0}\n'.format(job.errfile)
@@ -3387,7 +3392,7 @@ def write_job_header(self,job):
 Andes( 704, 2, 16, 256, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
 Tomcat3( 8, 1, 64, 192, 1000, 'mpirun', 'sbatch', 'sacct', 'scancel')
 SuperMUC_NG( 6336, 1, 48, 96, 1000,'mpiexec', 'sbatch', 'sacct', 'scancel')
-Archer2( 5860, 1, 128, 256, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
+Archer2( 5860, 2, 64, 512, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
 
 
 #machine accessor functions

From 0b72c9e1071f4d6b012d239bfd2a20f22617493c Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Tue, 3 May 2022 21:09:29 +0100
Subject: [PATCH 07/16] more efficient mpi modules on Archer2

---
 config/load_archer2.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/load_archer2.sh b/config/load_archer2.sh
index 45827ea431..5dec88c28f 100755
--- a/config/load_archer2.sh
+++ b/config/load_archer2.sh
@@ -11,6 +11,6 @@ module load libxml2
 module load cmake
 module load boost
 module load cray-python
-#module load craype-network-ucx
-#module load cray-mpich-ucx
+module load craype-network-ucx
+module load cray-mpich-ucx
 

From 6843092762d6ed6dde770721038afc5654fcb1b2 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Wed, 4 May 2022 09:18:08 +0100
Subject: [PATCH 08/16] build script for archer2, some modifications

---
 config/build_archer2.sh | 38 ++++++++++++++++++++++++++------------
 config/load_archer2.sh  | 16 ----------------
 2 files changed, 26 insertions(+), 28 deletions(-)
 delete mode 100755 config/load_archer2.sh

diff --git a/config/build_archer2.sh b/config/build_archer2.sh
index e27dd983a7..b63b323e1a 100755
--- a/config/build_archer2.sh
+++ b/config/build_archer2.sh
@@ -1,24 +1,37 @@
 #!/bin/bash
 
-BUILD_MODULES=config/load_archer2.sh
-
-#module purge
-#echo "Purging current module set"
-echo "Sourcing file: $BUILD_MODULES to build QMCPACK"
-
-. $BUILD_MODULES
+echo "ARCHER2: Information on hardware and software"
+echo "https://www.archer2.ac.uk/about/hardware.html"
+echo "and documentation:"
+echo "https://docs.archer2.ac.uk"
+echo
+echo "Loading QMCPACK dependency modules for archer2"
+echo
+module restore
+module load PrgEnv-gnu
+module load cray-hdf5-parallel
+module load cray-fftw
+export FFTW_ROOT=$FFTW_DIR/..
+module load libxml2
+module load cmake
+module load boost
+module load cray-python
+echo
+echo "Loaded modules:"
 
 module list
 
-echo "Either source $BUILD_MODULES or load these same modules to run QMCPACK"
+echo
+echo "In the run script (but not for compilation) also load the following two modules:"
+echo "   module load craype-network-ucx"
+echo "   module load cray-mpich-ucx"
+echo "which greatly improves the scaling efficiency."
+echo
+echo
 
-#export BLAS_LIBS="-L$OLCF_OPENBLAS_ROOT/lib -lopenblas"
-#export LAPACK_LIBS="$BLAS_LIBS $OLCF_NETLIB_LAPACK_ROOT/lib64/liblapack.a"
 
 declare -A builds=( ["cpu"]="-DBUILD_PPCONVERT=1" \
                     ["complex_cpu"]="-DQMC_COMPLEX=1" \
-#                    ["legacy_gpu"]="-DQMC_CUDA=1 " \
-#                    ["complex_legacy_gpu"]="-DQMC_CUDA=1 -DQMC_COMPLEX=1 " \
                     )
 
 mkdir bin
diff --git a/config/load_archer2.sh b/config/load_archer2.sh
deleted file mode 100755
index 5dec88c28f..0000000000
--- a/config/load_archer2.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-echo "Loading QMCPACK dependency modules for archer2"
-echo "https://docs.archer2.ac.uk/"
-echo
-module restore
-module load PrgEnv-gnu
-module load cray-hdf5-parallel
-module load cray-fftw
-export FFTW_ROOT=$FFTW_DIR/..
-module load libxml2
-module load cmake
-module load boost
-module load cray-python
-module load craype-network-ucx
-module load cray-mpich-ucx
-

From 1c5037ec1bc43589611b1f2c8a97cf9266bc4d0f Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Wed, 4 May 2022 17:37:54 +0100
Subject: [PATCH 09/16] update config/build_archer2.sh

---
 config/build_archer2.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/config/build_archer2.sh b/config/build_archer2.sh
index b63b323e1a..7aa2696506 100755
--- a/config/build_archer2.sh
+++ b/config/build_archer2.sh
@@ -45,12 +45,10 @@ do
   cmake -DCMAKE_C_COMPILER="cc" \
         -DCMAKE_CXX_COMPILER="CC" \
         -DCMAKE_SYSTEM_NAME=CrayLinuxEnvironment \
-        -D LIBXML2_LIBRARY="$LIBXML2_ROOT/lib/libxml2.so" \
-        -D LIBXML2_INCLUDE_DIR="$LIBXML2_ROOT//include/libxml2" \
+        -D LibXml2_ROOT=$LIBXML2_ROOT \
         -DBUILD_LMYENGINE_INTERFACE=0 \
         ${builds[$build]} \
         ..
-# -D LibXml2_ROOT=$LIBXML2_ROOT \
   make -j 20
   if [ $? -eq 0 ]; then
     build_dir=$(pwd)

From 7a0fe99a2cd957e213f0ed27e0eb00a02db1c996 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Thu, 5 May 2022 06:30:51 +0100
Subject: [PATCH 10/16] undo change to "physical_system.py"

---
 nexus/lib/physical_system.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/nexus/lib/physical_system.py b/nexus/lib/physical_system.py
index 93f7b65916..295dfbb93d 100644
--- a/nexus/lib/physical_system.py
+++ b/nexus/lib/physical_system.py
@@ -662,12 +662,8 @@ def generate_physical_system(**kwargs):
         is_str = isinstance(s,str)
         if is_str:
             if os.path.exists(s):
-                if 'elem' in kwargs and 'format' in kwargs:
-                    s = read_structure(s,elem=kwargs['elem'],format=kwargs['format'])
-                elif 'elem' in kwargs:
+                if 'elem' in kwargs:
                     s = read_structure(s,elem=kwargs['elem'])
-                elif 'format' in kwargs:
-                    s = read_structure(s,format=kwargs['format'])
                 else:
                     s = read_structure(s)
                 #end if
@@ -703,7 +699,6 @@ def generate_physical_system(**kwargs):
     del kwargs['net_charge']
     del kwargs['tiled_spin']
     del kwargs['extensive']
-    del kwargs['format']
     if 'particles' in kwargs:
         particles = kwargs['particles']
         del kwargs['particles']

From 25830a3b08d2eb576b1546e5285e350e65ae45a2 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Thu, 5 May 2022 08:57:16 +0100
Subject: [PATCH 11/16] changes to machines and test_machines for Archer2

---
 nexus/lib/machines.py             | 10 ++++------
 nexus/tests/unit/test_machines.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/nexus/lib/machines.py b/nexus/lib/machines.py
index eb2be16305..db3f6ed5f9 100644
--- a/nexus/lib/machines.py
+++ b/nexus/lib/machines.py
@@ -3240,6 +3240,7 @@ class Archer2(Supercomputer):
     def post_process_job(self,job):
         job.run_options.add(
             distribution='--distribution=block:block',
+            hint='--hint=nomultithread',
             N='-N {}'.format(job.nodes),
             n='-n {}'.format(job.processes),
             )
@@ -3247,7 +3248,7 @@ class Archer2(Supercomputer):
             job.run_options.add(
                 c = '-c {}'.format(job.threads),
                 )
-            if 'cpu_bind' not in job.run_options:
+#            if 'cpu_bind' not in job.run_options:
 #                if job.processes_per_node==self.cores_per_node:
 #                    cpu_bind = '--cpu-bind=threads'
 #                else:
@@ -3256,9 +3257,6 @@ class Archer2(Supercomputer):
 #                job.run_options.add(
 #                    cpu_bind = cpu_bind
 #                )
-                job.run_options.add(
-                    hint='--hint=nomultithread',
-                    )
             #end if
         #end if
     #end def post_process_job
@@ -3307,8 +3305,8 @@ class Archer2(Supercomputer):
         c+='\n'
-        c+='cd $SLURM_SUBMIT_DIR\n'
-        c+='\n'
+        #c+='cd $SLURM_SUBMIT_DIR\n'
+        #c+='\n'
         c+='echo JobID : $SLURM_JOBID \n'
         c+='echo Number of nodes requested: $SLURM_JOB_NUM_NODES \n'
         c+='echo List of nodes assigned to the job: $SLURM_NODELIST \n'
diff --git a/nexus/tests/unit/test_machines.py b/nexus/tests/unit/test_machines.py
index f2b60ec54d..43a699cbe1 100644
--- a/nexus/tests/unit/test_machines.py
+++ b/nexus/tests/unit/test_machines.py
@@ -1033,6 +1033,12 @@ def job_commands_equal(c1,c2):
         ('andes' , 'n2_t2' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 32 test.x',
         ('andes' , 'n2_t2_e' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 32 test.x',
         ('andes' , 'n2_t2_p2' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 4 test.x',
+        ('archer2' , 'n1' ) : 'srun --distribution=block:block --hint=nomultithread -N 1 -n 128 test.x',
+        ('archer2' , 'n1_p1' ) : 'srun --distribution=block:block --hint=nomultithread -N 1 -n 1 test.x',
+        ('archer2' , 'n2' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -n 256 test.x',
+        ('archer2' , 'n2_t2' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -c 2 -n 128 test.x',
+        ('archer2' , 'n2_t2_e' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -c 2 -n 128 test.x',
+        ('archer2' , 'n2_t2_p2' ) : 'srun --distribution=block:block --hint=nomultithread -N 2 -c 2 -n 4 test.x',
         ('attaway' , 'n1' ) : 'srun test.x',
         ('attaway' , 'n1_p1' ) : 'srun test.x',
         ('attaway' , 'n2' ) : 'srun test.x',
@@ -1244,6 +1250,8 @@ def job_commands_equal(c1,c2):
         m = supercomputers[name]
         if m.requires_account:
             acc = 'ABC123'
+            if m.name=='archer2':
+                acc = 'e743'
         else:
             acc = None
         #end if
@@ -1295,6 +1303,8 @@ def job_commands_equal(c1,c2):
         #end if
         if m.requires_account:
             acc = 'ABC123'
+            if m.name=='archer2':
+                acc = 'e743'
         else:
             acc = None
         #end if
@@ -1382,6 +1392,25 @@ def test_write_job():
 export ENV_VAR=1
 export OMP_NUM_THREADS=1
 srun -N 2 -n 64 test.x''',
+        archer2 = '''#!/bin/bash -x
+#SBATCH --job-name=jobname
+#SBATCH --account=e743
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=128
+#SBATCH --cpus-per-task=1
+#SBATCH -t 01:00:00
+#SBATCH -o test.out
+#SBATCH -e test.err
+#SBATCH --partition=standard
+#SBATCH --qos=standard
+
+echo JobID : $SLURM_JOBID
+echo Number of nodes requested: $SLURM_JOB_NUM_NODES
+echo List of nodes assigned to the job: $SLURM_NODELIST
+
+export OMP_NUM_THREADS=1
+
+srun --distribution=block:block --hint=nomultithread -N 2 -n 256 test.x''',
         attaway = '''#!/bin/bash
 #SBATCH -p batch
 #SBATCH --job-name jobname

From 07f794541c0748104f69037f4a9b90576bda2db0 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Sat, 7 May 2022 22:11:47 +0100
Subject: [PATCH 12/16] fix test error on archer2 machine

---
 nexus/lib/machines.py             | 16 +++++++---------
 nexus/tests/unit/test_machines.py | 16 ++++++----------
 2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/nexus/lib/machines.py b/nexus/lib/machines.py
index db3f6ed5f9..453ce06907 100644
--- a/nexus/lib/machines.py
+++ b/nexus/lib/machines.py
@@ -3160,10 +3160,6 @@ class Andes(Supercomputer):
     errfile_extension = '.error'
 
     def post_process_job(self,job):
-        job.run_options.add(
-            N='-N {}'.format(job.nodes),
-            n='-n {}'.format(job.processes),
-            )
         if job.threads>1:
             job.run_options.add(
                 c = '-c {}'.format(job.threads),
@@ -3179,6 +3175,10 @@ class Andes(Supercomputer):
                 )
             #end if
         #end if
+        job.run_options.add(
+            N='-N {}'.format(job.nodes),
+            n='-n {}'.format(job.processes),
+            )
     #end def post_process_job
 
     def write_job_header(self,job):
@@ -3307,11 +3307,9 @@ class Archer2(Supercomputer):
         c+='\n'
         #c+='cd $SLURM_SUBMIT_DIR\n'
         #c+='\n'
-        c+='echo JobID : $SLURM_JOBID \n'
-        c+='echo Number of nodes requested: $SLURM_JOB_NUM_NODES \n'
-        c+='echo List of nodes assigned to the job: $SLURM_NODELIST \n'
-        c+='\n'
-        c+='export OMP_NUM_THREADS={0}\n'.format(job.threads)
+        c+='echo JobID : $SLURM_JOBID\n'
+        c+='echo Number of nodes requested: $SLURM_JOB_NUM_NODES\n'
+        c+='echo List of nodes assigned to the job: $SLURM_NODELIST\n'
         c+='\n'
         return c
     #end def write_job_header
diff --git a/nexus/tests/unit/test_machines.py b/nexus/tests/unit/test_machines.py
index 43a699cbe1..90b7b04e7e 100644
--- a/nexus/tests/unit/test_machines.py
+++ b/nexus/tests/unit/test_machines.py
@@ -1250,8 +1250,6 @@ def job_commands_equal(c1,c2):
         m = supercomputers[name]
         if m.requires_account:
             acc = 'ABC123'
-            if m.name=='archer2':
-                acc = 'e743'
         else:
             acc = None
         #end if
@@ -1301,8 +1301,6 @@ def job_commands_equal(c1,c2):
         #end if
         if m.requires_account:
             acc = 'ABC123'
-            if m.name=='archer2':
-                acc = 'e743'
         else:
             acc = None
         #end if
@@ -1393,13 +1389,13 @@ def test_write_job():
 export ENV_VAR=1
 export OMP_NUM_THREADS=1
 srun -N 2 -n 64 test.x''',
-        archer2 = '''#!/bin/bash -x
-#SBATCH --job-name=jobname
-#SBATCH --account=e743
+        archer2 = '''#!/bin/bash
+#SBATCH --job-name jobname
+#SBATCH --account=ABC123
 #SBATCH -N 2
 #SBATCH --ntasks-per-node=128
 #SBATCH --cpus-per-task=1
-#SBATCH -t 01:00:00
+#SBATCH -t 06:30:00
 #SBATCH -o test.out
 #SBATCH -e test.err
 #SBATCH --partition=standard
@@ -1408,6 +1404,7 @@ def test_write_job():
 echo Number of nodes requested: $SLURM_JOB_NUM_NODES
 echo List of nodes assigned to the job: $SLURM_NODELIST
 
+export ENV_VAR=1
 export OMP_NUM_THREADS=1
 
 srun --distribution=block:block --hint=nomultithread -N 2 -n 256 test.x''',
@@ -1922,6 +1919,7 @@ def process_job_file(jf):
 def job_files_same(jf1,jf2):
     jf1 = process_job_file(jf1)
     jf2 = process_job_file(jf2)
+    if not object_eq(jf1,jf2): print(f"compare --------------------\n * wj *\n{jf1}\n * ref_wj *\n{jf2}\n")
     return object_eq(jf1,jf2)
 #end def job_files_same
 
@@ -1942,8 +1940,6 @@ def job_files_same(jf1,jf2):
         m = supercomputers[name]
         if m.requires_account:
             acc = 'ABC123'
-            if m.name=='archer2':
-                acc = 'e743'
         else:
             acc = None
         #end if

From 26c3c67d5ec3b8d78f22758f2922b4530574c5d6 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Mon, 20 Jun 2022 15:15:48 +0200
Subject: [PATCH 13/16] Daint machine

---
 nexus/lib/machines.py | 104 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/nexus/lib/machines.py b/nexus/lib/machines.py
index 453ce06907..f54850c7df 100644
--- a/nexus/lib/machines.py
+++ b/nexus/lib/machines.py
@@ -3316,6 +3316,109 @@ def write_job_header(self,job):
 #end class Archer2
 
 
+## Added 16/06/2022 by A Zen
+class Daint(Supercomputer):
+    # https://www.cscs.ch/computers/piz-daint/
+
+    name = 'daint'
+    requires_account = True
+    batch_capable    = True
+    #executable_subfile = True
+    prefixed_output   = True
+    outfile_extension = '.output'
+    errfile_extension = '.error'
+
+    def post_process_job(self,job):
+        if job.gpus is None:
+            job.gpus = 0 # gpus to use per node
+        elif job.gpus == 1 and job.processes_per_node is None:
+            job.threads = 12            # OpenMP thread(s)
+            job.processes_per_node = 1  # MPI rank(s)
+        elif job.gpus > 1:
+            self.warn('!!! ATTENTION !!!\n there is only 1 GPU/node in Daint. It is not possible to set gpus={}'.format(job.gpus))
+        if job.processes_per_node is None:
+            job.threads = 1              # OpenMP thread(s)
+            job.processes_per_node = 12  # MPI rank(s)
+        #end if
+    #end def post_process_job
+
+    def write_job_header(self,job):
+        if job.queue is None:
+            job.queue='normal'
+        #end if
+        ### Slurm batch queues
+        # https://user.cscs.ch/access/running/piz_daint/
+        base_partition = None
+        max_partition = 2400
+        if job.queue == 'long': # Maximum 5 long jobs in total (one per user)
+            max_time = 7*24
+            max_partition = 4
+        elif job.queue == 'large': # Large scale work, by arrangement only
+            max_time = 12
+            max_partition = 4400
+        elif job.queue == 'low': # Up to 130% of project's quarterly allocation
+            max_time = 6
+            max_partition = 2400
+        elif job.queue == 'prepost': # High priority pre/post processing
+            max_time = 0.5 # 30 min
+            max_partition = 1
+        else:
+            max_time = 24
+            max_partition = 2400
+        #end if
+        job.total_hours = job.days*24 + job.hours + job.minutes/60.0 + job.seconds/3600.0
+        if job.total_hours > max_time:
+            self.warn('!!! ATTENTION !!!\n the maximum runtime on {0} should not be more than {1}\n you requested: {2}'.format(job.queue,max_time,job.total_hours))
+            job.hours   = max_time
+            job.minutes = 0
+            job.seconds = 0
+        #end if
+        if job.nodes > max_partition:
+            self.warn('!!! ATTENTION !!!\n the maximum nodes on {0} should not be more than {1}\n you requested: {2}'.format(job.queue,max_partition,job.nodes))
+            job.nodes = max_partition
+        #end if
+
+        c='#!/bin/bash\n'
+        c+='#SBATCH --job-name '+str(job.name)+'\n'
+        c+='#SBATCH --account='+str(job.account)+'\n'
+        c+='#SBATCH -N '+str(job.nodes)+'\n'
+        c+='#SBATCH --ntasks-per-node={0}\n'.format(job.processes_per_node)
+        c+='#SBATCH --cpus-per-task={0}\n'.format(job.threads)
+        c+='#SBATCH -t {0}:{1}:{2}\n'.format(str(job.hours+24*job.days).zfill(2),str(job.minutes).zfill(2),str(job.seconds).zfill(2))
+        c+='#SBATCH -o {0}\n'.format(job.outfile)
+        c+='#SBATCH -e {0}\n'.format(job.errfile)
+        c+='#SBATCH --partition={}\n'.format(job.queue)
+        c+='#SBATCH --constraint=gpu\n'
+        if job.hyperthreads is None or job.hyperthreads==1:
+            c+='#SBATCH --hint=nomultithread\n'
+            c+='#SBATCH --ntasks-per-core=1\n'
+        elif job.hyperthreads==2:
+            c+='#SBATCH --hint=multithread\n'
+            c+='#SBATCH --ntasks-per-core=2\n'
+        #end if job.hyperthreads
+        if job.email is not None:
+            c+='#SBATCH --mail-user {}\n'.format(job.email)
+            c+='#SBATCH --mail-type ALL\n'
+            #c+='#SBATCH --mail-type FAIL\n'
+        #end if
+        c+='\n'
+        #c+='module load daint-gpu\n'
+        #c+='\n'
+        c+='echo JobID : $SLURM_JOBID\n'
+        c+='echo Number of nodes requested: $SLURM_JOB_NUM_NODES\n'
+        c+='echo List of nodes assigned to the job: $SLURM_NODELIST\n'
+        c+='\n'
+        c+='export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK\n'
+        if job.gpus==1:
+            c+='export CRAY_CUDA_MPS=1\n'
+        c+='\n'
+        c+='ulimit -s unlimited\n'
+        c+='\n'
+        return c
+    #end def write_job_header
+#end class Daint
+
+
 class Tomcat3(Supercomputer):
     name = 'tomcat3'
     requires_account = False
@@ -3389,6 +3492,7 @@ def write_job_header(self,job):
 Tomcat3( 8, 1, 64, 192, 1000, 'mpirun', 'sbatch', 'sacct', 'scancel')
 SuperMUC_NG( 6336, 1, 48, 96, 1000,'mpiexec', 'sbatch', 'sacct', 'scancel')
 Archer2( 5860, 2, 64, 512, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
+Daint( 5704, 1, 12, 64, 1000, 'srun', 'sbatch', 'squeue', 'scancel')
 
 
 #machine accessor functions

From 32a0ec7e2711eafb707fcb71fdc3b868131ebbe8 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Mon, 20 Jun 2022 15:18:59 +0200
Subject: [PATCH 14/16] config file Daint

---
 config/build_cscs_daint.sh | 62 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100755 config/build_cscs_daint.sh

diff --git a/config/build_cscs_daint.sh b/config/build_cscs_daint.sh
new file mode 100755
index 0000000000..0b0325f64e
--- /dev/null
+++ b/config/build_cscs_daint.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# module purge
+#echo "Purging current module set"
+
+#BUILD_MODULES=config/load_cscs_daint_modules.sh
+#echo "Sourcing file: $BUILD_MODULES to build QMCPACK"
+#. $BUILD_MODULES
+
+echo "Loading QMCPACK dependency modules for cscs piz-daint"
+echo "https://user.cscs.ch/access/running/piz_daint/"
+echo
+module swap PrgEnv-cray PrgEnv-intel
+module load daint-gpu
+module load cudatoolkit
+module load EasyBuild-custom/cscs
+module load cray-hdf5-parallel
+module load CMake
+module load cray-python
+module load Boost
+# install libxml2 for CrayIntel
+#eb libxml2-2.9.7-CrayIntel-20.08.eb -r
+#module load libxml2/2.9.7-CrayIntel-20.08
+module load libxml2
+module unload cray-libsci
+module unload cray-libsci_acc
+# make sure there is a recent gcc compiler in the path
+#module load gcc/8.3.0
+
+module list
+
+echo "Either source $BUILD_MODULES or load these same modules to run QMCPACK"
+
+declare -A builds=( \
+["cpu"]=" -DQMC_COMPLEX=0 -DQMC_CUDA=0" \
+["complex_cpu"]="-DQMC_COMPLEX=1 -DQMC_CUDA=0" \
+["legacy_gpu"]=" -DQMC_COMPLEX=0 -DQMC_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=60 -DENABLE_PHDF5=On -DCUDA_PROPAGATE_HOST_FLAGS=Off -DCUDA_HOST_COMPILER=`which gcc`" \
+["complex_legacy_gpu"]="-DQMC_COMPLEX=1 -DQMC_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=60 -DENABLE_PHDF5=On -DCUDA_PROPAGATE_HOST_FLAGS=Off -DCUDA_HOST_COMPILER=`which gcc`" \
+)
+
+mkdir bin
+
+for build in "${!builds[@]}"
+do
+  echo "building: $build with ${builds[$build]}"
+  rm bin/qmcpack_${build}
+  rm -rf build_${build}
+  mkdir build_${build}
+  cd build_${build}
+  cmake \
+    -DBUILD_LMYENGINE_INTERFACE=0 \
+    -DQMC_MPI=On -DQMC_OMP=On \
+    -DCMAKE_SYSTEM_NAME=CrayLinuxEnvironment \
+    ${builds[$build]} \
+    ..
+  make -j 20
+  if [ $? -eq 0 ]; then
+    build_dir=$(pwd)
+    ln -sf ${build_dir}/bin/qmcpack ${build_dir}/../bin/qmcpack_${build}
+  fi
+  cd ..
+done

From 002036bb8e6a4cad682d03e411131ed98b227e7e Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Sat, 23 Jul 2022 14:46:47 +0100
Subject: [PATCH 15/16] specifications Daint

---
 nexus/tests/unit/test_machines.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/nexus/tests/unit/test_machines.py b/nexus/tests/unit/test_machines.py
index 90b7b04e7e..3f1050dac6 100644
--- a/nexus/tests/unit/test_machines.py
+++ b/nexus/tests/unit/test_machines.py
@@ -1093,6 +1093,12 @@ def job_commands_equal(c1,c2):
         ('cori' , 'n2_t2' ) : 'srun test.x',
         ('cori' , 'n2_t2_e' ) : 'srun test.x',
         ('cori' , 'n2_t2_p2' ) : 'srun test.x',
+        ('daint' , 'n1' ) : 'srun -N 1 -n 12 test.x',
+        ('daint' , 'n1_p1' ) : 'srun -N 1 -n 1 test.x',
+        ('daint' , 'n2' ) : 'srun -N 2 -n 24 test.x',
+        ('daint' , 'n2_t2' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 12 test.x',
+        ('daint' , 'n2_t2_e' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 12 test.x',
+        ('daint' , 'n2_t2_p2' ) : 'srun -N 2 -c 2 --cpu-bind=cores -n 4 test.x',
         ('eclipse' , 'n1' ) : 'srun test.x',
         ('eclipse' , 'n1_p1' ) : 'srun test.x',
         ('eclipse' , 'n2' ) : 'srun test.x',
@@ -1538,6 +1544,24 @@ def test_write_job():
 export OMP_NUM_THREADS=1
 export ENV_VAR=1
 srun test.x''',
+        daint = '''#!/bin/bash
+#SBATCH --job-name jobname
+#SBATCH --account=ABC123
+#SBATCH -N 2
+#SBATCH -t 06:30:00
+#SBATCH -o test.out
+#SBATCH -e test.err
+
+cd $SLURM_SUBMIT_DIR
+
+echo JobID : $SLURM_JOBID
+echo Number of nodes requested: $SLURM_JOB_NUM_NODES
+echo List of nodes assigned to the job: $SLURM_NODELIST
+
+
+export ENV_VAR=1
+export OMP_NUM_THREADS=1
+srun -N 2 -n 64 test.x''',
         eclipse = '''#!/bin/bash
 #SBATCH -p batch
 #SBATCH --job-name jobname

From 4cdd6f9bcebca95c82f7b7245248ddecb3ce0252 Mon Sep 17 00:00:00 2001
From: Andrea Zen
Date: Tue, 26 Jul 2022 16:07:50 +0200
Subject: [PATCH 16/16] changes in machines.py

---
 nexus/lib/machines.py             | 2 ++
 nexus/tests/unit/test_machines.py | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/nexus/lib/machines.py b/nexus/lib/machines.py
index f54850c7df..d285715723 100644
--- a/nexus/lib/machines.py
+++ b/nexus/lib/machines.py
@@ -3334,11 +3334,13 @@ def post_process_job(self,job):
         elif job.gpus == 1 and job.processes_per_node is None:
             job.threads = 12            # OpenMP thread(s)
             job.processes_per_node = 1  # MPI rank(s)
+            job.hyperthreads = 1
         elif job.gpus > 1:
             self.warn('!!! ATTENTION !!!\n there is only 1 GPU/node in Daint. It is not possible to set gpus={}'.format(job.gpus))
         if job.processes_per_node is None:
             job.threads = 1              # OpenMP thread(s)
             job.processes_per_node = 12  # MPI rank(s)
+            job.hyperthreads = 1
         #end if
     #end def post_process_job
 
diff --git a/nexus/tests/unit/test_machines.py b/nexus/tests/unit/test_machines.py
index 3f1050dac6..263fbc5555 100644
--- a/nexus/tests/unit/test_machines.py
+++ b/nexus/tests/unit/test_machines.py
@@ -1548,9 +1548,15 @@ def test_write_job():
 #SBATCH --job-name jobname
 #SBATCH --account=ABC123
 #SBATCH -N 2
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=12
 #SBATCH -t 06:30:00
 #SBATCH -o test.out
 #SBATCH -e test.err
+#SBATCH --partition=normal
+#SBATCH --constraint=gpu
+#SBATCH --hint=nomultithread
+#SBATCH --ntasks-per-core=1
 
 cd $SLURM_SUBMIT_DIR
 
 echo JobID : $SLURM_JOBID
 echo Number of nodes requested: $SLURM_JOB_NUM_NODES
 echo List of nodes assigned to the job: $SLURM_NODELIST
 
 
 export ENV_VAR=1
 export OMP_NUM_THREADS=1
+export CRAY_CUDA_MPS=1
+ulimit -s unlimited\n
+
 srun -N 2 -n 64 test.x''',
         eclipse = '''#!/bin/bash
 #SBATCH -p batch