From 3cbe71f7f0ba97bec4a6563fb84cf103f92c48de Mon Sep 17 00:00:00 2001
From: david huber
Date: Tue, 24 Sep 2024 11:36:17 +0000
Subject: [PATCH] Initial port to Acorn.

---
Review notes (this section is ignored by git am / git apply):
 * gsi_acorn.intel.lua: removed four *_ver locals that no load() used.
 * regression_var.sh: Acorn is detected from the ".acorn." host domain so
   adecflow nodes match too, as in ush/detect_machine.sh.
 * sub_acorn: bash shebang, -a is honoured, nodes defaults to 1, \$line is
   escaped in the generated job, duplicate -n/-v dump removed.
 * Reflowed from a whitespace-collapsed copy: indentation and blank context
   lines are reconstructed, whitespace-only -/+ pairs look identical, and
   blob ids are stale. Regenerate with git format-patch before applying.

 modulefiles/gsi_acorn.intel.lua |  51 ++++++++++
 regression/regression_param.sh  |  41 ++++----
 regression/regression_var.sh    |  10 +-
 ush/detect_machine.sh           |   4 +-
 ush/module-setup.sh             |   4 +
 ush/sub_acorn                   | 170 ++++++++++++++++++++++++++++++++
 6 files changed, 257 insertions(+), 23 deletions(-)
 create mode 100644 modulefiles/gsi_acorn.intel.lua
 create mode 100755 ush/sub_acorn

diff --git a/modulefiles/gsi_acorn.intel.lua b/modulefiles/gsi_acorn.intel.lua
new file mode 100644
index 0000000000..401a2a98e3
--- /dev/null
+++ b/modulefiles/gsi_acorn.intel.lua
@@ -0,0 +1,51 @@
+help([[
+Loads the compilers, MPI, and libraries needed to build and run GSI on
+WCOSS2 Acorn.
+]])
+
+-- Each version can be overridden through the like-named environment variable.
+-- PrgEnv-intel, intel, craype, and cray-mpich are loaded without a version,
+-- i.e. at the system default, so no version variables are kept for them.
+local cmake_ver=os.getenv("cmake_ver") or "3.20.2"
+local python_ver=os.getenv("python_ver") or "3.8.6"
+local prod_util_ver=os.getenv("prod_util_ver") or "2.0.10"
+
+local netcdf_ver=os.getenv("netcdf_ver") or "4.7.4"
+local bufr_ver=os.getenv("bufr_ver") or "11.7.0"
+local bacio_ver=os.getenv("bacio_ver") or "2.4.1"
+local w3emc_ver=os.getenv("w3emc_ver") or "2.9.2"
+local sp_ver=os.getenv("sp_ver") or "2.3.3"
+local ip_ver=os.getenv("ip_ver") or "3.3.3"
+local sigio_ver=os.getenv("sigio_ver") or "2.3.2"
+local sfcio_ver=os.getenv("sfcio_ver") or "1.4.1"
+local nemsio_ver=os.getenv("nemsio_ver") or "2.5.4"
+local wrf_io_ver=os.getenv("wrf_io_ver") or "1.2.0"
+local ncio_ver=os.getenv("ncio_ver") or "1.1.2"
+local crtm_ver=os.getenv("crtm_ver") or "2.4.0"
+local ncdiag_ver=os.getenv("ncdiag_ver") or "1.1.1"
+
+load("PrgEnv-intel")
+load("intel")
+load("craype")
+load("cray-mpich")
+load(pathJoin("cmake", cmake_ver))
+load(pathJoin("python", python_ver))
+load(pathJoin("prod_util", prod_util_ver))
+load(pathJoin("netcdf", netcdf_ver))
+load(pathJoin("bufr", bufr_ver))
+load(pathJoin("bacio", bacio_ver))
+load(pathJoin("w3emc", w3emc_ver))
+load(pathJoin("sp", sp_ver))
+load(pathJoin("ip", ip_ver))
+load(pathJoin("sigio", sigio_ver))
+load(pathJoin("sfcio", sfcio_ver))
+load(pathJoin("nemsio", nemsio_ver))
+load(pathJoin("wrf_io", wrf_io_ver))
+load(pathJoin("ncio", ncio_ver))
+load(pathJoin("crtm", crtm_ver))
+load(pathJoin("ncdiag", ncdiag_ver))
+
+-- pushenv restores any previous value when the module is unloaded.
+pushenv("GSI_BINARY_SOURCE_DIR", "/lfs/h2/emc/global/noscrub/emc.global/FIX/fix/gsi/20230911")
+
+whatis("Description: GSI environment on WCOSS2 Acorn")
diff --git a/regression/regression_param.sh b/regression/regression_param.sh
index 209762569b..34d5964d87 100755
--- a/regression/regression_param.sh
+++ b/regression/regression_param.sh
@@ -4,8 +4,8 @@ regtest=$1
 
 case $machine in
 
-    Hera)
-       sub_cmd="sub_hera"
+    Hera)
+       sub_cmd="sub_hera"
        memnode=96
        numcore=40
     ;;
@@ -19,23 +19,28 @@ case $machine in
        memnode=512
        numcore=40
     ;;
-    Jet)
-       sub_cmd="sub_jet"
+    Jet)
+       sub_cmd="sub_jet"
        memnode=96
        numcore=40
     ;;
-    Gaea)
-       sub_cmd="sub_gaea"
+    Gaea)
+       sub_cmd="sub_gaea"
        memnode=251
        numcore=128
     ;;
-    wcoss2)
-       sub_cmd="sub_wcoss2"
+    wcoss2)
+       sub_cmd="sub_wcoss2"
        memnode=512
        numcore=128
     ;;
-    Discover)
-       sub_cmd="sub_discover"
+    acorn)
+       sub_cmd="sub_acorn"
+       memnode=512
+       numcore=128
+    ;;
+    Discover)
+       sub_cmd="sub_discover"
     ;;
     *) # EXIT out for unresolved machine
        echo "unknown $machine"
@@ -71,7 +76,7 @@ case $regtest in
         elif [[ "$machine" = "Gaea" ]]; then
            topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
            topts[2]="0:10:00" ; popts[2]="12/10/" ; ropts[2]="/2"
-        elif [[ "$machine" = "wcoss2" ]]; then
+        elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
            topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
            topts[2]="0:10:00" ; popts[2]="12/10/" ; ropts[2]="/2"
         fi
@@ -101,7 +106,7 @@ case $regtest in
         elif [[ "$machine" = "Gaea" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
-        elif [[ "$machine" = "wcoss2" ]]; then
+        elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
         fi
@@ -131,7 +136,7 @@ case $regtest in
         elif [[ "$machine" = "Gaea" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
-        elif [[ "$machine" = "wcoss2" ]]; then
+        elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
         fi
@@ -160,7 +165,7 @@ case $regtest in
         elif [[ "$machine" = "Gaea" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
-        elif [[ "$machine" = "wcoss2" ]]; then
+        elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
         fi
@@ -190,7 +195,7 @@ case $regtest in
         elif [[ "$machine" = "Gaea" ]]; then
            topts[1]="0:15:00" ; popts[1]="28/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="28/2/" ; ropts[2]="/1"
-        elif [[ "$machine" = "wcoss2" ]]; then
+        elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="64/2/" ; ropts[2]="/1"
         fi
@@ -220,7 +225,7 @@ case $regtest in
         elif [[ "$machine" = "Gaea" ]]; then
            topts[1]="0:30:00" ; popts[1]="14/8/" ; ropts[1]="/1"
            topts[2]="0:30:00" ; popts[2]="14/14/" ; ropts[2]="/1"
-        elif [[ "$machine" = "wcoss2" ]]; then
+        elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
            topts[1]="0:30:00" ; popts[1]="14/8/" ; ropts[1]="/1"
            topts[2]="0:30:00" ; popts[2]="14/14/" ; ropts[2]="/2"
         fi
@@ -250,7 +255,7 @@ case $regtest in
         elif [[ "$machine" = "Gaea" ]]; then
            topts[1]="0:10:00" ; popts[1]="16/2/" ; ropts[1]="/1"
            topts[2]="0:10:00" ; popts[2]="16/4/" ; ropts[2]="/2"
-        elif [[ "$machine" = "wcoss2" ]]; then
+        elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
            topts[1]="0:10:00" ; popts[1]="16/2/" ; ropts[1]="/1"
            topts[2]="0:10:00" ; popts[2]="16/4/" ; ropts[2]="/2"
         fi
@@ -316,7 +321,7 @@ elif [[ "$machine" = "Gaea" ]]; then
    export MPI_BUFS_PER_HOST=256
    export MPI_GROUP_MAX=256
    export APRUN="srun --export=ALL -n \$ntasks"
-elif [[ "$machine" = "wcoss2" ]]; then
+elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
    export OMP_PLACES=cores
    export OMP_STACKSIZE=2G
    export FORT_BUFFERED=true
diff --git a/regression/regression_var.sh b/regression/regression_var.sh
index 4a2bc85874..d4e7d99aa0 100755
--- a/regression/regression_var.sh
+++ b/regression/regression_var.sh
@@ -47,8 +47,12 @@ elif [[ -d /work ]]; then # Orion or Hercules
   else
     export machine="Orion"
   fi
-elif [[ -d /lfs/h2 ]]; then # wcoss2
-    export machine="wcoss2"
+elif [[ -d /lfs/h2 ]]; then # wcoss2 or acorn
+  if [[ $(hostname -f) == *.acorn.* ]]; then # alogin* and adecflow* hosts
+    export machine="acorn"
+  else
+    export machine="wcoss2"
+  fi
 fi
 
 echo "Running Regression Tests on '$machine'";
@@ -63,7 +67,7 @@ case $machine in
     export check_resource="no"
     export accnt="ufs-ard"
   ;;
-  wcoss2)
+  wcoss2 | acorn)
     export local_or_default="${local_or_default:-/lfs/h2/emc/da/noscrub/$LOGNAME}"
     if [ -d $local_or_default ]; then
       export noscrub="$local_or_default/noscrub"
diff --git a/ush/detect_machine.sh b/ush/detect_machine.sh
index 0beb937f7e..e06775dbbe 100755
--- a/ush/detect_machine.sh
+++ b/ush/detect_machine.sh
@@ -14,8 +14,8 @@
 # First detect w/ hostname
 case $(hostname -f) in
 
-  adecflow0[12].acorn.wcoss2.ncep.noaa.gov) MACHINE_ID=acorn ;; ### acorn
-  alogin0[12].acorn.wcoss2.ncep.noaa.gov) MACHINE_ID=acorn ;; ### acorn
+  adecflow0[1-3].acorn.wcoss2.ncep.noaa.gov) MACHINE_ID=acorn ;; ### acorn
+  alogin0[1-3].acorn.wcoss2.ncep.noaa.gov) MACHINE_ID=acorn ;; ### acorn
   clogin0[1-9].cactus.wcoss2.ncep.noaa.gov) MACHINE_ID=wcoss2 ;; ### cactus01-9
   clogin10.cactus.wcoss2.ncep.noaa.gov) MACHINE_ID=wcoss2 ;; ### cactus10
   dlogin0[1-9].dogwood.wcoss2.ncep.noaa.gov) MACHINE_ID=wcoss2 ;; ### dogwood01-9
diff --git a/ush/module-setup.sh b/ush/module-setup.sh
index 299e13aa4e..41d0d7c655 100755
--- a/ush/module-setup.sh
+++ b/ush/module-setup.sh
@@ -40,6 +40,10 @@ elif [[ $MACHINE_ID = wcoss2 ]]; then
     # We are on WCOSS2
     module reset
 
+elif [[ $MACHINE_ID = acorn ]]; then
+    # We are on WCOSS2-Acorn
+    module reset
+
 elif [[ $MACHINE_ID = stampede* ]] ; then
     # We are on TACC Stampede
     if ( ! eval module help > /dev/null 2>&1 ) ; then
diff --git a/ush/sub_acorn b/ush/sub_acorn
new file mode 100755
index 0000000000..ed0f74c7cc
--- /dev/null
+++ b/ush/sub_acorn
@@ -0,0 +1,170 @@
+#!/bin/bash --login
+# Wrap an executable script in a PBS job file and submit it with qsub.
+set -x
+echo "starting sub_acorn"
+usage="\
+Usage: $0 [options] executable [args]
+  where the options are:
+  -a account        account (default: accnt from the environment)
+  -b binding        run smt binding or not (default:NO)
+  -d dirin          initial directory (default: cwd)
+  -e envars         copy comma-separated environment variables
+  -g group          group name
+  -i                append standard input to command file
+  -j jobname        specify jobname (default: executable basename)
+  -m machine        machine on which to run (default: current)
+  -n                write command file to stdout rather than submitting it
+  -o output         specify output file (default: jobname.out)
+  -p procs[/nodes[/ppreq]]
+                    number of MPI tasks and optional nodes or Bblocking and
+                    ppreq option (N or S) (defaults: serial, Bunlimited, S)
+  -q queue[/qpreq]  queue name and optional requirement, e.g. dev/P
+                    (defaults: 1 if serial or dev if parallel and none)
+                    (queue 3 or 4 is dev or prod with twice tasks over ip)
+                    (options: P=parallel, B=bigmem, b=batch)
+  -r rmem[/rcpu]    resources memory and cpus/task (default: '1024 mb', 1)
+  -t timew          wall time limit in [[hh:]mm:]ss format (default: 900)
+  -u userid         userid to run under (default: self)
+  -v                verbose mode
+  -w when           when to run, in yyyymmddhh[mm], +hh[mm], thh[mm], or
+                    Thh[mm] (full, incremental, today or tomorrow) format
+                    (default: now)
+Function: This command submits a job to the batch queue."
+subcmd="$*"
+stdin=NO
+nosub=NO
+account=""
+binding="NO"
+dirin=""
+envars=""
+group=""
+jobname=""
+machine=""
+output=""
+procs=0
+nodes=""
+ppreq=""
+queue=""
+qpreq=""
+rmem="1024"
+rcpu="1"
+timew="900"
+userid=""
+verbose=NO
+when=""
+while getopts a:b:d:e:g:ij:m:no:p:q:r:t:u:vw: opt;do
+  case $opt in
+    a) account="$OPTARG";;
+    b) binding="$OPTARG";;
+    d) dirin="$OPTARG";;
+    e) envars="$OPTARG";;
+    g) group="$OPTARG";;
+    i) stdin=YES;;
+    j) jobname=$OPTARG;;
+    m) machine="$OPTARG";;
+    n) nosub=YES;;
+    o) output=$OPTARG;;
+    p) procs=$(echo $OPTARG/|cut -d/ -f1);nodes=$(echo $OPTARG/|cut -d/ -f2);ppreq=$(echo $OPTARG/|cut -d/ -f3);;
+    q) queue=$(echo $OPTARG/|cut -d/ -f1);qpreq=$(echo $OPTARG/|cut -d/ -f2);;
+    r) rmem=$(echo $OPTARG/|cut -d/ -f1);rcpu=$(echo $OPTARG/|cut -d/ -f2);;
+    t) timew=$OPTARG;;
+    u) userid=$OPTARG;;
+    v) verbose=YES;;
+    w) when=$OPTARG;;
+    \?) echo $0: invalid option >&2;echo "$usage" >&2;exit 1;;
+  esac
+done
+shift $(($OPTIND-1))
+if [[ $# -eq 0 ]];then
+  echo $0: missing executable name >&2;echo "$usage" >&2;exit 1
+fi
+exec=$1
+if [[ ! -s $exec ]]&&which $exec >/dev/null 2>&1;then
+  exec=$(which $exec)
+fi
+shift
+args="$*"
+bn=$(basename $exec)
+export jobname=${jobname:-$bn}
+output=${output:-$jobname.out}
+myuser=$LOGNAME
+myhost=$(hostname)
+
+DATA=/lfs/h2/emc/stmp/$LOGNAME/tmp
+mkdir -p $DATA
+
+queue=${queue:-dev}
+timew=${timew:-01:20:00}
+# -p without a node count means one node; an empty value would break the
+# select statement and the ntasks arithmetic below.
+nodes=${nodes:-1}
+task_node=${task_node:-$procs}
+size=$((nodes*task_node))
+envars=$envars
+threads=${rcpu:-1}
+ncpus=$((procs*threads))
+
+export TZ=GMT
+cfile=$DATA/sub$$
+> $cfile
+echo "#!/bin/bash" >> $cfile
+echo "" >> $cfile
+echo "#PBS -o $output" >> $cfile
+echo "#PBS -N $jobname" >> $cfile
+echo "#PBS -q $queue" >> $cfile
+echo "#PBS -l walltime=$timew" >> $cfile
+echo "#PBS -l select=$nodes:mpiprocs=$procs:ompthreads=$threads:ncpus=$ncpus" >> $cfile
+echo "#PBS -l place=vscatter:exclhost" >> $cfile
+echo "#PBS -j oe" >> $cfile
+# Charge the -a account when one was given, otherwise the exported accnt.
+echo "#PBS -A ${account:-$accnt}" >> $cfile
+
+echo "" >> $cfile
+echo "export OMP_NUM_THREADS=$threads" >> $cfile
+echo "export ntasks=$(( $nodes * $procs ))" >> $cfile
+echo "export ppn=$procs" >> $cfile
+echo "export threads=$threads" >> $cfile
+echo "" >> $cfile
+echo ". "$(awk '{ print $1, $2, $3, $4, $5, $6, $7, $8, $9 }' $regdir/regression_var.out) >>$cfile
+echo "" >> $cfile
+
+echo "module reset" >> $cfile
+echo "module use $modulefiles" >> $cfile
+echo "module load gsi_acorn.intel" >> $cfile
+echo "module load envvar/1.0" >> $cfile
+echo "module load cray-pals/1.2.2" >> $cfile
+# \$line must reach the job file unexpanded; the read loop sets it there.
+echo "module -t list 2>&1 | while read line;do module show \$line 2>&1 | sed -n -e '2p';done | sort" >> $cfile
+echo "module avail" >> $cfile
+
+echo "" >> $cfile
+
+cat $exec >> $cfile
+
+# Append stdin before the -n/-v dump so the dump shows the complete file.
+if [[ $stdin = YES ]];then
+  cat
+fi >>$cfile
+if [[ $nosub = YES ]];then
+  cat $cfile
+  exit
+elif [[ $verbose = YES ]];then
+  set -x
+  cat $cfile
+fi
+qsub=${qsub:-qsub}
+
+ofile=$DATA/subout$$
+>$ofile
+chmod 777 $ofile
+$qsub -V $cfile >$ofile
+rc=$?
+cat $ofile
+if [[ -w $SUBLOG ]];then
+  jobn=$(grep -i submitted $ofile|head -n1|cut -d\" -f2)
+  date -u +"%Y%m%d%H%M%S : $subcmd : $jobn" >>$SUBLOG
+fi
+##rm $cfile $ofile
+##[[ $MKDATA = YES ]] && rmdir $DATA
+echo "ending sub_acorn"
+exit $rc