Skip to content

Commit

Permalink
Add initial Hercules support. NOAA-EMC#574
Browse files Browse the repository at this point in the history
  • Loading branch information
David Huber committed Nov 9, 2023
1 parent 9596bd6 commit 21a9a88
Show file tree
Hide file tree
Showing 7 changed files with 251 additions and 8 deletions.
26 changes: 26 additions & 0 deletions modulefiles/gsi_hercules.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
help([[
GSI environment on Hercules with Intel compilers.
Loads the spack-stack 1.5.1 "gsi-addon" Intel/Intel-MPI stack, Python, CMake,
the common GSI modules (gsi_common), prod_util and Intel oneAPI MKL.
]])

-- Make the spack-stack "gsi-addon" Core modulefiles visible to Lmod before
-- any of the loads below are resolved.
prepend_path("MODULEPATH", "/work/noaa/epic/role-epic/spack-stack/hercules/spack-stack-1.5.1/envs/gsi-addon/install/modulefiles/Core")

-- Module versions. Each one may be overridden by exporting an environment
-- variable of the same name before loading this module; otherwise the
-- default on the right-hand side is used.
local stack_python_ver=os.getenv("stack_python_ver") or "3.10.8"
local stack_intel_ver=os.getenv("stack_intel_ver") or "2021.9.0"
local stack_impi_ver=os.getenv("stack_impi_ver") or "2021.9.0"
local cmake_ver=os.getenv("cmake_ver") or "3.23.1"
local prod_util_ver=os.getenv("prod_util_ver") or "1.2.2"

-- Compiler and MPI stack first, then the tools that are built against them.
load(pathJoin("stack-intel", stack_intel_ver))
load(pathJoin("stack-intel-oneapi-mpi", stack_impi_ver))
load(pathJoin("python", stack_python_ver))
load(pathJoin("cmake", cmake_ver))

load("gsi_common")
load(pathJoin("prod_util", prod_util_ver))
-- NOTE(review): unlike the versions above, the MKL version is pinned here and
-- cannot be overridden from the environment.
load("intel-oneapi-mkl/2022.2.1")

-- Compiler flags exported for the C and Fortran builds.
pushenv("CFLAGS", "-xHOST")
pushenv("FFLAGS", "-xHOST")

-- Dated GSI fix directory on Hercules (per the path).
pushenv("GSI_BINARY_SOURCE_DIR", "/work/noaa/global/glopara/fix/gsi/20230911")

whatis("Description: GSI environment on Hercules with Intel Compilers")
33 changes: 31 additions & 2 deletions regression/regression_param.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,15 @@ case $machine in
memnode=96
numcore=40
;;
Orion)
Orion)
sub_cmd="sub_orion"
memnode=192
numcore=40
;;
Hercules)
sub_cmd="sub_hercules"
memnode=512
numcore=40
;;
Jet)
sub_cmd="sub_jet"
Expand Down Expand Up @@ -59,6 +64,9 @@ case $regtest in
elif [[ "$machine" = "Orion" ]]; then
topts[1]="0:15:00" ; popts[1]="12/5/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="12/9/" ; ropts[2]="/2"
elif [[ "$machine" = "Hercules" ]]; then
topts[1]="0:15:00" ; popts[1]="12/5/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="12/9/" ; ropts[2]="/2"
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="12/5/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="12/9/" ; ropts[2]="/2"
Expand Down Expand Up @@ -92,6 +100,9 @@ case $regtest in
elif [[ "$machine" = "Orion" ]]; then
topts[1]="0:15:00" ; popts[1]="12/3/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="12/5/" ; ropts[2]="/2"
elif [[ "$machine" = "Hercules" ]]; then
topts[1]="0:15:00" ; popts[1]="12/3/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="12/5/" ; ropts[2]="/2"
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="12/3/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="12/5/" ; ropts[2]="/2"
Expand Down Expand Up @@ -137,6 +148,9 @@ case $regtest in
elif [[ "$machine" = "Orion" ]]; then
topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/12/" ; ropts[2]="/2"
elif [[ "$machine" = "Hercules" ]]; then
topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/12/" ; ropts[2]="/2"
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/10/" ; ropts[2]="/2"
Expand Down Expand Up @@ -170,6 +184,9 @@ case $regtest in
elif [[ "$machine" = "Orion" ]]; then
topts[1]="0:15:00" ; popts[1]="20/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="20/2/" ; ropts[2]="/1"
elif [[ "$machine" = "Hercules" ]]; then
topts[1]="0:15:00" ; popts[1]="20/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="20/2/" ; ropts[2]="/1"
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="20/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="20/2/" ; ropts[2]="/1"
Expand Down Expand Up @@ -200,6 +217,9 @@ case $regtest in
elif [[ "$machine" = "Orion" ]]; then
topts[1]="0:15:00" ; popts[1]="4/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="6/6/" ; ropts[2]="/1"
elif [[ "$machine" = "Hercules" ]]; then
topts[1]="0:15:00" ; popts[1]="4/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="6/6/" ; ropts[2]="/1"
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="4/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="6/6/" ; ropts[2]="/1"
Expand Down Expand Up @@ -230,6 +250,9 @@ case $regtest in
elif [[ "$machine" = "Orion" ]]; then
topts[1]="0:30:00" ; popts[1]="6/12/" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="8/12/" ; ropts[2]="/1"
elif [[ "$machine" = "Hercules" ]]; then
topts[1]="0:30:00" ; popts[1]="6/12/" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="8/12/" ; ropts[2]="/1"
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:30:00" ; popts[1]="6/12/" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="8/12/" ; ropts[2]="/1"
Expand Down Expand Up @@ -290,6 +313,9 @@ case $regtest in
elif [[ "$machine" = "Orion" ]]; then
topts[1]="0:10:00" ; popts[1]="12/3/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/5/" ; ropts[2]="/2"
elif [[ "$machine" = "Hercules" ]]; then
topts[1]="0:10:00" ; popts[1]="12/3/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/5/" ; ropts[2]="/2"
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:10:00" ; popts[1]="12/3/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/5/" ; ropts[2]="/2"
Expand Down Expand Up @@ -349,7 +375,10 @@ if [[ "$machine" = "Hera" ]]; then
export APRUN="srun"
elif [[ "$machine" = "Orion" ]]; then
export OMP_STACKSIZE=2048M
export APRUN="srun -n \$ntasks"
export APRUN="srun -n \$ntasks --cpus-per-task=\$threads"
elif [[ "$machine" = "Hercules" ]]; then
export OMP_STACKSIZE=2048M
export APRUN="srun -n \$ntasks --cpus-per-task=\$threads"
elif [[ "$machine" = "Jet" ]]; then
export OMP_STACKSIZE=1024M
export MPI_BUFS_PER_PROC=256
Expand Down
19 changes: 14 additions & 5 deletions regression/regression_var.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,10 @@ elif [[ -d /sw/gaea ]]; then # Gaea
export machine="Gaea"
elif [[ -d /data/prod ]]; then # S4
export machine="S4"
elif [[ -d /work ]]; then # Orion
elif [[ -d /work && $(hostname -d) =~ "Orion" ]]; then # Orion
export machine="Orion"
elif [[ -d /work && $(hostname -d) =~ "Hercules" ]]; then # Hercules
export machine="Hercules"
elif [[ -d /lfs/h2 ]]; then # wcoss2
export machine="wcoss2"
fi
Expand Down Expand Up @@ -98,18 +100,25 @@ case $machine in
export check_resource="no"
export accnt="${accnt:-GFS-DEV}"
;;
Orion)
Orion | Hercules)
export local_or_default="${local_or_default:-/work/noaa/da/$LOGNAME}"
if [ -d $local_or_default ]; then
export noscrub="$local_or_default/noscrub"
export noscrub="$local_or_default/noscrub"
elif [ -d /work/noaa/global/$LOGNAME ]; then
export noscrub="/work/noaa/global/$LOGNAME/noscrub"
export noscrub="/work/noaa/global/$LOGNAME/noscrub"
fi

export queue="${queue:-batch}"

if [[ "${machine}" == "Orion" ]]; then
export partition="${partition:-orion}"
else
export partition="${partition:-hercules}"
fi

export group="${group:-global}"
if [[ "$cmaketest" = "false" ]]; then
export basedir="/work/noaa/da/$LOGNAME/gsi"
export basedir="/work/noaa/da/$LOGNAME/gsi"
fi
export ptmp="${ptmp:-/work/noaa/stmp/$LOGNAME/$ptmpName}"

Expand Down
2 changes: 1 addition & 1 deletion ush/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ module list
set -x

# Set CONTROLPATH variable to user develop installation
CONTROLPATH="$DIR_ROOT/../develop/install/bin"
CONTROLPATH="$DIR_ROOT/install/bin"
# Collect BUILD Options
CMAKE_OPTS+=" -DCMAKE_BUILD_TYPE=$BUILD_TYPE"

Expand Down
2 changes: 2 additions & 0 deletions ush/detect_machine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ case $(hostname -f) in

Orion-login-[1-4].HPC.MsState.Edu) MACHINE_ID=orion ;; ### orion1-4

Hercules-login-[1-4].HPC.MsState.Edu) MACHINE_ID=hercules ;; ### hercules1-4

cheyenne[1-6].cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne1-6
cheyenne[1-6].ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne1-6
chadmin[1-6].ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne1-6
Expand Down
7 changes: 7 additions & 0 deletions ush/module-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ elif [[ $MACHINE_ID = hera* ]] ; then
fi
module purge

elif [[ $MACHINE_ID = hercules* ]] ; then
# We are on Hercules
if ( ! eval module help > /dev/null 2>&1 ) ; then
source /apps/lmod/lmod/init/bash
fi
module purge

elif [[ $MACHINE_ID = orion* ]] ; then
# We are on Orion
if ( ! eval module help > /dev/null 2>&1 ) ; then
Expand Down
170 changes: 170 additions & 0 deletions ush/sub_hercules
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#!/bin/sh --login
# sub_hercules: wrap an executable (job script) in a Slurm command file and
# submit it with sbatch. This first part defines the usage text, the option
# defaults and the command-line option parser.
set -x
usage="\
Usage: $0 [options] executable [args]
where the options are:
-a account account (default: none)
-b binding run smt binding or not (default:NO)
-d dirin initial directory (default: cwd)
-e envars copy comma-separated environment variables
-g group group name
-i append standard input to command file
-j jobname specify jobname (default: executable basename)
-m machine machine on which to run (default: current)
-n write command file to stdout rather than submitting it
-o output specify output file (default: jobname.out)
-p procs[/nodes[/ppreq]
number of MPI tasks and optional nodes or Bblocking and
ppreq option (N or S) (defaults: serial, Bunlimited, S)
-q queue[/qpreq] queue name and optional requirement, e.g. dev/P
(defaults: 1 if serial or dev if parallel and none)
(queue 3 or 4 is dev or prod with twice tasks over ip)
(options: P=parallel, B=bigmem, b=batch)
-r rmem[/rcpu] resources memory and cpus/task (default: '1024 mb', 1)
-t timew wall time limit in [[hh:]mm:]ss format (default: 900)
-u userid userid to run under (default: self)
-v verbose mode
-w when when to run, in yyyymmddhh[mm], +hh[mm], thh[mm], or
Thh[mm] (full, incremental, today or tomorrow) format
(default: now)
Function: This command submits a job to the batch queue."
# Keep the full original command line; it is written to $SUBLOG after submission.
subcmd="$*"
# Option defaults. NOTE(review): binding, dirin, group, machine, userid and
# when are parsed below but are not referenced again in this script.
stdin=NO
nosub=NO
account=""
binding="NO"
dirin=""
envars=""
group=""
jobname=""
machine=""
output=""
procs=0
nodes=""
ppreq=""
queue=""
qpreq=""
rmem="1024"
rcpu="1"
timew="900"
userid=""
verbose=NO
when=""
# Parse the options. Options followed by ':' in the getopts string take an
# argument; -i, -n and -v are plain flags.
while getopts a:b:d:e:g:ij:m:no:p:q:r:t:u:vw: opt;do
case $opt in
a) account="$OPTARG";;
b) binding="$OPTARG";;
d) dirin="$OPTARG";;
e) envars="$OPTARG";;
g) group="$OPTARG";;
i) stdin=YES;;
j) jobname=$OPTARG;;
m) machine="$OPTARG";;
n) nosub=YES;;
o) output=$OPTARG;;
# -p, -q and -r take "/"-separated fields. A trailing "/" is appended before
# cutting so that a missing later field comes back as an empty string.
p) procs=$(echo $OPTARG/|cut -d/ -f1);nodes=$(echo $OPTARG/|cut -d/ -f2);ppreq=$(echo $OPTARG/|cut -d/ -f3);;
q) queue=$(echo $OPTARG/|cut -d/ -f1);qpreq=$(echo $OPTARG/|cut -d/ -f2);;
r) rmem=$(echo $OPTARG/|cut -d/ -f1);rcpu=$(echo $OPTARG/|cut -d/ -f2);;
t) timew=$OPTARG;;
u) userid=$OPTARG;;
v) verbose=YES;;
w) when=$OPTARG;;
# Unknown option: report it, print the usage text and stop.
\?) echo $0: invalid option >&2;echo "$usage" >&2;exit 1;;
esac
done
# Drop the parsed options; what remains is the executable and its arguments.
shift $(($OPTIND-1))
if [[ $# -eq 0 ]];then
  echo $0: missing executable name >&2;echo "$usage" >&2;exit 1
fi
# If the name is not a non-empty file but can be found on PATH, use the
# PATH location instead.
exec=$1
if [[ ! -s $exec ]]&&which $exec >/dev/null 2>&1;then
  exec=$(which $exec)
fi
shift
args="$*"
bn=$(basename $exec)
export jobname=${jobname:-$bn}
output=${output:-$jobname.out}
myuser=$LOGNAME
myhost=$(hostname)
exp=${jobname}

# Directory that receives the generated command file and the sbatch output.
DATA=${ptmp:-/work/noaa/da/stmp/$LOGNAME/tmp}
mkdir -p $DATA

# Scheduler settings. The partition and account normally come from the
# caller's environment ($partition, $accnt); fall back to the Hercules
# partition, and let an explicit "-a account" take precedence over $accnt.
partition=${partition:-hercules}
account=${account:-$accnt}
queue=${queue:-batch}
timew=${timew:-01:20:00}
# "-p procs" without a node count leaves $nodes empty, and no -p at all leaves
# procs=0. Default to one node / one task so the arithmetic below and the
# #SBATCH resource line stay valid (the documented default is a serial job).
nodes=${nodes:-1}
if [[ -z $procs || $procs = 0 ]];then
  procs=1
fi
task_node=${task_node:-$procs}
size=$((nodes*task_node))
threads=${rcpu:-1}

export TZ=GMT
cfile=$DATA/sub$$
# Write the command file: Slurm directives, run-time environment, module
# setup, and finally the contents of the executable itself.
{
  echo "#!/bin/sh --login"
  echo ""
  echo "#SBATCH --output=$output"
  echo "#SBATCH --job-name=$jobname"
  echo "#SBATCH --qos=$queue"
  echo "#SBATCH --partition=$partition"
  # NOTE(review): $timew is passed through unchanged. Slurm reads a bare
  # number as minutes, while the usage text describes the default (900) in
  # seconds -- confirm which is intended.
  echo "#SBATCH --time=$timew"
  echo "#SBATCH --nodes=$nodes --ntasks-per-node=$procs --cpus-per-task=$threads"
  echo "#SBATCH --account=$account"

  echo ""
  echo "export ntasks=$(( nodes * procs ))"
  echo "export ppn=$procs"
  echo "export threads=$threads"
  echo "export OMP_NUM_THREADS=$threads"
  # OMP_STACKSIZE is deliberately not set here (presumably it is inherited
  # from the submitting environment -- confirm).
  echo "ulimit -s unlimited"

  echo ""
  # Source the command recorded in regression_var.out (its first nine fields)
  # at the start of the job.
  echo ". "$(awk '{ print $1, $2, $3, $4, $5, $6, $7, $8, $9 }' $regdir/regression_var.out)
  echo ""

  echo ". /apps/lmod/lmod/init/sh"
  echo "module purge"
  echo "module use $modulefiles"
  echo "module load gsi_hercules"
  echo "module list"
  echo ""
  cat $exec
} > $cfile

# -i: append the caller's standard input to the command file. This must
# happen before the command file is printed or submitted so that -n and -v
# show exactly what would be (or is) submitted.
if [[ $stdin = YES ]];then
  cat
fi >>$cfile

# -n: print the finished command file and stop without submitting.
# -v: print the command file, then go on to submit it.
if [[ $nosub = YES ]];then
  cat $cfile
  exit
elif [[ $verbose = YES ]];then
  set -x
  cat $cfile
fi

# The submit command can be overridden through $sbatch.
sbatch=${sbatch:-sbatch}

# Submit, capturing sbatch's stdout so it can be echoed and logged.
ofile=$DATA/subout$$
>$ofile
chmod 777 $ofile
$sbatch --export=ALL $cfile >$ofile
rc=$?
cat $ofile
# When $SUBLOG names a writable file, record the submission there.
if [[ -w $SUBLOG ]];then
  jobn=$(grep -i submitted $ofile|head -n1|cut -d\" -f2)
  date -u +"%Y%m%d%H%M%S : $subcmd : $jobn" >>$SUBLOG
fi
#rm $cfile $ofile
#[[ $MKDATA = YES ]] && rmdir $DATA
# Propagate sbatch's exit status to the caller.
exit $rc

0 comments on commit 21a9a88

Please sign in to comment.