Patch summary
  Makes the xhplhip build (CMakeLists.txt) and the launch scripts honor
  ROCM_PATH and BLAS_DIR instead of fixed /opt/rocm paths, adds the
  scripts/env.mun.sh environment file, retargets scripts/run_hpl.slurm,
  switches PMAP to 1 in the HPL_*GPU.dat configs, and changes the panel
  padding factor and timer threshold in the HIP backend.

Review fixes applied to the added lines
  * CMakeLists.txt: ROCM_PATH expansions are quoted; <ROCM_PATH>/lib/cmake/hip
    (the directory the removed include() loaded FindHIP.cmake from) stays on
    CMAKE_MODULE_PATH next to hip/cmake; the BLAS fallback message reports the
    BLAS_DIR that was searched instead of printing the NOTFOUND sentinel.
  * scripts/mpirun_xhplhip.sh: rocm_dir already ends in /lib, so the exported
    LD_LIBRARY_PATH no longer appends a second /lib, and the nested double
    quotes inside the mpi_args string are gone.
  * testing/backend/HPL_backendHIP.cpp: HPL_ptimer keeps its integer timer
    index 0 rather than the floating-point constant HPL_rzero.

Notes for whoever applies this
  * NOTE(review): line breaks and indentation were rebuilt from the hunk
    counts; column alignment inside context lines may differ from the tree.
  * NOTE(review): the "$ $" arguments in the target_include_directories
    context look like stripped generator expressions - confirm against the
    real CMakeLists.txt.
  * The "index" blob hashes were left as found and do not reflect the fixes.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 84b31a6..f594214 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,10 +20,18 @@ EXECUTE_PROCESS(
 find_package(OpenMP)
 
 # Add rocM root dir to CMAKE_PREFIX_PATH, usually /opt/rocm
-list(APPEND CMAKE_PREFIX_PATH "/opt/rocm")
-include(/opt/rocm/lib/cmake/hip/FindHIP.cmake)
-include(/opt/rocm/share/rocm/cmake/ROCMCheckTargetIds.cmake)
-find_package( hip REQUIRED )
+if(NOT DEFINED ROCM_PATH)
+  if(DEFINED ENV{ROCM_PATH})
+    set(ROCM_PATH "$ENV{ROCM_PATH}")
+  else()
+    set(ROCM_PATH "/opt/rocm")
+  endif()
+endif()
+list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}")
+set(CMAKE_MODULE_PATH "${ROCM_PATH}/lib/cmake/hip" "${ROCM_PATH}/hip/cmake" ${CMAKE_MODULE_PATH})
+set(CMAKE_MODULE_PATH "${ROCM_PATH}/share/rocm/cmake" ${CMAKE_MODULE_PATH})
+include(ROCMCheckTargetIds)
+find_package( HIP REQUIRED )
 find_package( rocblas REQUIRED )
 
 # switch compiler and linker on non-Windows
@@ -83,15 +91,6 @@ else ()
   endif ()
 endif ()
 
-# find_library(BLAS_LIBRARIES NAMES blis
-#              PATHS ${BLAS_DIR}
-#              NO_DEFAULT_PATH)
-# if (BLAS_LIBRARIES)
-#   message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
-# else()
-#   find_package(BLAS REQUIRED)
-# endif()
-
 if(NOT DEFINED BLAS_DIR)
   if(DEFINED ENV{BLAS_DIR})
     set(BLAS_DIR $ENV{BLAS_DIR})
@@ -100,7 +99,18 @@ if(NOT DEFINED BLAS_DIR)
 else()
   list(APPEND CMAKE_PREFIX_PATH ${BLAS_DIR})
 endif()
-find_package( BLAS REQUIRED )
+
+find_library(BLAS_LIBRARIES NAMES blis openblas
+             PATHS ${BLAS_DIR}
+             HINTS ${BLAS_DIR}/lib/zen3 ${BLAS_DIR}/lib
+             NO_DEFAULT_PATH)
+if (BLAS_LIBRARIES)
+  message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
+else()
+  message(STATUS "BLAS not found under BLAS_DIR='${BLAS_DIR}'; falling back to find_package(BLAS)")
+  find_package(BLAS REQUIRED)
+endif()
+# find_package( BLAS REQUIRED )
 
 # append math library, if found
 find_library(MATH_LIBRARY m)
@@ -217,7 +227,6 @@ target_include_directories( xhplhip PUBLIC hip:device
   $
   $
 )
-# target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX /global/home/lulu/mun-node-3/blis-multi-thread/lib/zen3/libblis.so)
 target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX)
 
 configure_file( include/hplhip_config.hin ${CMAKE_CURRENT_SOURCE_DIR}/include/hplhip_config.h @ONLY NEWLINE_STYLE LF )
diff --git a/scripts/config/HPL_16GPU.dat b/scripts/config/HPL_16GPU.dat
index 679c241..665aa5e 100644
--- a/scripts/config/HPL_16GPU.dat
+++ b/scripts/config/HPL_16GPU.dat
@@ -6,7 +6,7 @@ HPL.out output file name (if any)
 364032 N
 1 # of NBs
 384 NBs
-0 PMAP process mapping (0=Row-,1=Column-major)
+1 PMAP process mapping (0=Row-,1=Column-major)
 1 # of process grids (P x Q)
 4 Ps
 4 Qs
diff --git a/scripts/config/HPL_1GPU.dat b/scripts/config/HPL_1GPU.dat
index ee8e10a..3413c87 100644
--- a/scripts/config/HPL_1GPU.dat
+++ b/scripts/config/HPL_1GPU.dat
@@ -6,7 +6,7 @@ HPL.out output file name (if any)
 91008 N
 1 # of NBs
 384 NBs
-0 PMAP process mapping (0=Row-,1=Column-major)
+1 PMAP process mapping (0=Row-,1=Column-major)
 1 # of process grids (P x Q)
 1 Ps
 1 Qs
diff --git a/scripts/config/HPL_2GPU.dat b/scripts/config/HPL_2GPU.dat
index 4897416..0362234 100644
--- a/scripts/config/HPL_2GPU.dat
+++ b/scripts/config/HPL_2GPU.dat
@@ -6,7 +6,7 @@ HPL.out output file name (if any)
 128256 N
 1 # of NBs
 384 NBs
-0 PMAP process mapping (0=Row-,1=Column-major)
+1 PMAP process mapping (0=Row-,1=Column-major)
 1 # of process grids (P x Q)
 2 Ps
 1 Qs
diff --git a/scripts/config/HPL_32GPU.dat b/scripts/config/HPL_32GPU.dat
index cf14244..43fb13a 100644
--- a/scripts/config/HPL_32GPU.dat
+++ b/scripts/config/HPL_32GPU.dat
@@ -6,7 +6,7 @@ HPL.out output file name (if any)
 513024 N
 1 # of NBs
 384 NBs
-0 PMAP process mapping (0=Row-,1=Column-major)
+1 PMAP process mapping (0=Row-,1=Column-major)
 1 # of process grids (P x Q)
 8 Ps
 4 Qs
diff --git a/scripts/config/HPL_4GPU.dat b/scripts/config/HPL_4GPU.dat
index 6eb090a..e336bbd 100644
--- a/scripts/config/HPL_4GPU.dat
+++ b/scripts/config/HPL_4GPU.dat
@@ -6,7 +6,7 @@ HPL.out output file name (if any)
 180864 N
 1 # of NBs
 384 NBs
-0 PMAP process mapping (0=Row-,1=Column-major)
+1 PMAP process mapping (0=Row-,1=Column-major)
 1 # of process grids (P x Q)
 2 Ps
 2 Qs
diff --git a/scripts/config/HPL_8GPU.dat b/scripts/config/HPL_8GPU.dat
index 575e1a9..3d3c3c6 100644
--- a/scripts/config/HPL_8GPU.dat
+++ b/scripts/config/HPL_8GPU.dat
@@ -6,7 +6,7 @@ HPL.out output file name (if any)
 256512 N
 1 # of NBs
 384 NBs
-0 PMAP process mapping (0=Row-,1=Column-major)
+1 PMAP process mapping (0=Row-,1=Column-major)
 1 # of process grids (P x Q)
 4 Ps
 2 Qs
@@ -19,7 +19,7 @@ HPL.out output file name (if any)
 2 NDIVs
 1 # of recursive panel fact.
 2 RFACTs (0=left, 1=Crout, 2=Right)
-8 # of broadcast
+1 # of broadcast
 6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast,7=BiDir)
 1 # of lookahead depth
 1 DEPTHs (>=0)
diff --git a/scripts/env.mun.sh b/scripts/env.mun.sh
new file mode 100755
index 0000000..5ef304e
--- /dev/null
+++ b/scripts/env.mun.sh
@@ -0,0 +1,7 @@
+module reset
+
+module load rocm/5.3.0-10584
+
+# export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/global/software/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/openblas-0.3.20-qbm5uv3ntjerkx4jzrprmelytviwoq2e/lib:/global/software/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/openmpi-4.1.4-3z7jsddbvczl4duixalzrtap3q5nuvjk/lib"
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/global/home/lulu/blis/lib:/global/home/lulu/ompi/lib:/global/software_internal/rocm/rocm-5.3.0-10584/lib"
+# export MPICH_GPU_SUPPORT_ENABLED=1
diff --git a/scripts/mpirun_xhplhip.sh b/scripts/mpirun_xhplhip.sh
index 617143a..8e0fc76 100644
--- a/scripts/mpirun_xhplhip.sh
+++ b/scripts/mpirun_xhplhip.sh
@@ -7,6 +7,9 @@ mpi_bin=${mpi_dir}/bin/mpiexec
 mpi_lib=${mpi_dir}/lib
 hpl_runscript=./run_xhplhip.sh
 
+if [ -z "${ROCM_PATH}" ]; then rocm_dir="/opt/rocm/lib";
+else rocm_dir="${ROCM_PATH}/lib"; fi
+
 filename=HPL.dat
 
 P=$(sed -n "11, 1p" ${filename} | awk '{print $1}')
@@ -18,9 +21,9 @@ num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
 total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
 
 export LD_LIBRARY_PATH=${mpi_lib}:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH="${rocm_dir}":$LD_LIBRARY_PATH
 
 #Default MPI options
-mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib ${mpi_args}"
+mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH=${rocm_dir}:$LD_LIBRARY_PATH ${mpi_args}"
 ${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript}
 # ${mpi_bin} --hostfile hostfile --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript}
diff --git a/scripts/run_hpl.slurm b/scripts/run_hpl.slurm
index 381779b..7e8f995 100755
--- a/scripts/run_hpl.slurm
+++ b/scripts/run_hpl.slurm
@@ -5,14 +5,16 @@
 #SBATCH -c 8
 #SBATCH -t 2:00:00
 ##SBATCH -A VEN114
-#SBATCH -A project_462000075
 #SBATCH -J xhplhip
 #SBATCH --gpu-bind=closest
 #SBATCH --ntasks-per-node=8
 #SBATCH --gpus-per-node=8
+#SBATCH -p MI250-x4-IB
+#SBATCH -w mun-node-4
+#SBATCH --exclusive
 
 #source ../env/env.crusher.sh
-source ../env/env.lumi.sh
+source env.mun.sh
 
 NP=$SLURM_NPROCS
 NODES=$SLURM_NNODES
@@ -21,20 +23,24 @@ LOG=log.hpl-gpu-${NP}np-${HOSTNAME}-${DATE}.txt
 
 cp config/HPL_${NP}GPU.dat HPL.dat
 
-EXE="../build/xhplhip"
-CMD=""
-CMD+="srun "
-CMD+="-v "
-CMD+="-n $NP "
-CMD+="-N $NODES "
-CMD+="-A VEN114 "
-CMD+="--gpu-bind=closest "
-CMD+="--ntasks-per-node=8 "
-CMD+="--gpus-per-node=8 "
-CMD+="-c 8 "
-CMD+="-o $LOG -e $LOG "
-#CMD+="${HOME}/mpich_bind.sh "
-CMD+="$EXE"
+# EXE="../build/xhplhip"
+# CMD=""
+# CMD+="srun "
+# CMD+="-v "
+# CMD+="-n $NP "
+# CMD+="-N $NODES "
+# CMD+="-A VEN114 "
+# CMD+="--gpu-bind=closest "
+# CMD+="--ntasks-per-node=8 "
+# CMD+="--gpus-per-node=8 "
+# CMD+="-c 8 "
+# CMD+="-o $LOG -e $LOG "
+# #CMD+="${HOME}/mpich_bind.sh "
+# CMD+="$EXE"
+
+sh mpirun_xhplhip.sh
+
+
 
 if [ $NODES -gt 8 ]; then
   echo "export FI_MR_CACHE_MAX_COUNT=0"
diff --git a/scripts/run_xhplhip.sh b/scripts/run_xhplhip.sh
index ea4d314..c48d2f8 100755
--- a/scripts/run_xhplhip.sh
+++ b/scripts/run_xhplhip.sh
@@ -2,8 +2,9 @@
 # set -x #echo on
 
 hpl_bin=./xhplhip
-rocblas_dir=/opt/rocm-5.2.0/lib
-blas_dir=/global/home/lulu/hyc/rocHPL-main/tpl/blis/lib
+if [ -z "${ROCM_PATH}" ]; then rocblas_dir="/opt/rocm/lib"
+else rocblas_dir="${ROCM_PATH}/lib"; fi
+blas_dir="${BLAS_DIR}/lib/zen3"
 
 filename=./HPL.dat
 p=-1
@@ -229,4 +230,4 @@ if [[ $globalRank -lt $size ]]; then
   echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] CPU Cores: $omp_num_threads - $places"
 fi
 #run
-${hpl_bin}
+${hpl_bin} -p ${p} -q ${q}
diff --git a/src/grid/HPL_grid_init.cpp b/src/grid/HPL_grid_init.cpp
index 2130c0a..7c20055 100644
--- a/src/grid/HPL_grid_init.cpp
+++ b/src/grid/HPL_grid_init.cpp
@@ -140,7 +140,7 @@ int HPL_grid_init
       local_mycol = local_rank % q; local_myrow = local_rank / q;
       int noderow = node / (NPCOL / q); int nodecol = node % (NPCOL / q);
       myrow = noderow * p + local_myrow; mycol = nodecol * q + local_mycol;
-      myrow = rank / NPCOL; mycol = rank - myrow * NPCOL;
+      // myrow = rank / NPCOL; mycol = rank - myrow * NPCOL;
    }
    else
    {
diff --git a/testing/backend/HPL_backendHIP.cpp b/testing/backend/HPL_backendHIP.cpp
index 36b07ee..b425d75 100644
--- a/testing/backend/HPL_backendHIP.cpp
+++ b/testing/backend/HPL_backendHIP.cpp
@@ -35,6 +35,7 @@ void HIP::init(const HPL_T_grid* GRID)
     HIP_CHECK_ERROR(hipGetDeviceCount(&count));
     //TODO: set dynamic device id
     int device_id = localRank % count;
+    // printf("host: %s, device id: %d, myrow: %d, mycol: %d\n", host_name, device_id, GRID->local_myrow, GRID->local_mycol);
     HIP_CHECK_ERROR(hipSetDevice(device_id));
 
     // Get device properties
@@ -1653,7 +1654,7 @@ void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int
     */
     PANEL->ldl2 = 0; /* local leading dim of array L2 */
     PANEL->dldl2 = 0; /* local leading dim of array L2 */
-    PANEL->dldl1 = 1.02 * A->dN; // padding
+    PANEL->dldl1 = 1.015 * A->dN; // padding to avoid stack overflow
     PANEL->len = 0; /* length of the buffer to broadcast */
     PANEL->nu0 = 0;
     PANEL->nu1 = 0;
@@ -1666,9 +1667,10 @@ void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int
     /*Split fraction*/
     const double fraction = 0.6;
 
-    if ((double)M / A->dN > 0.97) {
+    // get the panel init time
+    if ((double)M / A->dN > 0.985) {
         HPL_ptimer_boot();
         HPL_ptimer( 0 );
     }
     dalign = ALGO->align * sizeof(double);
     size_t lpiv = (5 * JB * sizeof(int) + sizeof(double) - 1) / (sizeof(double));
diff --git a/testing/ptest/HPL_pddriver.cpp b/testing/ptest/HPL_pddriver.cpp
index f942e8e..3ddc5c0 100644
--- a/testing/ptest/HPL_pddriver.cpp
+++ b/testing/ptest/HPL_pddriver.cpp
@@ -220,7 +220,7 @@ int main( ARGC, ARGV )
 
          algo.fswap = fswap; algo.fsthr = tswap;
          algo.equil = equil; algo.align = align;
-
+
          HPL_pdtest( &test, &grid, &algo, nval[in], nbval[inb] );
 
         }
diff --git a/testing/ptest/HPL_pdtest.cpp b/testing/ptest/HPL_pdtest.cpp
index afea16a..e25ac5f 100644
--- a/testing/ptest/HPL_pdtest.cpp
+++ b/testing/ptest/HPL_pdtest.cpp
@@ -230,6 +230,7 @@ void HPL_pdtest
    (void) vsip_blockrelease_d( mat.block, VSIP_TRUE );
    vsip_blockdestroy_d( mat.block );
 #endif
+
 /*
  * Gather max of all CPU and WALL clock timings and print timing results
  */
@@ -510,7 +511,6 @@ void HPL_pdtest
          HPL_fprintf( TEST->outfp, "%s%16.8e%s%s\n",
          "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ", resid1,
          " ...... ", ( resid1 < TEST->thrsh ? "PASSED" : "FAILED" ) );
-
       if( resid1 >= TEST->thrsh )
       {
          HPL_fprintf( TEST->outfp, "%s%18.6f\n",