Skip to content

Commit

Permalink
change the process mapping to column major
Browse files Browse the repository at this point in the history
  • Loading branch information
Redtorm committed Sep 12, 2022
1 parent 4854af4 commit 6d0d889
Show file tree
Hide file tree
Showing 15 changed files with 77 additions and 49 deletions.
39 changes: 24 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,18 @@ EXECUTE_PROCESS(
find_package(OpenMP)

# Add the ROCm root dir to CMAKE_PREFIX_PATH, usually /opt/rocm
list(APPEND CMAKE_PREFIX_PATH "/opt/rocm")
include(/opt/rocm/lib/cmake/hip/FindHIP.cmake)
include(/opt/rocm/share/rocm/cmake/ROCMCheckTargetIds.cmake)
find_package( hip REQUIRED )
# Resolve the ROCm installation root.
# Precedence: -DROCM_PATH=... on the command line, then a non-empty ROCM_PATH
# environment variable, then the conventional /opt/rocm default.
# An empty environment value is treated as "not set" so that the derived
# paths below never collapse to root-relative paths such as "/hip/cmake".
if(NOT DEFINED ROCM_PATH)
  if(DEFINED ENV{ROCM_PATH} AND NOT "$ENV{ROCM_PATH}" STREQUAL "")
    set(ROCM_PATH "$ENV{ROCM_PATH}")
  else()
    set(ROCM_PATH "/opt/rocm")
  endif()
endif()

# Let find_package() config-mode lookups search the ROCm prefix.
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}")

# Directories searched for CMake modules (ROCMCheckTargetIds, FindHIP), placed
# ahead of any pre-existing entries. lib/cmake/hip is kept as a last fallback
# for installs that only ship FindHIP.cmake there.
set(CMAKE_MODULE_PATH
  "${ROCM_PATH}/share/rocm/cmake"
  "${ROCM_PATH}/hip/cmake"
  "${ROCM_PATH}/lib/cmake/hip"
  ${CMAKE_MODULE_PATH})

include(ROCMCheckTargetIds)
find_package(HIP REQUIRED)
find_package( rocblas REQUIRED )

# switch compiler and linker on non-Windows
Expand Down Expand Up @@ -83,15 +91,6 @@ else ()
endif ()
endif ()

# find_library(BLAS_LIBRARIES NAMES blis
# PATHS ${BLAS_DIR}
# NO_DEFAULT_PATH)
# if (BLAS_LIBRARIES)
# message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
# else()
# find_package(BLAS REQUIRED)
# endif()

if(NOT DEFINED BLAS_DIR)
if(DEFINED ENV{BLAS_DIR})
set(BLAS_DIR $ENV{BLAS_DIR})
Expand All @@ -100,7 +99,18 @@ if(NOT DEFINED BLAS_DIR)
else()
list(APPEND CMAKE_PREFIX_PATH ${BLAS_DIR})
endif()
find_package( BLAS REQUIRED )

# Locate the host BLAS library.
# First look for BLIS or OpenBLAS strictly inside BLAS_DIR (its lib/zen3 and
# lib sub-directories are tried before BLAS_DIR itself); system locations are
# deliberately excluded via NO_DEFAULT_PATH. The search is skipped when
# BLAS_DIR is empty, because the hints would otherwise degrade to "/lib/zen3"
# and "/lib". If nothing is found, fall back to CMake's FindBLAS module.
if(NOT "${BLAS_DIR}" STREQUAL "")
  find_library(BLAS_LIBRARIES
    NAMES blis openblas
    HINTS "${BLAS_DIR}/lib/zen3" "${BLAS_DIR}/lib"
    PATHS "${BLAS_DIR}"
    NO_DEFAULT_PATH)
endif()

if(BLAS_LIBRARIES)
  message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
else()
  message(STATUS
    "No blis/openblas library found under BLAS_DIR='${BLAS_DIR}'; "
    "falling back to find_package(BLAS)")
  find_package(BLAS REQUIRED)
endif()
# find_package( BLAS REQUIRED )

# append math library, if found
find_library(MATH_LIBRARY m)
Expand Down Expand Up @@ -217,7 +227,6 @@ target_include_directories( xhplhip PUBLIC hip:device
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
)
# target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX /global/home/lulu/mun-node-3/blis-multi-thread/lib/zen3/libblis.so)
# Link xhplhip against rocBLAS and rocRAND (imported targets), the host BLAS
# found above, the MPI C++ libraries and OpenMP.
# NOTE(review): this is the keyword-less signature; adding PRIVATE/PUBLIC is
# only safe if every other target_link_libraries(xhplhip ...) call in the file
# is converted as well - confirm before changing.
target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX)

# Generate include/hplhip_config.h from the include/hplhip_config.hin template,
# substituting only @VAR@ references (@ONLY) and forcing LF line endings.
# NOTE(review): the output is written into the source tree rather than the
# binary dir; moving it would require adjusting include paths - confirm.
configure_file( include/hplhip_config.hin ${CMAKE_CURRENT_SOURCE_DIR}/include/hplhip_config.h @ONLY NEWLINE_STYLE LF )
Expand Down
2 changes: 1 addition & 1 deletion scripts/config/HPL_16GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
364032 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
4 Ps
4 Qs
Expand Down
2 changes: 1 addition & 1 deletion scripts/config/HPL_1GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
91008 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
1 Ps
1 Qs
Expand Down
2 changes: 1 addition & 1 deletion scripts/config/HPL_2GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
128256 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
2 Ps
1 Qs
Expand Down
2 changes: 1 addition & 1 deletion scripts/config/HPL_32GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
513024 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
8 Ps
4 Qs
Expand Down
2 changes: 1 addition & 1 deletion scripts/config/HPL_4GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
180864 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
2 Ps
2 Qs
Expand Down
4 changes: 2 additions & 2 deletions scripts/config/HPL_8GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
256512 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
4 Ps
2 Qs
Expand All @@ -19,7 +19,7 @@ HPL.out output file name (if any)
2 NDIVs
1 # of recursive panel fact.
2 RFACTs (0=left, 1=Crout, 2=Right)
8 # of broadcast
1 # of broadcast
6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast,7=BiDir)
1 # of lookahead depth
1 DEPTHs (>=0)
Expand Down
7 changes: 7 additions & 0 deletions scripts/env.mun.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Environment setup intended to be sourced before running xhplhip
# (presumably for the "mun" cluster, judging by the file name - confirm).
# Resets the module environment, loads ROCm 5.3.0 and appends the BLIS,
# Open MPI and ROCm library directories to the runtime library search path.
module reset

module load rocm/5.3.0-10584

# Disabled alternative: Spack-provided OpenBLAS and Open MPI.
# export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/global/software/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/openblas-0.3.20-qbm5uv3ntjerkx4jzrprmelytviwoq2e/lib:/global/software/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/openmpi-4.1.4-3z7jsddbvczl4duixalzrtap3q5nuvjk/lib"

# ${VAR:+...} emits the existing value plus ':' only when it is non-empty, so
# an unset LD_LIBRARY_PATH does not yield a leading empty entry (which the
# dynamic loader interprets as the current working directory).
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/global/home/lulu/blis/lib:/global/home/lulu/ompi/lib:/global/software_internal/rocm/rocm-5.3.0-10584/lib"
# export MPICH_GPU_SUPPORT_ENABLED=1
7 changes: 5 additions & 2 deletions scripts/mpirun_xhplhip.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ mpi_bin=${mpi_dir}/bin/mpiexec
mpi_lib=${mpi_dir}/lib
hpl_runscript=./run_xhplhip.sh

# ROCm library directory: <ROCM_PATH>/lib, with /opt/rocm as the root when
# ROCM_PATH is unset or empty.
rocm_dir="${ROCM_PATH:-/opt/rocm}/lib"

filename=HPL.dat

P=$(sed -n "11, 1p" ${filename} | awk '{print $1}')
Expand All @@ -18,9 +21,9 @@ num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))

export LD_LIBRARY_PATH=${mpi_lib}:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH="${rocm_dir}":$LD_LIBRARY_PATH
#Default MPI options
mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib ${mpi_args}"
# Default mpiexec options; any caller-supplied ${mpi_args} are appended last.
# rocm_dir already ends in /lib, so it is forwarded as-is (appending another
# "/lib" would export a non-existent <rocm>/lib/lib directory). The value is
# kept inside one quoted string instead of nesting double quotes.
mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH=${rocm_dir}:${LD_LIBRARY_PATH} ${mpi_args}"

${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript}
# ${mpi_bin} --hostfile hostfile --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript}
Expand Down
38 changes: 22 additions & 16 deletions scripts/run_hpl.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
#SBATCH -c 8
#SBATCH -t 2:00:00
##SBATCH -A VEN114
#SBATCH -A project_462000075
#SBATCH -J xhplhip
#SBATCH --gpu-bind=closest
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8
#SBATCH -p MI250-x4-IB
#SBATCH -w mun-node-4
#SBATCH --exclusive

#source ../env/env.crusher.sh
source ../env/env.lumi.sh
source env.mun.sh

NP=$SLURM_NPROCS
NODES=$SLURM_NNODES
Expand All @@ -21,20 +23,24 @@ LOG=log.hpl-gpu-${NP}np-${HOSTNAME}-${DATE}.txt

cp config/HPL_${NP}GPU.dat HPL.dat

EXE="../build/xhplhip"
CMD=""
CMD+="srun "
CMD+="-v "
CMD+="-n $NP "
CMD+="-N $NODES "
CMD+="-A VEN114 "
CMD+="--gpu-bind=closest "
CMD+="--ntasks-per-node=8 "
CMD+="--gpus-per-node=8 "
CMD+="-c 8 "
CMD+="-o $LOG -e $LOG "
#CMD+="${HOME}/mpich_bind.sh "
CMD+="$EXE"
# EXE="../build/xhplhip"
# CMD=""
# CMD+="srun "
# CMD+="-v "
# CMD+="-n $NP "
# CMD+="-N $NODES "
# CMD+="-A VEN114 "
# CMD+="--gpu-bind=closest "
# CMD+="--ntasks-per-node=8 "
# CMD+="--gpus-per-node=8 "
# CMD+="-c 8 "
# CMD+="-o $LOG -e $LOG "
# #CMD+="${HOME}/mpich_bind.sh "
# CMD+="$EXE"

sh mpirun_xhplhip.sh



if [ $NODES -gt 8 ]; then
echo "export FI_MR_CACHE_MAX_COUNT=0"
Expand Down
7 changes: 4 additions & 3 deletions scripts/run_xhplhip.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
# set -x #echo on

hpl_bin=./xhplhip
rocblas_dir=/opt/rocm-5.2.0/lib
blas_dir=/global/home/lulu/hyc/rocHPL-main/tpl/blis/lib
# rocBLAS library directory: <ROCM_PATH>/lib, with /opt/rocm as the root when
# ROCM_PATH is unset or empty.
rocblas_dir="${ROCM_PATH:-/opt/rocm}/lib"
blas_dir="${BLAS_DIR}/lib/zen3"

filename=./HPL.dat
p=-1
Expand Down Expand Up @@ -229,4 +230,4 @@ if [[ $globalRank -lt $size ]]; then
echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] CPU Cores: $omp_num_threads - $places"
fi
#run
${hpl_bin}
${hpl_bin} -p ${p} -q ${q}
2 changes: 1 addition & 1 deletion src/grid/HPL_grid_init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ int HPL_grid_init
local_mycol = local_rank % q; local_myrow = local_rank / q;
int noderow = node / (NPCOL / q); int nodecol = node % (NPCOL / q);
myrow = noderow * p + local_myrow; mycol = nodecol * q + local_mycol;
myrow = rank / NPCOL; mycol = rank - myrow * NPCOL;
// myrow = rank / NPCOL; mycol = rank - myrow * NPCOL;
}
else
{
Expand Down
8 changes: 5 additions & 3 deletions testing/backend/HPL_backendHIP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ void HIP::init(const HPL_T_grid* GRID)
HIP_CHECK_ERROR(hipGetDeviceCount(&count));
//TODO: set dynamic device id
int device_id = localRank % count;
// printf("host: %s, device id: %d, myrow: %d, mycol: %d\n", host_name, device_id, GRID->local_myrow, GRID->local_mycol);
HIP_CHECK_ERROR(hipSetDevice(device_id));

// Get device properties
Expand Down Expand Up @@ -1653,7 +1654,7 @@ void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int
*/
PANEL->ldl2 = 0; /* local leading dim of array L2 */
PANEL->dldl2 = 0; /* local leading dim of array L2 */
PANEL->dldl1 = 1.02 * A->dN; // padding
PANEL->dldl1 = 1.015 * A->dN; // padding to avoid stack overflow
PANEL->len = 0; /* length of the buffer to broadcast */
PANEL->nu0 = 0;
PANEL->nu1 = 0;
Expand All @@ -1666,9 +1667,10 @@ void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int
/*Split fraction*/
const double fraction = 0.6;

if ((double)M / A->dN > 0.97) {
// get the panel init time
if ((double)M / A->dN > 0.985) {
HPL_ptimer_boot();
HPL_ptimer( 0 );
HPL_ptimer( HPL_rzero );
}
dalign = ALGO->align * sizeof(double);
size_t lpiv = (5 * JB * sizeof(int) + sizeof(double) - 1) / (sizeof(double));
Expand Down
2 changes: 1 addition & 1 deletion testing/ptest/HPL_pddriver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ int main( ARGC, ARGV )

algo.fswap = fswap; algo.fsthr = tswap;
algo.equil = equil; algo.align = align;

HPL_pdtest( &test, &grid, &algo, nval[in], nbval[inb] );

}
Expand Down
2 changes: 1 addition & 1 deletion testing/ptest/HPL_pdtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ void HPL_pdtest
(void) vsip_blockrelease_d( mat.block, VSIP_TRUE );
vsip_blockdestroy_d( mat.block );
#endif

/*
* Gather max of all CPU and WALL clock timings and print timing results
*/
Expand Down Expand Up @@ -510,7 +511,6 @@ void HPL_pdtest
HPL_fprintf( TEST->outfp, "%s%16.8e%s%s\n",
"||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ", resid1,
" ...... ", ( resid1 < TEST->thrsh ? "PASSED" : "FAILED" ) );

if( resid1 >= TEST->thrsh )
{
HPL_fprintf( TEST->outfp, "%s%18.6f\n",
Expand Down

0 comments on commit 6d0d889

Please sign in to comment.