Skip to content

Commit

Permalink
Optimize multi-node running
Browse files Browse the repository at this point in the history
  • Loading branch information
lueelu committed Jul 4, 2022
1 parent 2457e76 commit f09d648
Show file tree
Hide file tree
Showing 11 changed files with 101 additions and 27 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/mpirun_xhplhip.sh
DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/run_xhplhip.sh
DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/hostfile
DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
# don't need MPI C++ bindings, now deprecated
if(NOT DEFINED MPI_DIR)
if(DEFINED ENV{MPI_DIR})
Expand Down
2 changes: 1 addition & 1 deletion HPL.dat
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ HPL.out output file name (if any)
1 # of recursive panel fact.
2 RFACTs (0=left, 1=Crout, 2=Right)
1 # of broadcast
6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast)
0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast)
1 # of lookahead depth
1 DEPTHs (>=0)
1 SWAP (0=bin-exch,1=long,2=mix)
Expand Down
1 change: 1 addition & 0 deletions include/backend/hpl_backendCPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

extern "C" {
//#include "hpl.h"
Expand Down
2 changes: 2 additions & 0 deletions include/backend/hpl_backendHIP.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ namespace HIP {
float elapsedTime(const HPL_T_UPD);
void device_sync();
int bcast_ibcst(HPL_T_panel*, int*);
int bwait_ibcast(HPL_T_panel* PANEL);
/*
* ----------------------------------------------------------------------
* - BLAS ---------------------------------------------------------------
Expand Down Expand Up @@ -178,6 +179,7 @@ namespace HIP {
hipEvent_t swapStartEvent[HPL_N_UPD], update[HPL_N_UPD];
hipEvent_t swapUCopyEvent[HPL_N_UPD], swapWCopyEvent[HPL_N_UPD];
hipEvent_t dgemmStart[HPL_N_UPD], dgemmStop[HPL_N_UPD];
MPI_Request bcast_req;
std::map<int, const char*> _memcpyKind;
}
}
4 changes: 4 additions & 0 deletions scripts/hostfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
172.16.12.5 slots=8
172.16.12.6 slots=8
172.16.12.7 slots=8
172.16.12.8 slots=8
10 changes: 5 additions & 5 deletions scripts/mpirun_xhplhip.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,20 @@ mpi_bin=${mpi_dir}/bin/mpiexec
mpi_lib=${mpi_dir}/lib
hpl_runscript=./run_xhplhip.sh

filename=./HPL.dat
filename=HPL.dat

P=$(sed -n "11, 1p" ${filename} | awk '{print $1}')
Q=$(sed -n "12, 1p" ${filename} | awk '{print $1}')
np=$(($P*$Q))

# count the number of physical cores
echo ${np}
num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}')
num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))

export LD_LIBRARY_PATH=${mpi_lib}:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
#Default MPI options
mpi_args="--map-by node:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}"
mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx --report-bindings -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib ${mpi_args}"

${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript}
${mpi_bin} --hostfile hostfile --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript}
grep --color "e+" HPL.out
12 changes: 10 additions & 2 deletions src/pgesv/HPL_pdgesvK2_HIP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ void HPL_pdgesvK2_HIP
*/
float smallDgemmTime, largeDgemm1Time, largeDgemm2Time;
double smallDgemmGflops, pdfactGflops, largeDgemm1Gflops, largeDgemm2Gflops;
double stepStart, stepEnd, pdfactStart, pdfactEnd;
double stepStart, stepEnd, pdfactStart, pdfactEnd, bcastStart, bcastEnd;
for( j = jstart; j < N; j += nb )
{
stepStart = MPI_Wtime();
Expand Down Expand Up @@ -232,7 +232,13 @@ void HPL_pdgesvK2_HIP
HPL_pdupdate(HPL_UPD_2, NULL, NULL, panel[0], panel[0]->nu2);
}

HPL_pdpanel_bcast(panel[1]);
bcastStart = MPI_Wtime();
(void) HPL_binit( panel[1] );
do
{ (void) HPL_bcast( panel[1], &test ); }
while( test != HPL_SUCCESS );
(void) HPL_bwait( panel[1] );
bcastEnd = MPI_Wtime();

// start local row swapping for second part
HIP::HPL_pdlaswp_hip(panel[1], icurcol, {SU2});
Expand Down Expand Up @@ -289,9 +295,11 @@ void HPL_pdgesvK2_HIP
if (panel[0]->nu0) {
printf("Small DGEMM Gflops=%9.3e ", smallDgemmGflops);
printf("pdfact Gflops=%9.3e ", pdfactGflops);
printf("Bcast Time(ms)=%9.7f ", (bcastEnd - bcastStart)*1000);
} else {
printf("Small DGEMM Gflops=--------- ");
printf("pdfact Gflops=--------- ");
printf("Bcast Time(ms)=--------- ");
}
if (panel[0]->nu2) {
printf("DGEMM1 Gflops=%9.3e ", largeDgemm2Gflops);
Expand Down
80 changes: 64 additions & 16 deletions testing/backend/HPL_backendCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -432,11 +432,50 @@ void CPU::HPL_idamax_omp(const int N, const double* X, const int INCX, const int
}
}
}

MPI_Op HPL_DMXSWP;
MPI_Datatype PDFACT_ROW;

/* Swap-broadcast comparison function usable in MPI_Allreduce */
void HPL_dmxswp(void* invec, void* inoutvec, int* len,
MPI_Datatype* datatype) {

assert(*datatype == PDFACT_ROW);
assert(*len == 1);

int N;
MPI_Type_size(PDFACT_ROW, &N);

double* Wwork = static_cast<double*>(invec);
double* WORK = static_cast<double*>(inoutvec);

const int jb = ((N/sizeof(double))-4)/2;

//check max column value and overwirte row if new max is found
const double gmax = Mabs(WORK[0]);
const double tmp1 = Mabs(Wwork[0]);
if((tmp1 > gmax) || ((tmp1 == gmax) && (Wwork[3] < WORK[3]))) {
HPL_dcopy(jb+4, Wwork, 1, WORK, 1);
}

// Add the input top row to the inout top row.
HPL_daxpy(jb, 1.0, Wwork+jb+4, 1, WORK+jb+4, 1);

}

const int max_req = 128;
MPI_Request reqs[max_req];
int req_idx = 0;

void CPU::HPL_all_reduce_dmxswp(double* BUFFER, const int COUNT, const int ROOT, MPI_Comm COMM, double* WORK)
{

#if 0
MPI_Op_create(HPL_dmxswp, true, &HPL_DMXSWP);
MPI_Request req;
(void) MPI_Iallreduce(MPI_IN_PLACE, BUFFER, 1, PDFACT_ROW, HPL_DMXSWP, COMM, &req);
MPI_Wait(&req, MPI_STATUS_IGNORE);

#else
double gmax, tmp1;
double * A0, *Wmx;
unsigned int hdim, ip2, ip2_, ipow, k, mask;
Expand Down Expand Up @@ -465,15 +504,21 @@ void CPU::HPL_all_reduce_dmxswp(double* BUFFER, const int COUNT, const int ROOT,

if((Np2 != 0) && ((partner = (int)((unsigned int)(mydist) ^ ip2)) < nprow)) {
if((mydist & ip2) != 0) {
if(mydist == (int)(ip2))
(void)HPL_sdrv(BUFFER, cnt_, MSGID_BEGIN_PFACT, A0, n0, MSGID_BEGIN_PFACT, MModAdd(partner, icurrow, nprow), COMM);
else
(void)HPL_send(BUFFER, cnt_, MModAdd(partner, icurrow, nprow), MSGID_BEGIN_PFACT, COMM);
if(mydist == (int)(ip2)) {
int mpartner = MModAdd(partner, icurrow, nprow);
MPI_Sendrecv(BUFFER, cnt_, MPI_DOUBLE, mpartner, MSGID_BEGIN_PFACT, A0, n0, MPI_DOUBLE, mpartner, MSGID_BEGIN_PFACT, COMM, MPI_STATUS_IGNORE);
}
else {
MPI_Isend(BUFFER, cnt_, MPI_DOUBLE, MModAdd(partner, icurrow, nprow), MSGID_BEGIN_PFACT, COMM, &reqs[req_idx++]);
}
} else {
if(mydist == 0)
(void)HPL_sdrv(A0, n0, MSGID_BEGIN_PFACT, WORK, cnt_, MSGID_BEGIN_PFACT, MModAdd(partner, icurrow, nprow), COMM);
else
(void)HPL_recv(WORK, cnt_, MModAdd(partner, icurrow, nprow), MSGID_BEGIN_PFACT, COMM);
if(mydist == 0) {
int mpartner = MModAdd(partner, icurrow, nprow);
MPI_Sendrecv(A0, n0, MPI_DOUBLE, mpartner, MSGID_BEGIN_PFACT, WORK, cnt_, MPI_DOUBLE, mpartner, MSGID_BEGIN_PFACT, COMM, MPI_STATUS_IGNORE);
}
else {
MPI_Irecv(WORK, cnt_, MPI_DOUBLE, MModAdd(partner, icurrow, nprow), MSGID_BEGIN_PFACT, COMM, &reqs[req_idx++]);
}

tmp1 = Mabs(WORK[0]);
gmax = Mabs(BUFFER[0]);
Expand Down Expand Up @@ -501,7 +546,8 @@ void CPU::HPL_all_reduce_dmxswp(double* BUFFER, const int COUNT, const int ROOT,
}

partner = (int)((unsigned int)(mydist) ^ ipow);
(void)HPL_sdrv(BUFFER, scnt, MSGID_BEGIN_PFACT, WORK, rcnt, MSGID_BEGIN_PFACT, MModAdd(partner, icurrow, nprow), COMM);
int mpartner = MModAdd(partner, icurrow, nprow);
MPI_Sendrecv(BUFFER, scnt, MPI_DOUBLE, mpartner, MSGID_BEGIN_PFACT, WORK, rcnt, MPI_DOUBLE, mpartner, MSGID_BEGIN_PFACT, COMM, MPI_STATUS_IGNORE);

tmp1 = Mabs(WORK[0]);
gmax = Mabs(BUFFER[0]);
Expand Down Expand Up @@ -532,23 +578,25 @@ void CPU::HPL_all_reduce_dmxswp(double* BUFFER, const int COUNT, const int ROOT,
if((mydis_ & mask) == 0) {
partner = (int)(mydis_ ^ ip2_);
if((mydis_ & ip2_) != 0) {
(void)HPL_recv(
A0, n0, MModAdd(root, partner, nprow), MSGID_BEGIN_PFACT, COMM);
MPI_Irecv(A0, n0, MPI_DOUBLE, MModAdd(root, partner, nprow), MSGID_BEGIN_PFACT, COMM, &reqs[req_idx++]);
} else if(partner < size_) {
(void)HPL_send(
A0, n0, MModAdd(root, partner, nprow), MSGID_BEGIN_PFACT, COMM);
MPI_Isend(A0, n0, MPI_DOUBLE, MModAdd(root, partner, nprow), MSGID_BEGIN_PFACT, COMM, &reqs[req_idx++]);
}
}
ip2_ >>= 1;
} while(ip2_ > 0);
}
if((Np2 != 0) && ((partner = (int)((unsigned int)(mydist) ^ ip2)) < nprow)) {
if((mydist & ip2) != 0) {
(void)HPL_recv(BUFFER, cnt_, MModAdd(partner, icurrow, nprow), MSGID_BEGIN_PFACT, COMM);
MPI_Irecv(BUFFER, cnt_, MPI_DOUBLE, MModAdd(partner, icurrow, nprow), MSGID_BEGIN_PFACT, COMM, &reqs[req_idx++]);
} else {
(void)HPL_send(BUFFER, cnt_, MModAdd(partner, icurrow, nprow), MSGID_BEGIN_PFACT, COMM);
MPI_Isend(BUFFER, cnt_, MPI_DOUBLE, MModAdd(partner, icurrow, nprow), MSGID_BEGIN_PFACT, COMM, &reqs[req_idx++]);
}
}

MPI_Waitall(req_idx, reqs, MPI_STATUSES_IGNORE);
req_idx = 0;
#endif
}

void CPU::HPL_set_zero(const int N, double* __restrict__ X) {
Expand Down
4 changes: 3 additions & 1 deletion testing/backend/HPL_backendCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,9 @@ int HPL_scatterv(double* BUF, const int* SCOUNT, const int* DISPL,
int HPL_allgatherv(double* BUF, const int SCOUNT, const int* RCOUNT,
const int* DISPL, MPI_Comm COMM) {

int ierr = MPI_Allgatherv(MPI_IN_PLACE, SCOUNT, MPI_DOUBLE, BUF, RCOUNT, DISPL, MPI_DOUBLE, COMM);
MPI_Request req;
int ierr = MPI_Iallgatherv(MPI_IN_PLACE, SCOUNT, MPI_DOUBLE, BUF, RCOUNT, DISPL, MPI_DOUBLE, COMM, &req);
MPI_Wait(&req, MPI_STATUS_IGNORE);

return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
}
Expand Down
9 changes: 8 additions & 1 deletion testing/backend/HPL_backendHIP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1348,7 +1348,14 @@ int HIP::bcast_ibcst(HPL_T_panel* PANEL, int* IFLAG) {
int root = PANEL->pcol;

if(PANEL->len <= 0) return HPL_SUCCESS;
int ierr = MPI_Bcast(L2ptr, PANEL->len, MPI_DOUBLE, root, comm);
int ierr = MPI_Ibcast(L2ptr, PANEL->len, MPI_DOUBLE, root, comm, &bcast_req);
*IFLAG = ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
return *IFLAG;
}

int HIP::bwait_ibcast(HPL_T_panel* PANEL) {
int ierr;
ierr = MPI_Wait(&bcast_req, MPI_STATUS_IGNORE);
return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
}

Expand Down
2 changes: 1 addition & 1 deletion testing/backend/HPL_backendWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ extern "C" {
case T_CPU :
return HPL_bwait_2rinM( PANEL );
case T_HIP:
return HPL_SUCCESS;
return HPL::dispatch(HIP::bwait_ibcast, PANEL);
default:
return HPL_SUCCESS;
}
Expand Down

0 comments on commit f09d648

Please sign in to comment.