From 0e44ecd48e2cfa404a0c9fb9ac040f04d9a0775e Mon Sep 17 00:00:00 2001 From: reger-men Date: Mon, 5 Jul 2021 01:46:35 +0000 Subject: [PATCH] move data H<->D --- include/backend/hpl_backendHIP.h | 5 +++ include/backend/hpl_backendWrapper.h | 8 ++++ src/pfact/HPL_dlocmax.cpp | 4 +- testing/backend/HPL_backendHIP.cpp | 57 ++++++++++++++++++++++++-- testing/backend/HPL_backendWrapper.cpp | 15 +++++++ testing/ptest/HPL_pdtest.cpp | 42 +++++++++++++------ 6 files changed, 115 insertions(+), 16 deletions(-) diff --git a/include/backend/hpl_backendHIP.h b/include/backend/hpl_backendHIP.h index c5e3ca2..c29c504 100644 --- a/include/backend/hpl_backendHIP.h +++ b/include/backend/hpl_backendHIP.h @@ -7,6 +7,8 @@ #include #include +#include +#include extern "C" { #include "hpl_pmatgen.h" @@ -118,8 +120,11 @@ namespace HIP { void atcpy(const int, const int, const double *, const int, double *, const int); + void move_data(double *, const double *, const size_t, const int); + // BLAS members namespace { rocblas_handle _handle; + std::map _memcpyKind; } } \ No newline at end of file diff --git a/include/backend/hpl_backendWrapper.h b/include/backend/hpl_backendWrapper.h index 8ee4843..3f97aa7 100644 --- a/include/backend/hpl_backendWrapper.h +++ b/include/backend/hpl_backendWrapper.h @@ -15,6 +15,12 @@ extern "C" { #include "hpl_panel.h" enum HPL_TARGET {T_DEFAULT, T_CPU, T_HIP, T_TEMPO}; +enum HPL_MOVE_DIRECTION {M_H2H = 0, + M_H2D = 1, + M_D2H = 2, + M_D2D = 3, + M_DEFAULT = 4}; + void HPL_BE_init(size_t, enum HPL_TARGET); @@ -76,6 +82,8 @@ void HPL_BE_dlacpy(const int, const int, const double *, const int, double *, co void HPL_BE_dlatcpy(const int, const int, const double *, const int, double *, const int, enum HPL_TARGET); +void HPL_BE_move_data(double *, const double *, const size_t, enum HPL_MOVE_DIRECTION, enum HPL_TARGET); + #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/src/pfact/HPL_dlocmax.cpp b/src/pfact/HPL_dlocmax.cpp index 6446414..1d8f03b 100644 --- a/src/pfact/HPL_dlocmax.cpp +++ b/src/pfact/HPL_dlocmax.cpp @@ -122,7 +122,9 @@ void HPL_dlocmax myrow = PANEL->grid->myrow; nprow = PANEL->grid->nprow; nb = PANEL->nb; - kk = PANEL->ii + II + ( ilindx = HPL_idamax( N, A, 1 ) ); + //Adil + kk = PANEL->ii + II + ( ilindx = HPL_BE_idamax(N, A, 1, T_DEFAULT)); + /*kk = PANEL->ii + II + ( ilindx = HPL_idamax( N, A, 1 ) );*/ Mindxl2g( igindx, kk, nb, nb, myrow, 0, nprow ); /* * WORK[0] := local maximum absolute value scalar, diff --git a/testing/backend/HPL_backendHIP.cpp b/testing/backend/HPL_backendHIP.cpp index eed0389..b758725 100644 --- a/testing/backend/HPL_backendHIP.cpp +++ b/testing/backend/HPL_backendHIP.cpp @@ -23,7 +23,7 @@ void HIP::init(size_t num_gpus) GPUInfo("%-25s %-20lld", "[SharedMem]", "Shared Memory Per Block", (unsigned long long int)hipDeviceProp.sharedMemPerBlock); GPUInfo("%-25s %-20d", "[Regs]", "Registers Per Block", hipDeviceProp.regsPerBlock); GPUInfo("%-25s %-20d", "[WarpSize]", "WaveFront Size", hipDeviceProp.warpSize); - GPUInfo("%-25s %-20d", "[MaxThreads]", "Max Threads Per Block", hipDeviceProp.warpSize); + GPUInfo("%-25s %-20d", "[MaxThreads]", "Max Threads Per Block", hipDeviceProp.maxThreadsPerBlock); GPUInfo("%-25s %-4d %-4d %-4d", "[MaxThreadsDim]", "Max Threads Dimension", hipDeviceProp.maxThreadsDim[0], hipDeviceProp.maxThreadsDim[1], hipDeviceProp.maxThreadsDim[2]); GPUInfo("%-25s %-4d %-4d %-4d", "[MaxGridSize]", "Max Grid Size", hipDeviceProp.maxGridSize[0], hipDeviceProp.maxGridSize[1], hipDeviceProp.maxGridSize[2]); GPUInfo("%-25s %-20lld", "[ConstMem]", "Total Constant Memory", (unsigned long long int)hipDeviceProp.totalConstMem); @@ -38,6 +38,12 @@ void HIP::init(size_t num_gpus) //Init ROCBlas rocblas_initialize(); ROCBLAS_CHECK_STATUS(rocblas_create_handle(&_handle)); + + _memcpyKind[0] = "H2H"; + _memcpyKind[1] = "H2D"; + _memcpyKind[2] = "D2H"; + _memcpyKind[3] = "D2D"; + _memcpyKind[4] = "DEFAULT"; } void HIP::release() @@ -106,7 +112,7 @@ void HIP::matgen(const HPL_T_grid *GRID, const int M, const int N, //TODO: generate numbers in this range (-0.5, 0.5] ROCRAND_CHECK_STATUS(rocrand_generate_normal_double(generator, A, mp*nq, 0, 0.1)); ROCRAND_CHECK_STATUS(rocrand_destroy_generator(generator)); - //gPrintMat(M,N,LDA,A); + //gPrintMat(5,5,LDA,A); } int HIP::idamax(const int N, const double *DX, const int INCX) @@ -150,9 +156,26 @@ void HIP::trsm( const enum HPL_ORDER ORDER, const enum HPL_SIDE SIDE, const double ALPHA, const double *A, const int LDA, double *B, const int LDB) { GPUInfo("%-25s %-8d%-8d \t%-5s", "[TRSM]", "With B of (R:C)", M, N, "HIP"); +#if 0 //rocBLAS uses column-major storage for 2D arrays ROCBLAS_CHECK_STATUS(rocblas_dtrsm(_handle, (rocblas_side)SIDE, (rocblas_fill)UPLO, (rocblas_operation)TRANSA, (rocblas_diagonal)DIAG, M, N, &ALPHA, A, LDA, B, LDB)); +#else + double * d_A, * d_B; + HIP::malloc((void**)&d_A, LDA*M*sizeof(double)); + HIP::malloc((void**)&d_B, LDB*N*sizeof(double)); + + HIP::move_data(d_A, A, LDA*M*sizeof(double), 1); + HIP::move_data(d_B, B, LDB*N*sizeof(double), 1); + + ROCBLAS_CHECK_STATUS(rocblas_dtrsm(_handle, (rocblas_side)SIDE, (rocblas_fill)UPLO, (rocblas_operation)TRANSA, + (rocblas_diagonal)DIAG, M, N, &ALPHA, d_A, LDA, d_B, LDB)); + + HIP::move_data(B, d_B, LDB*N*sizeof(double), 2); + + HIP::free((void**)&d_A); + HIP::free((void**)&d_B); +#endif } void HIP::trsv(const enum HPL_ORDER ORDER, const enum HPL_UPLO UPLO, @@ -172,10 +195,32 @@ void HIP::dgemm(const enum HPL_ORDER ORDER, const enum HPL_TRANS TRANSA, const double *B, const int LDB, const double BETA, double *C, const int LDC) { - GPUInfo("%-25s %-8d%-8d \t%-5s", "[DGEMM]", "With C of (R:C)", M, N, "HIP"); + GPUInfo("%-25s %-8d%-8d \t%-5s", "[DGEMM]", "With C of (R:C)", LDC, N, "HIP"); +#if 0 //rocBLAS uses column-major storage for 2D arrays ROCBLAS_CHECK_STATUS(rocblas_dgemm(_handle, (rocblas_operation)TRANSA, (rocblas_operation)TRANSB, M, N, K, &ALPHA, A, LDA, B, LDB, &BETA, C, LDC)); +#else + double * d_A, * d_B, * d_C; + HIP::malloc((void**)&d_A, LDA*K*sizeof(double)); + HIP::malloc((void**)&d_B, LDB*N*sizeof(double)); + HIP::malloc((void**)&d_C, LDC*N*sizeof(double)); + + HIP::move_data(d_A, A, LDA*K*sizeof(double), 1); + HIP::move_data(d_B, B, LDB*N*sizeof(double), 1); + HIP::move_data(d_C, C, LDC*N*sizeof(double), 1); + + ROCBLAS_CHECK_STATUS(rocblas_dgemm(_handle, (rocblas_operation)TRANSA, (rocblas_operation)TRANSB, + M, N, K, &ALPHA, d_A, LDA, d_B, LDB, &BETA, d_C, LDC)); + + HIP::move_data(C, d_C, LDC*N*sizeof(double), 2); + + HIP::free((void**)&d_A); + HIP::free((void**)&d_B); + HIP::free((void**)&d_C); +#endif + + hipDeviceSynchronize(); } void HIP::dgemv(const enum HPL_ORDER ORDER, const enum HPL_TRANS TRANS, const int M, const int N, @@ -240,3 +285,9 @@ void HIP::atcpy(const int M, const int N, const double *A, const int LDA, _dlatcpy<<>>(M, N, A, LDA, B, LDB); } +void HIP::move_data(double *DST, const double *SRC, const size_t SIZE, const int KIND) +{ + char title[25] = "[MOVE_"; strcat(title,_memcpyKind[KIND]); strcat(title,"]"); + GPUInfo("%-25s %-12ld (B) \t%-5s", title, "Memory of size", SIZE, "HIP"); + HIP_CHECK_ERROR(hipMemcpy(DST, SRC, SIZE, (hipMemcpyKind)KIND)); +} \ No newline at end of file diff --git a/testing/backend/HPL_backendWrapper.cpp b/testing/backend/HPL_backendWrapper.cpp index 4fa928b..2a1aaba 100644 --- a/testing/backend/HPL_backendWrapper.cpp +++ b/testing/backend/HPL_backendWrapper.cpp @@ -335,4 +335,19 @@ extern "C" { HPL::dispatch(CPU::atcpy, M, N, A, LDA, B, LDB); } } + + void HPL_BE_move_data(double *DST, const double *SRC, const size_t SIZE, + enum HPL_MOVE_DIRECTION KIND, enum HPL_TARGET TR) + { + switch(TR) { + case T_CPU : + DO_NOTHING(); + break; + case T_HIP: + HPL::dispatch(HIP::move_data, DST, SRC, SIZE, (int)KIND); + break; + default: + DO_NOTHING(); + } + } } //extern "C" \ No newline at end of file diff --git a/testing/ptest/HPL_pdtest.cpp b/testing/ptest/HPL_pdtest.cpp index 05c6914..5e74566 100644 --- a/testing/ptest/HPL_pdtest.cpp +++ b/testing/ptest/HPL_pdtest.cpp @@ -161,12 +161,13 @@ void HPL_pdtest /* * Allocate dynamic memory */ - //Adil - HPL_BE_malloc((void**)&vptr, ((size_t)(ALGO->align) + (size_t)(mat.ld+1) * (size_t)(mat.nq) ) * sizeof(double), T_TEMPO); - /*vptr = (void*)malloc( ( (size_t)(ALGO->align) + - (size_t)(mat.ld+1) * (size_t)(mat.nq) ) * - sizeof(double) ); - */ + //Adil: temp: generate mat on CPU and move it to the CPU. FIXME: Generate the correct Matrix. + size_t bytes = ((size_t)(ALGO->align) + (size_t)(mat.ld+1) * (size_t)(mat.nq) ) * sizeof(double); + + void * d_vptr = NULL; + HPL_BE_malloc((void**)&d_vptr, bytes, T_HIP); + vptr = (void*)malloc(bytes); + info[0] = (vptr == NULL); info[1] = myrow; info[2] = mycol; (void) HPL_all_reduce( (void *)(info), 3, HPL_INT, HPL_max, GRID->all_comm ); @@ -179,7 +180,7 @@ void HPL_pdtest (TEST->kskip)++; /* some processes might have succeeded with allocation */ //Adil - if( vptr ) HPL_BE_free((void**)&vptr, T_TEMPO); + if( vptr ) HPL_BE_free((void**)&vptr, T_DEFAULT); /*if (vptr) free(vptr);*/ return; } @@ -189,8 +190,25 @@ void HPL_pdtest mat.A = (double *)HPL_PTR( vptr, ((size_t)(ALGO->align) * sizeof(double) ) ); mat.X = Mptr( mat.A, 0, mat.nq, mat.ld ); //Adil - HPL_BE_dmatgen(GRID, N, N+1, NB, mat.A, mat.ld, HPL_ISEED, T_TEMPO); + HPL_BE_dmatgen(GRID, N, N+1, NB, mat.A, mat.ld, HPL_ISEED, T_CPU); //HPL_pdmatgen( GRID, N, N+1, NB, mat.A, mat.ld, HPL_ISEED ); + +#if 0 + mat.d_A = (double *)HPL_PTR( d_vptr, ((size_t)(ALGO->align) * sizeof(double) ) ); + mat.d_X = Mptr( mat.d_A, 0, mat.nq, mat.ld ); + HPL_BE_move_data(mat.d_A, mat.A, (N*(N+1)) * sizeof(double), M_H2D, T_HIP); + /*{ + // Last row is the vector b + for(int y=0;y<6; y++){ + for(int x=0;x<4; x++){ + int index = x+y*mat.ld; + printf("%-4d:%-8lf\t", index, mat.d_A[index]); + } + printf("\n"); + } + }*/ +#endif + #ifdef HPL_CALL_VSIPL mat.block = vsip_blockbind_d( (vsip_scalar_d *)(mat.A), (vsip_length)(mat.ld * mat.nq), @@ -331,7 +349,7 @@ void HPL_pdtest (TEST->kpass)++; //Adil - if( vptr ) HPL_BE_free((void**)&vptr, T_TEMPO); + if( vptr ) HPL_BE_free((void**)&vptr, T_DEFAULT); /*if( vptr ) free( vptr ); */ return; } @@ -353,7 +371,7 @@ void HPL_pdtest * and norm inf of b - A x. Display residual checks. */ //Adil - HPL_BE_dmatgen(GRID, N, N+1, NB, mat.A, mat.ld, HPL_ISEED, T_TEMPO); + HPL_BE_dmatgen(GRID, N, N+1, NB, mat.A, mat.ld, HPL_ISEED, T_DEFAULT); /*HPL_pdmatgen( GRID, N, N+1, NB, mat.A, mat.ld, HPL_ISEED );*/ Anorm1 = HPL_pdlange( GRID, HPL_NORM_1, N, N, NB, mat.A, mat.ld ); AnormI = HPL_pdlange( GRID, HPL_NORM_I, N, N, NB, mat.A, mat.ld ); @@ -391,7 +409,7 @@ void HPL_pdtest { //Adil HPL_BE_dgemv( HplColumnMajor, HplNoTrans, mat.mp, nq, -HPL_rone, - mat.A, mat.ld, mat.X, 1, HPL_rone, Bptr, 1, T_TEMPO); + mat.A, mat.ld, mat.X, 1, HPL_rone, Bptr, 1, T_DEFAULT); /*HPL_dgemv( HplColumnMajor, HplNoTrans, mat.mp, nq, -HPL_rone, mat.A, mat.ld, mat.X, 1, HPL_rone, Bptr, 1 );*/ } @@ -455,7 +473,7 @@ void HPL_pdtest } } //Adil - if( vptr ) HPL_BE_free((void**)&vptr, T_TEMPO); + if( vptr ) HPL_BE_free((void**)&vptr, T_DEFAULT); //if( vptr ) free( vptr ); /* * End of HPL_pdtest