From 016d4e94bee6f4b7ed25219e017943b179cceeca Mon Sep 17 00:00:00 2001 From: binbin Date: Mon, 25 Apr 2022 06:41:35 +0000 Subject: [PATCH] tune row-swap split and _dlaswp00N shared-memory size; make pdlaswp wait on dgemmStart --- src/pgesv/HPL_pdgesvK2.cpp | 8 ++++---- testing/backend/HPL_backendHIP.cpp | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/pgesv/HPL_pdgesvK2.cpp b/src/pgesv/HPL_pdgesvK2.cpp index 052875a..6ed000b 100644 --- a/src/pgesv/HPL_pdgesvK2.cpp +++ b/src/pgesv/HPL_pdgesvK2.cpp @@ -209,7 +209,7 @@ void HPL_pdgesvK2 // overlap row swap2 with update1 HPL_BE_stream_wait_event(HPL_COMPUTESTREAM, HPL_RS_1, T_HIP); (void) HPL_pdupdate( NULL, NULL, panel[k], nn ); - HPL_BE_pdlaswp(panel[0], nn * 3, T_HIP); + HPL_BE_pdlaswp(panel[0], nn * 4, T_HIP); HPL_BE_event_record(HPL_RS_2, T_HIP); } @@ -217,15 +217,15 @@ void HPL_pdgesvK2 // split update and row swap step // overlap row swap3 with update2 - HPL_pdupdate( NULL, NULL, panel[0], nn * 3 ); + HPL_pdupdate( NULL, NULL, panel[0], nn * 4 ); - HPL_BE_pdlaswp(panel[0], nq - (nn * 3), T_HIP); + HPL_BE_pdlaswp(panel[0], nq - (nn * 4), T_HIP); HPL_BE_event_record(HPL_RS_3, T_HIP); //overlap row swap1 of next iteration with update3 HPL_BE_stream_wait_event(HPL_COMPUTESTREAM, HPL_RS_3, T_HIP); - HPL_pdupdate( NULL, NULL, panel[0], nq - (nn * 3) ); + HPL_pdupdate( NULL, NULL, panel[0], nq - (nn * 4) ); // overlap update with data copy diff --git a/testing/backend/HPL_backendHIP.cpp b/testing/backend/HPL_backendHIP.cpp index 1e03d3a..8c95a46 100644 --- a/testing/backend/HPL_backendHIP.cpp +++ b/testing/backend/HPL_backendHIP.cpp @@ -728,6 +728,7 @@ void HIP::dgemm(const enum HPL_ORDER ORDER, const enum HPL_TRANS TRANSA, GPUInfo("%-25s %-8d%-8d%-8d \t%-5s", "[DGEMM]", "With C of (R:C)", M, N, K, "HIP"); #if 1 //rocBLAS uses column-major storage for 2D arrays + hipEventRecord(dgemmStart, computeStream); ROCBLAS_CHECK_STATUS(rocblas_dgemm(_handle, (rocblas_operation)TRANSA, (rocblas_operation)TRANSB, M, N, K, &ALPHA, A, LDA, B, LDB, 
&BETA, C, LDC)); #else @@ -861,8 +862,8 @@ __global__ void _dlaswp00N(const int N, const int M, const int* __restrict__ IPIV) { // const int* IPIV) { - __shared__ double s_An_init[2048]; - __shared__ double s_An_ipiv[2048]; + __shared__ double s_An_init[512]; + __shared__ double s_An_ipiv[512]; const int m = threadIdx.x; const int n = blockIdx.x; @@ -934,6 +935,7 @@ void HIP::pdlaswp(HPL_T_panel *PANEL, const int NN){ mp = PANEL->mp - jb; nq0 = 0; nn = n - nq0; const int block_size = 512, grid_size = nn; + hipStreamWaitEvent(pdlaswpStream, dgemmStart, 0); hipLaunchKernelGGL(_dlaswp00N, dim3(grid_size), dim3(block_size), 0, pdlaswpStream, nn, jb, Aptr, lda, ipiv); }