From 016d4e94bee6f4b7ed25219e017943b179cceeca Mon Sep 17 00:00:00 2001
From: binbin <oudwwv@163.com>
Date: Mon, 25 Apr 2022 06:41:35 +0000
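Subject: [PATCH] tune swap/update split, dlaswp shared memory and stream ordering

Performance tuning, three changes:

- HPL_pdgesvK2: move the row-swap/update split point from nn * 3 to
  nn * 4 columns (the remainder becomes nq - nn * 4).
- _dlaswp00N: shrink the two __shared__ double buffers from 2048 to 512
  elements each (pdlaswp launches the kernel with block_size = 512).
- HIP::dgemm records the dgemmStart event on computeStream before calling
  rocblas_dgemm, and HIP::pdlaswp makes pdlaswpStream wait on that event
  before launching _dlaswp00N.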
---
 src/pgesv/HPL_pdgesvK2.cpp         | 8 ++++----
 testing/backend/HPL_backendHIP.cpp | 6 ++++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/pgesv/HPL_pdgesvK2.cpp b/src/pgesv/HPL_pdgesvK2.cpp
index 052875a..6ed000b 100644
--- a/src/pgesv/HPL_pdgesvK2.cpp
+++ b/src/pgesv/HPL_pdgesvK2.cpp
@@ -209,7 +209,7 @@ void HPL_pdgesvK2
             // overlap row swap2 with update1
             HPL_BE_stream_wait_event(HPL_COMPUTESTREAM, HPL_RS_1, T_HIP);
             (void) HPL_pdupdate( NULL, NULL, panel[k], nn );
-            HPL_BE_pdlaswp(panel[0], nn * 3, T_HIP);
+            HPL_BE_pdlaswp(panel[0], nn * 4, T_HIP);
             HPL_BE_event_record(HPL_RS_2, T_HIP);
 
          }
@@ -217,15 +217,15 @@ void HPL_pdgesvK2
 
          // split update and row swap step    
          // overlap row swap3 with update2
-         HPL_pdupdate( NULL, NULL, panel[0], nn * 3 );
+         HPL_pdupdate( NULL, NULL, panel[0], nn * 4 );
          
-         HPL_BE_pdlaswp(panel[0], nq - (nn * 3), T_HIP);        
+         HPL_BE_pdlaswp(panel[0], nq - (nn * 4), T_HIP);        
          HPL_BE_event_record(HPL_RS_3, T_HIP);
 
          //overlap row swap1 of next iteration with update3
          HPL_BE_stream_wait_event(HPL_COMPUTESTREAM, HPL_RS_3, T_HIP);
 
-         HPL_pdupdate( NULL, NULL, panel[0], nq - (nn * 3) );
+         HPL_pdupdate( NULL, NULL, panel[0], nq - (nn * 4) );
 
          // overlap update with data copy
          
diff --git a/testing/backend/HPL_backendHIP.cpp b/testing/backend/HPL_backendHIP.cpp
index 1e03d3a..8c95a46 100644
--- a/testing/backend/HPL_backendHIP.cpp
+++ b/testing/backend/HPL_backendHIP.cpp
@@ -728,6 +728,7 @@ void HIP::dgemm(const enum HPL_ORDER ORDER, const enum HPL_TRANS TRANSA,
     GPUInfo("%-25s %-8d%-8d%-8d \t%-5s", "[DGEMM]", "With C of (R:C)", M, N, K, "HIP");
 #if 1
     //rocBLAS uses column-major storage for 2D arrays
+    hipEventRecord(dgemmStart, computeStream);
     ROCBLAS_CHECK_STATUS(rocblas_dgemm(_handle, (rocblas_operation)TRANSA, (rocblas_operation)TRANSB, 
                          M, N, K, &ALPHA, A, LDA, B, LDB, &BETA, C, LDC));
 #else
@@ -861,8 +862,8 @@ __global__ void _dlaswp00N(const int N, const int M,
                      const int* __restrict__ IPIV) {
                     //  const int* IPIV) {
 
-   __shared__ double s_An_init[2048];
-   __shared__ double s_An_ipiv[2048];
+   __shared__ double s_An_init[512];
+   __shared__ double s_An_ipiv[512];
 
    const int m = threadIdx.x;
    const int n = blockIdx.x;
@@ -934,6 +935,7 @@ void HIP::pdlaswp(HPL_T_panel *PANEL, const int NN){
     mp   = PANEL->mp - jb;  nq0   = 0;       nn = n - nq0;
 
     const int block_size = 512, grid_size = nn;
+    hipStreamWaitEvent(pdlaswpStream, dgemmStart, 0);
     hipLaunchKernelGGL(_dlaswp00N, dim3(grid_size), dim3(block_size), 0, pdlaswpStream,
                                       nn, jb, Aptr, lda, ipiv);
 }