WIP: Add CUDA version to HIP branch

ecmwf-ifs · Aug 11, 2023 · 498d0e8 · 498d0e8
1 parent e018588
commit 498d0e8
Show file tree

Hide file tree

Showing 27 changed files with 1,769 additions and 167 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -27,6 +27,7 @@ ecbuild_find_package( NAME fiat REQUIRED )
 
 ecbuild_add_option( FEATURE MPI
                     DESCRIPTION "Support for MPI distributed memory parallelism"
+                    REQUIRED_PACKAGES "MPI COMPONENTS Fortran C CXX"
                     CONDITION fiat_HAVE_MPI )
 
 ecbuild_add_option( FEATURE OMP
@@ -35,11 +36,11 @@ ecbuild_add_option( FEATURE OMP
                     REQUIRED_PACKAGES "OpenMP COMPONENTS Fortran" )
 
 ecbuild_add_option( FEATURE DOUBLE_PRECISION
-	DEFAULT ON
+                    DEFAULT ON
                     DESCRIPTION "Support for Double Precision" )
 
 ecbuild_add_option( FEATURE SINGLE_PRECISION
-	DEFAULT ON
+                    DEFAULT ON
                     DESCRIPTION "Support for Single Precision" )
 
 if( HAVE_SINGLE_PRECISION )
@@ -74,18 +75,46 @@ ecbuild_add_option( FEATURE TRANSI
                     CONDITION HAVE_DOUBLE_PRECISION AND HAVE_CPU )
 
 set( HIP_REQUESTED OFF )
+set( CUDA_REQUESTED OFF )
 if( ECTRANS_ENABLE_GPU OR (NOT DEFINED ECTRANS_ENABLE_GPU AND ENABLE_GPU))
-  set( HIP_REQUESTED ON )
+# TODO: allow to switch between CUDA and HIP or automatically detect target platform
+  set( CUDA_REQUESTED ON )
+# set( HIP_REQUESTED ON )
 endif()
 
 if( HIP_REQUESTED )
   ectrans_find_hip() # sets "HAVE_HIP"
-endif()           
+endif()
+
+if( CUDA_REQUESTED )
+  enable_language( CUDA )
+  set( HAVE_CUDA ON )
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES 70 80)
+  endif()
+  find_package( CUDAToolkit )
+endif()
 
 ecbuild_add_option( FEATURE GPU
                     DEFAULT OFF
                     DESCRIPTION "Compile GPU version of ectrans (Requires OpenACC or sufficient OpenMP offloading support and MPI)"
-                    CONDITION HAVE_HIP )
+                    CONDITION HAVE_HIP OR HAVE_CUDA )
+
+if( ${CMAKE_VERSION} VERSION_LESS "3.25" AND (NOT DEFINED ENABLE_ACC OR ENABLE_ACC ) )
+  # Incredibly inconvenient: FindOpenACC does _not_ set OpenACC_FOUND, only
+  # the language-specific components OpenACC_Fortran_FOUND and OpenACC_C_FOUND.
+  # This means, even internally CMake considers OpenACC as not found.
+  # (See eg get_property(... GLOBAL PROPERTY PACKAGES_NOT_FOUND))
+  # Therefore, we search for OpenACC, set OpenACC_FOUND ourselves according to
+  # the result, and then, trigger a second find_package via ecbuild_add_option.
+  # This then conveniently takes the previously set OpenACC_FOUND into account
+  # and rectifies CMake's internal bookkeeping in the process.
+  # This has been fixed in CMake 3.25
+  find_package( OpenACC )
+  if( OpenACC_Fortran_FOUND )
+    set( OpenACC_FOUND ON )
+  endif()
+endif()
 
 ecbuild_add_option( FEATURE ACC
                     DEFAULT ON
@@ -100,10 +129,18 @@ if( HAVE_GPU )
   else()
     ecbuild_error("Could not enable GPU as OMP or ACC were not enabled")
   endif()
+
+  if( HAVE_CUDA )
+    set( GPU_RUNTIME "CUDA" )
+  elseif( HAVE_HIP )
+    set( GPU_RUNTIME "HIP" )
+  else()
+    ecbuild_error("Could not enable GPU as CUDA or HIP were not found")
+  endif()
 endif()
 
 ecbuild_add_option( FEATURE GPU_AWARE_MPI
-	            DEFAULT OFF
+                    DEFAULT OFF
                     CONDITION HAVE_GPU
                     REQUIRED_PACKAGES "MPI COMPONENTS CXX Fortran"
                     DESCRIPTION "Enable CUDA-aware MPI" )

diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90
@@ -63,7 +63,7 @@ program transform_test
 ! Default parameters
 integer(kind=jpim) :: nsmax   = 79  ! Spectral truncation
 integer(kind=jpim) :: iters   = 10  ! Number of iterations for transform test
-integer(kind=jpim) :: nfld    = 1   ! Number of scalar fields 
+integer(kind=jpim) :: nfld    = 1   ! Number of scalar fields
 integer(kind=jpim) :: nlev    = 1   ! Number of vertical levels
 
 integer(kind=jpim) :: nflevg
@@ -608,6 +608,8 @@ program transform_test
   ! Do inverse transform
   !=================================================================================================
 
+  write(nout, *) 'inv_trans'
+  flush(nout)
   ztstep1(jstep) = timef()
   call gstats(4,0)
   if (lvordiv) then
@@ -658,6 +660,8 @@ program transform_test
 
   ztstep2(jstep) = timef()
 
+  write(nout, *) 'dir_trans'
+  flush(nout)
   call gstats(5,0)
   if (lvordiv) then
     call dir_trans(kresol=1, kproma=nproma, &

diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt
@@ -10,7 +10,7 @@
 
 if(CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC")
 
-  # Compile setup_trans with pinned memory to improve data movement performance. 
+  # Compile setup_trans with pinned memory to improve data movement performance.
   ectrans_add_compile_options(
       SOURCES external/setup_trans.F90
       #FLAGS   "-gpu=pinned,deepcopy,fastmath,nordc")
@@ -30,6 +30,8 @@ ecbuild_list_add_pattern( LIST trans_src
                           QUIET
                         )
 
+ecbuild_info( "${trans_src}")
+
 ## for reduced memory option, replace source files
 if( HAVE_GPU_REDUCED_MEMORY )
   ecbuild_list_add_pattern( LIST reducedmem_files
@@ -50,12 +52,26 @@ ecbuild_list_exclude_pattern( LIST trans_src            REGEX dilatation_mod.F90
   ecbuild_info("warn: special compile flags ftdir_mod.F90")
 #endif()
 
-ectrans_declare_hip_sources( SOURCES_GLOB 
-  sharedmem/*.hip.cpp
-  algor/*.hip.cpp
-  internal/*.hip.cpp
-  external/*.hip.cpp
-)
+if( HIP_FOUND )
+  ectrans_declare_hip_sources( SOURCES_GLOB
+    sharedmem/*.hip.cpp
+    algor/*.hip.cpp
+    internal/*.hip.cpp
+    external/*.hip.cpp
+  )
+  ecbuild_list_exclude_pattern( LIST trans_src  REGEX .cu )
+  ecbuild_list_exclude_pattern( LIST trans_src  REGEX cublas_mod.F90 )
+  ecbuild_list_exclude_pattern( LIST trans_src  REGEX cuda_device_mod.F90 )
+  ecbuild_list_exclude_pattern( LIST trans_src  REGEX cuda_gemm_batched_mod.F90 )
+  ecbuild_list_exclude_pattern( LIST trans_src  REGEX tpm_fftc.F90 )
+else()
+  set( ECTRANS_GPU_HIP_LIBRARIES CUDA::cufft CUDA::cublas )
+  ecbuild_list_exclude_pattern( LIST trans_src  REGEX .hip.cpp )
+  ecbuild_list_exclude_pattern( LIST trans_src  REGEX hipblas_mod.F90 )
+  ecbuild_list_exclude_pattern( LIST trans_src  REGEX hip_device_mod.F90 )
+  ecbuild_list_exclude_pattern( LIST trans_src  REGEX hip_gemm_batched_mod.F90 )
+  ecbuild_list_exclude_pattern( LIST trans_src  REGEX tpm_ffth.F90 )
+endif()
 
 foreach( prec dp sp )
   if( HAVE_${prec} )
@@ -79,9 +95,11 @@ foreach( prec dp sp )
                            ${LAPACK_LIBRARIES} # we still have symbols in some files
                            $<${HAVE_ACC}:OpenACC::OpenACC_Fortran>
                            $<${HAVE_OMP}:OpenMP::OpenMP_Fortran>
-                           $<${HAVE_GPU_AWARE_MPI}:MPI::MPI_Fortran MPI::MPI_CXX>
+                          #  $<${HAVE_GPU_AWARE_MPI}:MPI::MPI_Fortran MPI::MPI_CXX>
+                          MPI::MPI_Fortran MPI::MPI_CXX
           PRIVATE_DEFINITIONS
 	                   ${GPU_OFFLOAD}GPU
+                     ${GPU_RUNTIME}GPU
 	                   $<${HAVE_GPU_AWARE_MPI}:USE_CUDA_AWARE_MPI_FT>
 			   $<${HAVE_GPU_REDUCED_MEMORY}:REDUCED_MEM>
         )

diff --git a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu
@@ -0,0 +1,167 @@
+#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
+#include "cufft.h"
+#include "stdio.h"
+    static const char *_cudaGetErrorEnum(cufftResult error)
+    {
+    switch (error)
+    {
+    case CUFFT_SUCCESS:
+    return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+    return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+    return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+    return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+    return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+    return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+    return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+    return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+    return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+    return "CUFFT_UNALIGNED_DATA";
+    }
+
+    return "<unknown>";
+    }
+
+    inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
+    {
+    if( CUFFT_SUCCESS != err) {
+    fprintf(stderr, "CUFFT error at 1\n");
+    fprintf(stderr, "CUFFT error in file '%s'\n",__FILE__);
+    fprintf(stderr, "CUFFT error at 2\n");
+    /*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/
+    fprintf(stderr, "CUFFT error at 3\n");
+    /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
+    _cudaGetErrorEnum(err)); \*/
+    fprintf(stderr, "CUFFT error %d: %s\nterminating!\n",err,_cudaGetErrorEnum(err)); \
+    cudaDeviceReset(); return; \
+    }
+    }
+
+
+static int allocatedWorkspace=0;
+static void* planWorkspace;
+static int planWorkspaceSize=100*1024*1024; //100MB
+
+extern "C"
+void
+create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, int *LOTp)
+{
+int ISIGN = *ISIGNp;
+int N = *Np;
+int LOT = *LOTp;
+
+cufftHandle plan;
+
+if (cudaDeviceSynchronize() != cudaSuccess){
+	fprintf(stderr, "Cuda error: Failed to synchronize\n");
+	return;	
+}
+
+
+// //create a single re-usable workspace
+// if(!allocatedWorkspace){
+//   allocatedWorkspace=1;
+//   //allocate plan workspace
+//   cudaMalloc(&planWorkspace,planWorkspaceSize);
+// }
+//
+// //disable auto allocation so we can re-use a single workspace (created above)
+//  cufftSetAutoAllocation(plan, false);
+
+int embed[1];
+int stride;
+int dist;
+
+#ifdef TRANS_SINGLE
+cufftType cufft_1 = CUFFT_R2C;
+cufftType cufft_2 = CUFFT_C2R;
+#else
+cufftType cufft_1 = CUFFT_D2Z;
+cufftType cufft_2 = CUFFT_Z2D;
+#endif
+
+embed[0] = 1;
+stride   = LOT;
+dist     = 1;
+
+cufftSafeCall(cufftCreate(&plan));
+
+//printf("CreatePlan cuFFT\n","N=",N);
+//printf("%s %d \n","plan=",plan);
+//printf("%s %d \n","LOT=",LOT);
+//printf("%s %d \n","ISIGN=",ISIGN);
+//printf("%s %d \n","Np=",*Np);
+
+if( ISIGN== -1 ){
+  cufftSafeCall(cufftPlanMany(&plan, 1, &N,
+                 embed, stride, dist, 
+                 embed, stride, dist, 
+                 cufft_1, LOT));
+  //cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT));
+}
+else if( ISIGN== 1){
+  cufftSafeCall(cufftPlanMany(&plan, 1, &N,
+                 embed, stride, dist, 
+                 embed, stride, dist, 
+                 cufft_2, LOT));
+  //cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT));
+}
+else {
+  abort();
+}
+
+// // use our reusaable work area for the plan
+// cufftSetWorkArea(plan,planWorkspace); 
+
+/*
+if( ISIGN== -1 ){
+  cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT));
+}
+else if( ISIGN== 1){
+  cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT));
+}
+else {
+  abort();
+}
+*/
+
+if (cudaDeviceSynchronize() != cudaSuccess){
+	fprintf(stderr, "Cuda error: Failed to synchronize\n");
+	return;	
+}
+
+*PLANp=plan;
+
+// // get size used by this plan
+// size_t workSize;
+// cufftGetSize(plan,&workSize);
+//
+// // exit if we don't have enough space for the work area in the re-usable workspace
+// if(workSize > planWorkspaceSize){
+//   printf("create_plan_fftc: plan workspace size not large enough - exiting\n");
+// exit(1);
+// }
+
+
+return;
+
+
+}
+