From 498d0e848f2faeb81cc041e53f7dd5e8522da29e Mon Sep 17 00:00:00 2001 From: Balthasar Reuter Date: Thu, 10 Aug 2023 16:34:35 +0200 Subject: [PATCH] WIP: Add CUDA version to HIP branch --- CMakeLists.txt | 49 +++- src/programs/ectrans-benchmark.F90 | 6 +- src/trans/gpu/CMakeLists.txt | 34 ++- .../external/fourier/create_plan_fftc.cu | 167 ++++++++++++ .../external/fourier/destroy_plan_fftc.cu | 77 ++++++ .../external/fourier/execute_plan_fftc.cu | 100 +++++++ .../algor/external/fourier/storage_fftc.cu | 28 ++ .../gpu/algor/module/cublasDgemmBatched.cu | 141 ++++++++++ .../gpu/algor/module/cublasSTCgemmBatched.cu | 107 ++++++++ .../gpu/algor/module/cublasSgemmBatched.cu | 125 +++++++++ .../module/cublasTCgemmBatched.actual.cu | 134 ++++++++++ src/trans/gpu/algor/module/cublas_mod.F90 | 158 +++++++++++ .../gpu/algor/module/cuda_device_mod.F90 | 40 +++ .../gpu/algor/module/hip_device_mod.F90~ | 40 --- src/trans/gpu/external/setup_trans.F90 | 12 +- .../gpu/internal/cuda_gemm_batched_mod.F90 | 16 +- src/trans/gpu/internal/ftdir_mod.F90 | 18 +- src/trans/gpu/internal/ftinv_mod.F90 | 19 +- .../gpu/internal/hip_gemm_batched_mod.F90 | 248 ++++++++++++++++++ src/trans/gpu/internal/ledir_mod.F90 | 42 +-- src/trans/gpu/internal/leinv_mod.F90 | 34 +-- src/trans/gpu/internal/set_resol_mod.F90 | 10 +- src/trans/gpu/internal/sufft_mod.F90 | 30 ++- src/trans/gpu/internal/tpm_fftc.F90 | 210 +++++++++++++++ src/trans/gpu/internal/tpm_ffth.F90 | 73 +++--- .../gpu/internal_reducedmem/ftdir_mod.F90 | 6 +- .../gpu/internal_reducedmem/ftinv_mod.F90 | 12 +- 27 files changed, 1769 insertions(+), 167 deletions(-) create mode 100644 src/trans/gpu/algor/external/fourier/create_plan_fftc.cu create mode 100644 src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu create mode 100644 src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu create mode 100644 src/trans/gpu/algor/external/fourier/storage_fftc.cu create mode 100644 src/trans/gpu/algor/module/cublasDgemmBatched.cu create mode 100644 src/trans/gpu/algor/module/cublasSTCgemmBatched.cu create mode 100644 src/trans/gpu/algor/module/cublasSgemmBatched.cu create mode 100644 src/trans/gpu/algor/module/cublasTCgemmBatched.actual.cu create mode 100644 src/trans/gpu/algor/module/cublas_mod.F90 create mode 100644 src/trans/gpu/algor/module/cuda_device_mod.F90 delete mode 100644 src/trans/gpu/algor/module/hip_device_mod.F90~ create mode 100755 src/trans/gpu/internal/hip_gemm_batched_mod.F90 create mode 100755 src/trans/gpu/internal/tpm_fftc.F90 diff --git a/CMakeLists.txt b/CMakeLists.txt index 51363714d..12fb560a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ ecbuild_find_package( NAME fiat REQUIRED ) ecbuild_add_option( FEATURE MPI DESCRIPTION "Support for MPI distributed memory parallelism" + REQUIRED_PACKAGES "MPI COMPONENTS Fortran C CXX" CONDITION fiat_HAVE_MPI ) ecbuild_add_option( FEATURE OMP @@ -35,11 +36,11 @@ ecbuild_add_option( FEATURE OMP REQUIRED_PACKAGES "OpenMP COMPONENTS Fortran" ) ecbuild_add_option( FEATURE DOUBLE_PRECISION - DEFAULT ON + DEFAULT ON DESCRIPTION "Support for Double Precision" ) ecbuild_add_option( FEATURE SINGLE_PRECISION - DEFAULT ON + DEFAULT ON DESCRIPTION "Support for Single Precision" ) if( HAVE_SINGLE_PRECISION ) @@ -74,18 +75,46 @@ ecbuild_add_option( FEATURE TRANSI CONDITION HAVE_DOUBLE_PRECISION AND HAVE_CPU ) set( HIP_REQUESTED OFF ) +set( CUDA_REQUESTED OFF ) if( ECTRANS_ENABLE_GPU OR (NOT DEFINED ECTRANS_ENABLE_GPU AND ENABLE_GPU)) - set( HIP_REQUESTED ON ) +# TODO: allow to switch between CUDA and 
HIP or automatically detect target platform + set( CUDA_REQUESTED ON ) +# set( HIP_REQUESTED ON ) endif() if( HIP_REQUESTED ) ectrans_find_hip() # sets "HAVE_HIP" -endif() +endif() + +if( CUDA_REQUESTED ) + enable_language( CUDA ) + set( HAVE_CUDA ON ) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 70 80) + endif() + find_package( CUDAToolkit ) +endif() ecbuild_add_option( FEATURE GPU DEFAULT OFF DESCRIPTION "Compile GPU version of ectrans (Requires OpenACC or sufficient OpenMP offloading support and MPI)" - CONDITION HAVE_HIP ) + CONDITION HAVE_HIP OR HAVE_CUDA ) + +if( ${CMAKE_VERSION} VERSION_LESS "3.25" AND (NOT DEFINED ENABLE_ACC OR ENABLE_ACC ) ) + # Incredibly inconvenient: FindOpenACC does _not_ set OpenACC_FOUND, only + # the language-specific components OpenACC_Fortran_FOUND and OpenACC_C_FOUND. + # This means, even internally CMake considers OpenACC as not found. + # (See eg get_property(... GLOBAL PROPERTY PACKAGES_NOT_FOUND)) + # Therefore, we search for OpenACC, set OpenACC_FOUND ourselves according to + # the result, and then, trigger a second find_package via ecbuild_add_option. + # This then conveniently takes the previously set OpenACC_FOUND into account + # and rectifies CMake's internal bookkeeping in the process. + # This has been fixed in CMake 3.25 + find_package( OpenACC ) + if( OpenACC_Fortran_FOUND ) + set( OpenACC_FOUND ON ) + endif() +endif() ecbuild_add_option( FEATURE ACC DEFAULT ON @@ -100,10 +129,18 @@ if( HAVE_GPU ) else() ecbuild_error("Could not enable GPU as OMP or ACC were not enabled") endif() + + if( HAVE_CUDA ) + set( GPU_RUNTIME "CUDA" ) + elseif( HAVE_HIP ) + set( GPU_RUNTIME "HIP" ) + else() + ecbuild_error("Could not enable GPU as CUDA or HIP were not found") + endif() endif() ecbuild_add_option( FEATURE GPU_AWARE_MPI - DEFAULT OFF + DEFAULT OFF CONDITION HAVE_GPU REQUIRED_PACKAGES "MPI COMPONENTS CXX Fortran" DESCRIPTION "Enable CUDA-aware MPI" ) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index b847f0d45..47f4ed6b9 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -63,7 +63,7 @@ program transform_test ! Default parameters integer(kind=jpim) :: nsmax = 79 ! Spectral truncation integer(kind=jpim) :: iters = 10 ! Number of iterations for transform test -integer(kind=jpim) :: nfld = 1 ! Number of scalar fields +integer(kind=jpim) :: nfld = 1 ! Number of scalar fields integer(kind=jpim) :: nlev = 1 ! Number of vertical levels integer(kind=jpim) :: nflevg @@ -608,6 +608,8 @@ program transform_test ! Do inverse transform !================================================================================================= + write(nout, *) 'inv_trans' + flush(nout) ztstep1(jstep) = timef() call gstats(4,0) if (lvordiv) then @@ -658,6 +660,8 @@ program transform_test ztstep2(jstep) = timef() + write(nout, *) 'dir_trans' + flush(nout) call gstats(5,0) if (lvordiv) then call dir_trans(kresol=1, kproma=nproma, & diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index 666bf563d..154109a88 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -10,7 +10,7 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC") - # Compile setup_trans with pinned memory to improve data movement performance. + # Compile setup_trans with pinned memory to improve data movement performance. 
ectrans_add_compile_options( SOURCES external/setup_trans.F90 #FLAGS "-gpu=pinned,deepcopy,fastmath,nordc") @@ -30,6 +30,8 @@ ecbuild_list_add_pattern( LIST trans_src QUIET ) +ecbuild_info( "${trans_src}") + ## for reduced memory option, replace source files if( HAVE_GPU_REDUCED_MEMORY ) ecbuild_list_add_pattern( LIST reducedmem_files @@ -50,12 +52,26 @@ ecbuild_list_exclude_pattern( LIST trans_src REGEX dilatation_mod.F90 ecbuild_info("warn: special compile flags ftdir_mod.F90") #endif() -ectrans_declare_hip_sources( SOURCES_GLOB - sharedmem/*.hip.cpp - algor/*.hip.cpp - internal/*.hip.cpp - external/*.hip.cpp -) +if( HIP_FOUND ) + ectrans_declare_hip_sources( SOURCES_GLOB + sharedmem/*.hip.cpp + algor/*.hip.cpp + internal/*.hip.cpp + external/*.hip.cpp + ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX .cu ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX cublas_mod.F90 ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX cuda_device_mod.F90 ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX cuda_gemm_batched_mod.F90 ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX tpm_fftc.F90 ) +else() + set( ECTRANS_GPU_HIP_LIBRARIES CUDA::cufft CUDA::cublas ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX .hip.cpp ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX hipblas_mod.F90 ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX hip_device_mod.F90 ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX hip_gemm_batched_mod.F90 ) + ecbuild_list_exclude_pattern( LIST trans_src REGEX tpm_ffth.F90 ) +endif() foreach( prec dp sp ) if( HAVE_${prec} ) @@ -79,9 +95,11 @@ foreach( prec dp sp ) ${LAPACK_LIBRARIES} # we still have symbols in some files $<${HAVE_ACC}:OpenACC::OpenACC_Fortran> $<${HAVE_OMP}:OpenMP::OpenMP_Fortran> - $<${HAVE_GPU_AWARE_MPI}:MPI::MPI_Fortran MPI::MPI_CXX> + # $<${HAVE_GPU_AWARE_MPI}:MPI::MPI_Fortran MPI::MPI_CXX> + MPI::MPI_Fortran MPI::MPI_CXX PRIVATE_DEFINITIONS ${GPU_OFFLOAD}GPU + ${GPU_RUNTIME}GPU $<${HAVE_GPU_AWARE_MPI}:USE_CUDA_AWARE_MPI_FT> $<${HAVE_GPU_REDUCED_MEMORY}:REDUCED_MEM> ) diff --git a/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu new file mode 100644 index 000000000..b45e329de --- /dev/null +++ b/src/trans/gpu/algor/external/fourier/create_plan_fftc.cu @@ -0,0 +1,167 @@ +#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) +#include "cufft.h" +#include "stdio.h" + static const char *_cudaGetErrorEnum(cufftResult error) + { + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + } + + return ""; + } + + inline void __cufftSafeCall(cufftResult err, const char *file, const int line) + { + if( CUFFT_SUCCESS != err) { + fprintf(stderr, "CUFFT error at 1\n"); + fprintf(stderr, "CUFFT error in file '%s'\n",__FILE__); + fprintf(stderr, "CUFFT error at 2\n"); + /*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/ + fprintf(stderr, "CUFFT error at 3\n"); + /*fprintf(stderr, "CUFFT error in file 
'%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ + _cudaGetErrorEnum(err)); \*/ + fprintf(stderr, "CUFFT error %d: %s\nterminating!\n",err,_cudaGetErrorEnum(err)); \ + cudaDeviceReset(); return; \ + } + } + + +static int allocatedWorkspace=0; +static void* planWorkspace; +static int planWorkspaceSize=100*1024*1024; //100MB + +extern "C" +void +create_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, int *Np, int *LOTp) +{ +int ISIGN = *ISIGNp; +int N = *Np; +int LOT = *LOTp; + +cufftHandle plan; + +if (cudaDeviceSynchronize() != cudaSuccess){ + fprintf(stderr, "Cuda error: Failed to synchronize\n"); + return; +} + + +// //create a single re-usable workspace +// if(!allocatedWorkspace){ +// allocatedWorkspace=1; +// //allocate plan workspace +// cudaMalloc(&planWorkspace,planWorkspaceSize); +// } +// +// //disable auto allocation so we can re-use a single workspace (created above) +// cufftSetAutoAllocation(plan, false); + +int embed[1]; +int stride; +int dist; + +#ifdef TRANS_SINGLE +cufftType cufft_1 = CUFFT_R2C; +cufftType cufft_2 = CUFFT_C2R; +#else +cufftType cufft_1 = CUFFT_D2Z; +cufftType cufft_2 = CUFFT_Z2D; +#endif + +embed[0] = 1; +stride = LOT; +dist = 1; + +cufftSafeCall(cufftCreate(&plan)); + +//printf("CreatePlan cuFFT\n","N=",N); +//printf("%s %d \n","plan=",plan); +//printf("%s %d \n","LOT=",LOT); +//printf("%s %d \n","ISIGN=",ISIGN); +//printf("%s %d \n","Np=",*Np); + +if( ISIGN== -1 ){ + cufftSafeCall(cufftPlanMany(&plan, 1, &N, + embed, stride, dist, + embed, stride, dist, + cufft_1, LOT)); + //cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT)); +} +else if( ISIGN== 1){ + cufftSafeCall(cufftPlanMany(&plan, 1, &N, + embed, stride, dist, + embed, stride, dist, + cufft_2, LOT)); + //cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT)); +} +else { + abort(); +} + +// // use our reusaable work area for the plan +// cufftSetWorkArea(plan,planWorkspace); + +/* +if( ISIGN== -1 ){ + cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_D2Z, LOT)); +} +else if( ISIGN== 1){ + cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_Z2D, LOT)); +} +else { + abort(); +} +*/ + +if (cudaDeviceSynchronize() != cudaSuccess){ + fprintf(stderr, "Cuda error: Failed to synchronize\n"); + return; +} + +*PLANp=plan; + +// // get size used by this plan +// size_t workSize; +// cufftGetSize(plan,&workSize); +// +// // exit if we don't have enough space for the work area in the re-usable workspace +// if(workSize > planWorkspaceSize){ +// printf("create_plan_fftc: plan workspace size not large enough - exiting\n"); +// exit(1); +// } + + +return; + + +} + diff --git a/src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu new file mode 100644 index 000000000..d0dd94201 --- /dev/null +++ b/src/trans/gpu/algor/external/fourier/destroy_plan_fftc.cu @@ -0,0 +1,77 @@ +#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) +#include "cufft.h" +#include "stdio.h" + static const char *_cudaGetErrorEnum(cufftResult error) + { + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case 
CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + } + + return ""; + } + + inline void __cufftSafeCall(cufftResult err, const char *file, const int line) + { + if( CUFFT_SUCCESS != err) { + fprintf(stderr, "CUFFT error at 1\n"); + fprintf(stderr, "CUFFT error in file '%s'\n",__FILE__); + fprintf(stderr, "CUFFT error at 2\n"); + /*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/ + fprintf(stderr, "CUFFT error at 3\n"); + /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ + _cudaGetErrorEnum(err)); \*/ + fprintf(stderr, "CUFFT error %d: %s\nterminating!\n",err,_cudaGetErrorEnum(err)); \ + cudaDeviceReset(); return; \ + } + } + +extern "C" +void +destroy_plan_fftc_(cufftHandle *PLANp) +{ +cufftHandle plan = *PLANp; + +if (cudaDeviceSynchronize() != cudaSuccess){ + fprintf(stderr, "Cuda error: Failed to synchronize\n"); + return; +} + +cufftSafeCall(cufftDestroy(plan)); + +if (cudaDeviceSynchronize() != cudaSuccess){ + fprintf(stderr, "Cuda error: Failed to synchronize\n"); + return; +} + + +} + diff --git a/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu new file mode 100644 index 000000000..51a069705 --- /dev/null +++ b/src/trans/gpu/algor/external/fourier/execute_plan_fftc.cu @@ -0,0 +1,100 @@ +#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) +#include "cufft.h" +#include "stdio.h" + static const char *_cudaGetErrorEnum(cufftResult error) + { + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + } + + return ""; + } + + inline void __cufftSafeCall(cufftResult err, const char *file, const int line) + { + if( CUFFT_SUCCESS != err) { + fprintf(stderr, "CUFFT error at 1\n"); + fprintf(stderr, "CUFFT error in file '%s'\n",__FILE__); + fprintf(stderr, "CUFFT error at 2\n"); + /*fprintf(stderr, "CUFFT error line '%s'\n",__LINE__);*/ + fprintf(stderr, "CUFFT error at 3\n"); + /*fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ + _cudaGetErrorEnum(err)); \*/ + fprintf(stderr, "CUFFT error %d: %s\nterminating!\n",err,_cudaGetErrorEnum(err)); \ + cudaDeviceReset(); return; \ + } + } + +extern "C" +void +#ifdef TRANS_SINGLE +execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftComplex *data_in, cufftComplex *data_out) +#else +execute_plan_fftc_(cufftHandle *PLANp, int *ISIGNp, cufftDoubleComplex *data_in, cufftDoubleComplex *data_out) +#endif +{ +cufftHandle plan = *PLANp; +int ISIGN = *ISIGNp; + +/*if (cudaDeviceSynchronize() != cudaSuccess){ + fprintf(stderr, "Cuda error: Failed to synchronize\n"); + return; +}*/ + +if( ISIGN== -1 ){ + #ifdef TRANS_SINGLE + cufftSafeCall(cufftExecR2C(plan, (cufftReal*)data_in, data_out)); + #else + cufftSafeCall(cufftExecD2Z(plan, (cufftDoubleReal*)data_in, data_out)); + #endif +} +else if( ISIGN== 1){ + #ifdef TRANS_SINGLE + 
cufftSafeCall(cufftExecC2R(plan, data_in, (cufftReal*)data_out)); + #else + cufftSafeCall(cufftExecZ2D(plan, data_in, (cufftDoubleReal*)data_out)); + #endif +} +else { + abort(); +} + +// cudaDeviceSynchronize(); + +//if (cudaDeviceSynchronize() != cudaSuccess){ +// fprintf(stderr, "Cuda error: Failed to synchronize\n"); +// return; +//} + + +} + diff --git a/src/trans/gpu/algor/external/fourier/storage_fftc.cu b/src/trans/gpu/algor/external/fourier/storage_fftc.cu new file mode 100644 index 000000000..7badd5fab --- /dev/null +++ b/src/trans/gpu/algor/external/fourier/storage_fftc.cu @@ -0,0 +1,28 @@ +#include "cufft.h" +#include "stdio.h" +extern "C" +cufftDoubleComplex *create_storage_(int *Np) +{ + int N = *Np; + cufftDoubleComplex *data; + /*cudaMalloc((void**)&data,sizeof(cufftDoubleComplex)*N); + if (cudaGetLastError() != cudaSuccess){ + fprintf(stderr, "Cuda error: Failed to allocate\n"); + return 0; + } + return data;*/ + printf("%s %d \n","sizeof(cufftDoubleComplex)=",sizeof(cufftDoubleComplex)); + printf("%s %d \n","N=",N); + if (cudaMalloc(&data, sizeof(cufftDoubleComplex)*N) == cudaSuccess){ + printf("%s %X \n","data ",data); + return data; + } + fprintf(stderr, "Cuda error: Failed to allocate\n"); + return 0; +} + +extern "C" +void destroy_storage_(int *ptr) +{ + cudaFree(ptr); +} diff --git a/src/trans/gpu/algor/module/cublasDgemmBatched.cu b/src/trans/gpu/algor/module/cublasDgemmBatched.cu new file mode 100644 index 000000000..98fe25c2f --- /dev/null +++ b/src/trans/gpu/algor/module/cublasDgemmBatched.cu @@ -0,0 +1,141 @@ +// +// Wrapper for cublasDgemm function. +// +// Alan Gray, NVIDIA +// + +#include +#include "cublas_v2.h" + + +bool alreadyAllocated_dgemm=false; +bool alreadyAllocated_dgemm_handle=false; + +double **d_Aarray; +double **d_Barray; +double **d_Carray; + +double **Aarray; +double **Barray; +double **Carray; + +cublasHandle_t handle_dgemm; + +extern "C" void cublasDgemmBatched_wrapper (char transa, char transb, int m, int n,int k, double alpha, const double *A, int lda, int tda, const double *B, int ldb, int tdb, double beta, double *C, int ldc, int tdc, int batchCount) +{ + + + // printf("CUBLAS m=%d,n=%d,k=%d,batchcount=%d\n",m,n,k,batchCount); + cublasStatus_t stat; + + + cublasOperation_t op_t1=CUBLAS_OP_N, op_t2=CUBLAS_OP_N; + + if (transa=='T' || transa=='t') + op_t1=CUBLAS_OP_T; + + if (transb=='T' || transb=='t') + op_t2=CUBLAS_OP_T; + + + //double **Aarray = (double**) malloc(batchCount*sizeof(double*)); + //double **Barray = (double**) malloc(batchCount*sizeof(double*)); + //double **Carray = (double**) malloc(batchCount*sizeof(double*)); + + + + if (!alreadyAllocated_dgemm_handle){ + stat = cublasCreate(&handle_dgemm); + if (stat != CUBLAS_STATUS_SUCCESS) { + printf ("CUBLAS initialization failed\n"); + //return EXIT_FAILURE; + } + } + alreadyAllocated_dgemm_handle=true; + + if (!alreadyAllocated_dgemm){ + cudaError_t errcm1 = cudaMallocHost(&Aarray,batchCount*sizeof(double*)); + cudaError_t errcm2 = cudaMallocHost(&Barray,batchCount*sizeof(double*)); + cudaError_t errcm3 = cudaMallocHost(&Carray,batchCount*sizeof(double*)); + + cudaError_t errcm4 = cudaMalloc(&d_Aarray,batchCount*sizeof(double*)); + cudaError_t errcm5 = cudaMalloc(&d_Barray,batchCount*sizeof(double*)); + cudaError_t errcm6 = cudaMalloc(&d_Carray,batchCount*sizeof(double*)); + } + alreadyAllocated_dgemm=true; + + int i; + for(i=0;i +#include "cublas_v2.h" + +bool alreadyAllocated_stcgemm = false; +bool alreadyAllocated_stcgemm_handle = false; + +half **d_Aarray_stcgemm; +half 
**d_Barray_stcgemm; +float **d_Carray_stcgemm; + +half **Aarray_stcgemm; +half **Barray_stcgemm; +float **Carray_stcgemm; + +cublasHandle_t handle_stcgemm; + +extern "C" void cublasSTCgemmBatched_wrapper( + char transa, char transb, + int m, int n, int k, + float alpha, + const half *A, int lda, int tda, + const half *B, int ldb, int tdb, + float beta, + float *C, int ldc, int tdc, + int batchCount +){ + // Define CUBLAS operation handles + cublasOperation_t op_t1, op_t2; + + // Decide whether to transpose matrices or not + op_t1 = (transa == 'T' || transa == 't') ? CUBLAS_OP_T : CUBLAS_OP_N; + op_t2 = (transb == 'T' || transb == 't') ? CUBLAS_OP_T : CUBLAS_OP_N; + + // Initialize CUBLAS handle + if (!alreadyAllocated_stcgemm_handle) { + cublasCreate(&handle_stcgemm); + alreadyAllocated_stcgemm_handle = true; + } + + // Allocate host arrays + if (!alreadyAllocated_stcgemm) { + cudaMallocHost(&Aarray_stcgemm,batchCount*sizeof(half*)); + cudaMallocHost(&Barray_stcgemm,batchCount*sizeof(half*)); + cudaMallocHost(&Carray_stcgemm,batchCount*sizeof(float*)); + alreadyAllocated_stcgemm = true; + } + + // Allocate device arrays + cudaMalloc(&d_Aarray_stcgemm, batchCount*sizeof(half*)); + cudaMalloc(&d_Barray_stcgemm, batchCount*sizeof(half*)); + cudaMalloc(&d_Carray_stcgemm, batchCount*sizeof(float*)); + + // Transfer data from input arrays to host arrays + for (int i = 0; i < batchCount; i++) { + Aarray_stcgemm[i] = (half*) &(A[i*lda*tda]); + Barray_stcgemm[i] = (half*) &(B[i*ldb*tdb]); + Carray_stcgemm[i] = (float*) &(C[i*ldc*tdc]); + } + + // Transfer data from host arrays to device arrays + cudaMemcpy(d_Aarray_stcgemm, Aarray_stcgemm, batchCount*sizeof(half*), cudaMemcpyHostToDevice); + cudaMemcpy(d_Barray_stcgemm, Barray_stcgemm, batchCount*sizeof(half*), cudaMemcpyHostToDevice); + cudaMemcpy(d_Carray_stcgemm, Carray_stcgemm, batchCount*sizeof(float*), cudaMemcpyHostToDevice); + + // Perform batched SGEMM + cublasGemmBatchedEx(handle_stcgemm, + op_t1, op_t2, + m, n, k, + (const void*)&alpha, + (const void**)d_Aarray_stcgemm, CUDA_R_16F, lda, + (const void**)d_Barray_stcgemm, CUDA_R_16F, ldb, + (const void*)&beta, + (void**)d_Carray_stcgemm, CUDA_R_32F, ldc, + batchCount, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP + ); + + cudaDeviceSynchronize(); + + // Free device arrays + cudaFree(d_Aarray_stcgemm); + cudaFree(d_Barray_stcgemm); + cudaFree(d_Carray_stcgemm); +} + +extern "C" void cublasSTCgemmBatched_finalize() { + if (alreadyAllocated_stcgemm) { + cudaFree(Aarray_stcgemm); + cudaFree(Barray_stcgemm); + cudaFree(Carray_stcgemm); + + cudaFree(d_Aarray_stcgemm); + cudaFree(d_Barray_stcgemm); + cudaFree(d_Carray_stcgemm); + } + + if (alreadyAllocated_stcgemm_handle) { + cublasDestroy(handle_stcgemm); + } +} diff --git a/src/trans/gpu/algor/module/cublasSgemmBatched.cu b/src/trans/gpu/algor/module/cublasSgemmBatched.cu new file mode 100644 index 000000000..61c8384ac --- /dev/null +++ b/src/trans/gpu/algor/module/cublasSgemmBatched.cu @@ -0,0 +1,125 @@ +// +// Wrapper for cublasSgemm function. 
+// +// Alan Gray, NVIDIA +// + +#include +#include "cublas_v2.h" + + +bool alreadyAllocated_sgemm=false; +bool alreadyAllocated_sgemm_handle=false; + +float **d_Aarray_sgemm; +float **d_Barray_sgemm; +float **d_Carray_sgemm; + +float **Aarray_sgemm; +float **Barray_sgemm; +float **Carray_sgemm; + +cublasHandle_t handle_sgemm; + +extern "C" void cublasSgemmBatched_wrapper (char transa, char transb, int m, int n,int k, float alpha, const float *A, int lda, int tda, const float *B, int ldb, int tdb, float beta, float *C, int ldc, int tdc, int batchCount) +{ + + // printf("CUBLAS m=%d,n=%d,k=%d,batchcount=%d\n",m,n,k,batchCount); + + cublasOperation_t op_t1=CUBLAS_OP_N, op_t2=CUBLAS_OP_N; + + if (transa=='T' || transa=='t') + op_t1=CUBLAS_OP_T; + + if (transb=='T' || transb=='t') + op_t2=CUBLAS_OP_T; + + //float **Aarray_sgemm = (float**) malloc(batchCount*sizeof(float*)); + //float **Barray_sgemm = (float**) malloc(batchCount*sizeof(float*)); + //float **Carray_sgemm = (float**) malloc(batchCount*sizeof(float*)); + + if (!alreadyAllocated_sgemm_handle){ + cublasCreate(&handle_sgemm); + alreadyAllocated_sgemm_handle=true; + } + + if (!alreadyAllocated_sgemm){ + cudaMallocHost(&Aarray_sgemm,batchCount*sizeof(float*)); + cudaMallocHost(&Barray_sgemm,batchCount*sizeof(float*)); + cudaMallocHost(&Carray_sgemm,batchCount*sizeof(float*)); + alreadyAllocated_sgemm=true; + } + + cudaMalloc(&d_Aarray_sgemm,batchCount*sizeof(float*)); + cudaMalloc(&d_Barray_sgemm,batchCount*sizeof(float*)); + cudaMalloc(&d_Carray_sgemm,batchCount*sizeof(float*)); + + int i; + for(i=0;i +#include "cublas_v2.h" + + +bool alreadyAllocated_tcgemm=false; +bool alreadyAllocated_tcgemm_handle=false; + +// Device arrays +//half **d_Aarray_h; +//half **d_Barray_h; +float **d_Aarray_h; +float **d_Barray_h; +float **d_Aarray_tcgemm; +float **d_Barray_tcgemm; +float **d_Carray_tcgemm; + +// Host arrays +float **Aarray_tcgemm; +float **Barray_tcgemm; +float **Carray_tcgemm; + +cublasHandle_t handle_tcgemm; + +// Converts from single-precision to half-precision (CUDA kernel) +__global__ void float2half(half *out, const float *in, int n) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < n) { + out[idx] = __float2half(in[idx]); + } +} + + + +extern "C" void cublasTCgemmBatched_wrapper(char transa, char transb, + int m, int n, int k, + float alpha, + const float *A, int lda, int tda, + const float *B, int ldb, int tdb, + float beta, + float *C, int ldc, int tdc, + int batchCount) +{ + fprintf(stderr, "Using Tensor Core\n"); + + // Set transpose operation parameters + cublasOperation_t op_t1 = (transa == 'T' || transa == 't') ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op_t2 = (transb == 'T' || transb == 't') ? 
CUBLAS_OP_T : CUBLAS_OP_N; + + if (!alreadyAllocated_tcgemm_handle) { + cublasCreate(&handle_tcgemm); + alreadyAllocated_tcgemm_handle=true; + } + + //cublasSetMathMode(handle_tcgemm, CUBLAS_TENSOR_OP_MATH); + + if (!alreadyAllocated_tcgemm) { + // Allocate host arrays specifically for host->device transfer + cudaMallocHost(&Aarray_tcgemm, batchCount*sizeof(float*)); + cudaMallocHost(&Barray_tcgemm, batchCount*sizeof(float*)); + cudaMallocHost(&Carray_tcgemm, batchCount*sizeof(float*)); + alreadyAllocated_tcgemm=true; + } + + // Allocate device arrays + cudaMalloc(&d_Aarray_h, batchCount*sizeof(float*)); + cudaMalloc(&d_Barray_h, batchCount*sizeof(float*)); + cudaMalloc(&d_Aarray_tcgemm, batchCount*sizeof(float*)); + cudaMalloc(&d_Barray_tcgemm, batchCount*sizeof(float*)); + cudaMalloc(&d_Carray_tcgemm, batchCount*sizeof(float*)); + + // Copy data from dummy arrays to host arrays + for (int i = 0; i < batchCount; i++) { + Aarray_tcgemm[i] = (float*) &(A[i*lda*tda]); + Barray_tcgemm[i] = (float*) &(B[i*ldb*tdb]); + Carray_tcgemm[i] = (float*) &(C[i*ldc*tdc]); + } + + // Transfer arrays from host to device + cudaMemcpy(d_Aarray_tcgemm, Aarray_tcgemm, batchCount*sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(d_Barray_tcgemm, Barray_tcgemm, batchCount*sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(d_Carray_tcgemm, Carray_tcgemm, batchCount*sizeof(float*), cudaMemcpyHostToDevice); + +// // Convert arrays to half-precision +// for (int i = 0; i < batchCount; i++) { +// float2half<<<(int)(m*k/256) + 1, 256 >>>(d_Aarray_h[i], d_Aarray_tcgemm[i], batchCount); +// float2half<<<(int)(k*n/256) + 1, 256 >>>(d_Barray_h[i], d_Barray_tcgemm[i], batchCount); +// } + + // Perform Tensor Core batched GEMM + cublasGemmBatchedEx(handle_tcgemm, op_t1, op_t2, + m, n, k, + (const void *)&alpha, + (const void **)d_Aarray_h, CUDA_R_32F, lda, + (const void **)d_Barray_h, CUDA_R_32F, ldb, + (const void *)&beta, + (void **)d_Carray_tcgemm, CUDA_R_32F, ldc, + batchCount, + CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); + + cudaDeviceSynchronize(); + + cudaFree(d_Aarray_h); + cudaFree(d_Barray_h); + cudaFree(d_Aarray_tcgemm); + cudaFree(d_Barray_tcgemm); + cudaFree(d_Carray_tcgemm); +} + +extern "C" void cublasTCgemmBatched_finalize() +{ + if (alreadyAllocated_tcgemm) { + cudaFree(Aarray_tcgemm); + cudaFree(Barray_tcgemm); + cudaFree(Carray_tcgemm); + + cudaFree(d_Aarray_h); + cudaFree(d_Barray_h); + cudaFree(d_Aarray_tcgemm); + cudaFree(d_Barray_tcgemm); + cudaFree(d_Carray_tcgemm); + } + + if (alreadyAllocated_tcgemm_handle) { + cublasDestroy(handle_tcgemm); + } +} + diff --git a/src/trans/gpu/algor/module/cublas_mod.F90 b/src/trans/gpu/algor/module/cublas_mod.F90 new file mode 100644 index 000000000..e69a00e16 --- /dev/null +++ b/src/trans/gpu/algor/module/cublas_mod.F90 @@ -0,0 +1,158 @@ +MODULE CUBLAS_MOD +! +! Define the interfaces to the NVIDIA C code +! +interface cuda_gemm +! +! void cublasSgemm (char transa, char transb, int m, int n, +! int k, float alpha, const float *A, int lda, +! const float *B, int ldb, float beta, float *C, int ldc) +! +subroutine cuda_sgemm(cta, ctb, m, n, k,& +alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasSgemm') +use iso_c_binding +character(1,c_char),value :: cta, ctb +integer(c_int),value :: m,n,k,lda,ldb,ldc +real(c_float),value :: alpha,beta +real(c_float), dimension(lda,*) :: A +real(c_float), dimension(ldb,*) :: B +real(c_float), dimension(ldc,*) :: C +end subroutine cuda_sgemm + +! +! void cublasDgemm (char transa, char transb, int m, int n, +! 
int k, double alpha, const double *A, int lda, +! const double *B, int ldb, double beta, double *C, int ldc) +! +subroutine cuda_dgemm(cta, ctb, m, n, k,& +alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasDgemm') +use iso_c_binding +character(1,c_char),value :: cta, ctb +integer(c_int),value :: m,n,k,lda,ldb,ldc +real(c_double),value :: alpha,beta +real(c_double), dimension(lda,*) :: A +real(c_double), dimension(ldb,*) :: B +real(c_double), dimension(ldc,*) :: C +end subroutine cuda_dgemm +end interface + + +INTERFACE + SUBROUTINE CUDA_DGEMM_BATCHED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT & + &) BIND(C, NAME='cublasDgemmBatched_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + REAL(C_DOUBLE), VALUE :: ALPHA,BETA + REAL(C_DOUBLE), DIMENSION(LDA,*) :: A + REAL(C_DOUBLE), DIMENSION(LDB,*) :: B + REAL(C_DOUBLE), DIMENSION(LDC,*) :: C + END SUBROUTINE CUDA_DGEMM_BATCHED + + SUBROUTINE CUDA_DGEMM_STRIDED_BATCHED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT & + &) BIND(C, NAME='cublasDgemmStridedBatched_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, BATCHCOUNT + INTEGER(C_LONG_LONG), VALUE :: TDA,TDB,TDC + REAL(C_DOUBLE), VALUE :: ALPHA, BETA + REAL(C_DOUBLE), DIMENSION(LDA,*) :: A + REAL(C_DOUBLE), DIMENSION(LDB,*) :: B + REAL(C_DOUBLE), DIMENSION(LDC,*) :: C + END SUBROUTINE CUDA_DGEMM_STRIDED_BATCHED + + subroutine cuda_dgemm_batched_finalize() bind(C,name='cublasDgemmBatched_finalize') + end subroutine cuda_dgemm_batched_finalize + +END INTERFACE + +INTERFACE + + SUBROUTINE CUDA_SGEMM_BATCHED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT & + &) BIND(C, NAME='cublasSgemmBatched_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + REAL(C_FLOAT), VALUE :: ALPHA, BETA + REAL(C_FLOAT), DIMENSION(LDA,*) :: A + REAL(C_FLOAT), DIMENSION(LDB,*) :: B + REAL(C_FLOAT), DIMENSION(LDC,*) :: C + END SUBROUTINE CUDA_SGEMM_BATCHED +!!END INTERFACE + +!!INTERFACE + SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT & + &) BIND(C, NAME='cublasSgemmStridedBatched_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, BATCHCOUNT + INTEGER(C_LONG_LONG), VALUE :: TDA,TDB,TDC + REAL(C_FLOAT), VALUE :: ALPHA, BETA + REAL(C_FLOAT), DIMENSION(LDA,*) :: A + REAL(C_FLOAT), DIMENSION(LDB,*) :: B + REAL(C_FLOAT), DIMENSION(LDC,*) :: C + END SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED + + subroutine cuda_sgemm_batched_finalize() bind(C,name='cublasSgemmBatched_finalize') + end subroutine cuda_sgemm_batched_finalize + + +END INTERFACE + +INTERFACE + SUBROUTINE CUDA_STCGEMM_BATCHED(& + & CTA, CTB, & + & M, N, K, & + & ALPHA, & + & A, LDA, TDA, & + & B, LDB, TDB, & + & BETA, & + & C, LDC, TDC, & + & BATCHCOUNT & + &) BIND(C, NAME='cublasSTCgemmBatched_wrapper') + USE ISO_C_BINDING + CHARACTER(1,C_CHAR), VALUE :: CTA, CTB + INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT + REAL(C_FLOAT), VALUE :: 
ALPHA, BETA + REAL(2), DIMENSION(LDA,*) :: A + REAL(2), DIMENSION(LDB,*) :: B + REAL(C_FLOAT), DIMENSION(LDC,*) :: C + END SUBROUTINE CUDA_STCGEMM_BATCHED +END INTERFACE + + + + +END MODULE CUBLAS_MOD diff --git a/src/trans/gpu/algor/module/cuda_device_mod.F90 b/src/trans/gpu/algor/module/cuda_device_mod.F90 new file mode 100644 index 000000000..f710b687d --- /dev/null +++ b/src/trans/gpu/algor/module/cuda_device_mod.F90 @@ -0,0 +1,40 @@ +module cuda_device_mod + +interface cuda_sync + +integer function cuda_synchronize() bind(C,name='cudaDeviceSynchronize') +use iso_c_binding +end function cuda_synchronize + +integer function cuda_stream_synchronize(stream) bind(C,name='cudaStreamSynchronize') +use iso_c_binding +type(c_ptr) :: stream +end function cuda_stream_synchronize + +integer function cuda_stream_destroy(stream) bind(C,name='cudaStreamDestroy') +use iso_c_binding +type(c_ptr) :: stream +end function cuda_stream_destroy + +end interface cuda_sync + +interface cuda_device + +integer function cuda_SetDevice(devnum) bind(C,name='cudaSetDevice') +use iso_c_binding +integer(c_int),value :: devnum +end function cuda_SetDevice + +integer function cuda_GetDevice(devnum) bind(C,name='cudaGetDevice') +use iso_c_binding +integer(c_int) :: devnum +end function cuda_GetDevice + +integer function cuda_GetDeviceCount(devnum) bind(C,name='cudaGetDeviceCount') +use iso_c_binding +integer(c_int) :: devnum +end function cuda_GetDeviceCount + +end interface cuda_device + +end module cuda_device_mod diff --git a/src/trans/gpu/algor/module/hip_device_mod.F90~ b/src/trans/gpu/algor/module/hip_device_mod.F90~ deleted file mode 100644 index bee278412..000000000 --- a/src/trans/gpu/algor/module/hip_device_mod.F90~ +++ /dev/null @@ -1,40 +0,0 @@ -module hip_device_mod - -interface hip_sync - -integer function hip_synchronize() bind(C,name='hipDeviceSynchronize') -use iso_c_binding -end function hip_synchronize - -integer function hip_stream_synchronize(stream) bind(C,name='hipStreamSynchronize') -use iso_c_binding -type(c_ptr) :: stream -end function hip_stream_synchronize - -integer function hip_stream_destroy(stream) bind(C,name='hipStreamDestroy') -use iso_c_binding -type(c_ptr) :: stream -end function hip_stream_destroy - -end interface hip_sync - -interface hip_device - -integer function hip_SetDevice(devnum) bind(C,name='hipSetDevice') -use iso_c_binding -integer(c_int),value :: devnum -end function hip_SetDevice - -integer function hip_GetDevice(devnum) bind(C,name='hipGetDevice') -use iso_c_binding -integer(c_int) :: devnum -end function hip_GetDevice - -integer function hip_GetDeviceCount(devnum) bind(C,name='hipGetDeviceCount') -use iso_c_binding -integer(c_int) :: devnum -end function hip_GetDeviceCount - -end interface hip_device - -end module hip_device_mod diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index b46e94c3b..559b1c859 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. -! +! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. ! In applying this licence, ECMWF does not waive the privileges and immunities @@ -119,7 +119,11 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& & ZAS0,DZCST0,KMLOC0 ! 
IZBA,IZCAT USE TPM_FFT ,ONLY : T, FFT_RESOL -USE TPM_FFTH ,ONLY : TC, FFTH_RESOL +#ifdef HIPGPU +USE TPM_FFTH ,ONLY : TC, GPU_FFT_RESOL => FFTH_RESOL +#elif defined(CUDAGPU) +USE TPM_FFTC ,ONLY : TC, GPU_FFT_RESOL => FFTC_RESOL +#endif USE TPM_FLT #ifndef REDUCED_MEM USE TPM_TRANS ,ONLY : FOUBUF_IN, FOUBUF, ZGTF, ZAVE, ZMINGL, ZMAXGL, ZMINGPN, ZMAXGPN @@ -216,7 +220,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ALLOCATE(GEOM_RESOL(NMAX_RESOL)) ALLOCATE(DISTR_RESOL(NMAX_RESOL)) ALLOCATE(FFT_RESOL(NMAX_RESOL)) - ALLOCATE(FFTH_RESOL(NMAX_RESOL)) + ALLOCATE(GPU_FFT_RESOL(NMAX_RESOL)) ALLOCATE(FLT_RESOL(NMAX_RESOL)) ALLOCATE(CTL_RESOL(NMAX_RESOL)) GEOM_RESOL(:)%LAM=.FALSE. @@ -633,7 +637,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ZAS(:,:,:) = 0._JPRBT DO JMLOC=1,D%NUMP - KM = D%MYMS(JMLOC) + KM = D%MYMS(JMLOC) KDGLU = MIN(R%NDGNH,G%NDGLU(KM)) ILA = (R%NSMAX-KM+2)/2 diff --git a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 index 04db087cd..8032a1c28 100755 --- a/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 +++ b/src/trans/gpu/internal/cuda_gemm_batched_mod.F90 @@ -1,5 +1,5 @@ MODULE CUDA_GEMM_BATCHED_MOD - USE HIPBLAS_MOD + USE CUBLAS_MOD USE PARKIND1, ONLY: JPRD, JPRM, JPIM, JPIB IMPLICIT NONE @@ -14,7 +14,7 @@ MODULE CUDA_GEMM_BATCHED_MOD MODULE PROCEDURE CUDA_DGEMM_BATCHED_1D_OVERLOAD MODULE PROCEDURE CUDA_SGEMM_BATCHED_1D_OVERLOAD MODULE PROCEDURE CUDA_SGEMM_STRIDED_BATCHED_1D_OVERLOAD - END INTERFACE CUDA_GEMM_BATCHED + END INTERFACE CUDA_GEMM_BATCHED CONTAINS SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD( & @@ -44,7 +44,7 @@ SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT -CALL HIP_DGEMM_BATCHED( & +CALL CUDA_DGEMM_BATCHED( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & @@ -82,7 +82,7 @@ SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT -CALL HIP_SGEMM_BATCHED( & +CALL CUDA_SGEMM_BATCHED( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & @@ -120,7 +120,7 @@ SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_OVERLOAD( & INTEGER(KIND=JPIB) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT - CALL HIP_SGEMM_STRIDED_BATCHED( & + CALL CUDA_SGEMM_STRIDED_BATCHED( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & @@ -158,7 +158,7 @@ SUBROUTINE CUDA_DGEMM_BATCHED_1D_OVERLOAD( & INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT - CALL HIP_DGEMM_BATCHED( & + CALL CUDA_DGEMM_BATCHED( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & @@ -196,7 +196,7 @@ SUBROUTINE CUDA_SGEMM_BATCHED_1D_OVERLOAD( & INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT - CALL HIP_SGEMM_BATCHED( & + CALL CUDA_SGEMM_BATCHED( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & @@ -234,7 +234,7 @@ SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_1D_OVERLOAD( & INTEGER(KIND=JPIB) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT - CALL HIP_SGEMM_STRIDED_BATCHED( & + CALL CUDA_SGEMM_STRIDED_BATCHED( & & TRANSA, TRANSB, & & M, N, K, & & ALPHA, & diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index a6a1284fc..f51b07d23 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. -! +! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. ! 
In applying this licence, ECMWF does not waive the privileges and immunities @@ -52,9 +52,16 @@ SUBROUTINE FTDIR(KFIELDS) USE TPM_TRANS ,ONLY : ZGTF USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NMEN_MAX,G_NLOEN,G_NLOEN_MAX USE TPM_FFT ,ONLY : T +#ifdef HIPGPU USE TPM_FFTH ,ONLY : CREATE_PLAN_FFT, EXECUTE_PLAN_FFT +USE HIP_DEVICE_MOD ,ONLY : DEVICE_SYNCHRONIZE => HIP_SYNCHRONIZE +#define PLAN_TYPE TYPE(C_PTR) +#elif defined(CUDAGPU) +USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT, EXECUTE_PLAN_FFT +USE CUDA_DEVICE_MOD ,ONLY : DEVICE_SYNCHRONIZE => CUDA_SYNCHRONIZE +#define PLAN_TYPE INTEGER(KIND=JPIM) +#endif USE TPM_DIM ,ONLY : R,R_NNOEXTZL -USE HIP_DEVICE_MOD USE ISO_C_BINDING ! @@ -66,7 +73,7 @@ SUBROUTINE FTDIR(KFIELDS) INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,IJUMP,JJ,JF,IST1 INTEGER(KIND=JPIM) :: IOFF,IRLEN,ICLEN, ITYPE -TYPE(C_PTR) :: IPLAN_R2C +PLAN_TYPE :: IPLAN_R2C INTEGER(KIND=JPIM) :: JMAX REAL(KIND=JPRBT) :: SCAL LOGICAL :: LL_ALL=.FALSE. ! T=do kfields ffts in one batch, F=do kfields ffts one at a time @@ -109,9 +116,12 @@ SUBROUTINE FTDIR(KFIELDS) CALL CREATE_PLAN_FFT(IPLAN_R2C,-1,G_NLOEN(IGLG),KFIELDS) CALL EXECUTE_PLAN_FFT(-1,G_NLOEN(IGLG),ZGTF(1,IOFF),ZGTF2(1,IOFF),IPLAN_R2C) +! ! !$ACC HOST_DATA USE_DEVICE(ZGTF,ZGTF2) +! CALL EXECUTE_PLAN_FFTC(IPLAN_R2C,-1,ZGTF(1,IOFF),ZGTF2(1,IOFF)) +! !$ACC END HOST_DATA END DO -ISTAT = HIP_SYNCHRONIZE() +ISTAT = DEVICE_SYNCHRONIZE() #ifdef OMPGPU !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) PRIVATE(IGLG,JJ) DEFAULT(NONE) & diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 0ffdd6393..8263385ce 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. -! +! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. ! In applying this licence, ECMWF does not waive the privileges and immunities @@ -50,9 +50,16 @@ SUBROUTINE FTINV(PREEL,KFIELDS) USE TPM_GEOMETRY ,ONLY : G, G_NLOEN, G_NMEN USE TPM_GEN ,ONLY : NOUT USE TPM_FFT ,ONLY : T +#ifdef HIPGPU USE TPM_FFTH ,ONLY : CREATE_PLAN_FFT, EXECUTE_PLAN_FFT +USE HIP_DEVICE_MOD ,ONLY : DEVICE_SYNCHRONIZE => HIP_SYNCHRONIZE +#define PLAN_TYPE TYPE(C_PTR) +#elif defined(CUDAGPU) +USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT, EXECUTE_PLAN_FFT +USE CUDA_DEVICE_MOD ,ONLY : DEVICE_SYNCHRONIZE => CUDA_SYNCHRONIZE +#define PLAN_TYPE INTEGER(KIND=JPIM) +#endif USE TPM_DIM ,ONLY : R, R_NNOEXTZL -USE HIP_DEVICE_MOD USE ISO_C_BINDING IMPLICIT NONE @@ -64,7 +71,7 @@ SUBROUTINE FTINV(PREEL,KFIELDS) INTEGER(KIND=JPIM) :: IGLG,IST,ILEN,IJUMP,JJ,JF,IST1 INTEGER(KIND=JPIM) :: IOFF,IRLEN,ICLEN, ITYPE LOGICAL :: LL_ALL=.FALSE. ! T=do kfields ffts in one batch, F=do kfields ffts one at a time -TYPE(C_PTR) :: IPLAN_C2R +PLAN_TYPE :: IPLAN_C2R INTEGER(KIND=JPIM) :: IBEG,IEND,IINC,ISIZE, IDIM2 integer :: istat,idev, iunit INTEGER :: I, J @@ -131,8 +138,12 @@ SUBROUTINE FTINV(PREEL,KFIELDS) IGLG = D_NPTRLS(MYSETW)+KGL-1 CALL CREATE_PLAN_FFT(IPLAN_C2R,1,G_NLOEN(IGLG),KFIELDS) CALL EXECUTE_PLAN_FFT(1,G_NLOEN(IGLG),PREEL(1, ioff),ZREEL2(1, ioff),IPLAN_C2R) +! !$ACC HOST_DATA USE_DEVICE(PREEL,ZREEL2) +! CALL EXECUTE_PLAN_FFTC(IPLAN_C2R,1,PREEL(1, ioff),ZREEL2(1, ioff)) +! 
!$ACC END HOST_DATA END DO -ISTAT = HIP_SYNCHRONIZE() + +ISTAT = DEVICE_SYNCHRONIZE() #ifdef OMPGPU !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) PRIVATE(IGLG,JJ) DEFAULT(NONE) & diff --git a/src/trans/gpu/internal/hip_gemm_batched_mod.F90 b/src/trans/gpu/internal/hip_gemm_batched_mod.F90 new file mode 100755 index 000000000..d7aa6193f --- /dev/null +++ b/src/trans/gpu/internal/hip_gemm_batched_mod.F90 @@ -0,0 +1,248 @@ +MODULE CUDA_GEMM_BATCHED_MOD + USE HIPBLAS_MOD + USE PARKIND1, ONLY: JPRD, JPRM, JPIM, JPIB + + IMPLICIT NONE + +!! PRIVATE + PUBLIC CUDA_GEMM_BATCHED, CUDA_DGEMM_BATCHED_OVERLOAD, CUDA_DGEMM_BATCHED_1D_OVERLOAD + + INTERFACE CUDA_GEMM_BATCHED + MODULE PROCEDURE CUDA_DGEMM_BATCHED_OVERLOAD + MODULE PROCEDURE CUDA_SGEMM_BATCHED_OVERLOAD + MODULE PROCEDURE CUDA_SGEMM_STRIDED_BATCHED_OVERLOAD + MODULE PROCEDURE CUDA_DGEMM_BATCHED_1D_OVERLOAD + MODULE PROCEDURE CUDA_SGEMM_BATCHED_1D_OVERLOAD + MODULE PROCEDURE CUDA_SGEMM_STRIDED_BATCHED_1D_OVERLOAD + END INTERFACE CUDA_GEMM_BATCHED + +CONTAINS +SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) +CHARACTER, INTENT(IN) :: TRANSA +CHARACTER, INTENT(IN) :: TRANSB +INTEGER(KIND=JPIM) :: M +INTEGER(KIND=JPIM) :: N +INTEGER(KIND=JPIM) :: K +REAL(KIND=JPRD) :: ALPHA +REAL(KIND=JPRD), DIMENSION(:,:,:) :: AARRAY +INTEGER(KIND=JPIM) :: LDA +INTEGER(KIND=JPIM) :: STRIDEA +REAL(KIND=JPRD), DIMENSION(:,:,:) :: BARRAY +INTEGER(KIND=JPIM) :: LDB +INTEGER(KIND=JPIM) :: STRIDEB +REAL(KIND=JPRD) :: BETA +REAL(KIND=JPRD), DIMENSION(:,:,:) :: CARRAY +INTEGER(KIND=JPIM) :: LDC +INTEGER(KIND=JPIM) :: STRIDEC +INTEGER(KIND=JPIM) :: BATCHCOUNT + +CALL HIP_DGEMM_BATCHED( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) +END SUBROUTINE CUDA_DGEMM_BATCHED_OVERLOAD + +SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) +CHARACTER, INTENT(IN) :: TRANSA +CHARACTER, INTENT(IN) :: TRANSB +INTEGER(KIND=JPIM) :: M +INTEGER(KIND=JPIM) :: N +INTEGER(KIND=JPIM) :: K +REAL(KIND=JPRM) :: ALPHA +REAL(KIND=JPRM), DIMENSION(:,:,:) :: AARRAY +INTEGER(KIND=JPIM) :: LDA +INTEGER(KIND=JPIM) :: STRIDEA +REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY +INTEGER(KIND=JPIM) :: LDB +INTEGER(KIND=JPIM) :: STRIDEB +REAL(KIND=JPRM) :: BETA +REAL(KIND=JPRM), DIMENSION(:,:,:) :: CARRAY +INTEGER(KIND=JPIM) :: LDC +INTEGER(KIND=JPIM) :: STRIDEC +INTEGER(KIND=JPIM) :: BATCHCOUNT + +CALL HIP_SGEMM_BATCHED( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) +END SUBROUTINE CUDA_SGEMM_BATCHED_OVERLOAD + +SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_OVERLOAD( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + CHARACTER, INTENT(IN) :: TRANSA + CHARACTER, INTENT(IN) :: TRANSB + INTEGER(KIND=JPIM) :: M + INTEGER(KIND=JPIM) :: N + INTEGER(KIND=JPIM) :: K + REAL(KIND=JPRM) :: ALPHA + REAL(KIND=JPRM), DIMENSION(:,:,:) :: AARRAY + INTEGER(KIND=JPIM) :: LDA + INTEGER(KIND=JPIB) :: STRIDEA + REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY + INTEGER(KIND=JPIM) :: LDB + 
INTEGER(KIND=JPIB) :: STRIDEB + REAL(KIND=JPRM) :: BETA + REAL(KIND=JPRM), DIMENSION(:,:,:) :: CARRAY + INTEGER(KIND=JPIM) :: LDC + INTEGER(KIND=JPIB) :: STRIDEC + INTEGER(KIND=JPIM) :: BATCHCOUNT + + CALL HIP_SGEMM_STRIDED_BATCHED( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) +END SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_OVERLOAD + +SUBROUTINE CUDA_DGEMM_BATCHED_1D_OVERLOAD( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + CHARACTER, INTENT(IN) :: TRANSA + CHARACTER, INTENT(IN) :: TRANSB + INTEGER(KIND=JPIM) :: M + INTEGER(KIND=JPIM) :: N + INTEGER(KIND=JPIM) :: K + REAL(KIND=JPRD) :: ALPHA + REAL(KIND=JPRD), DIMENSION(:) :: AARRAY + INTEGER(KIND=JPIM) :: LDA + INTEGER(KIND=JPIM) :: STRIDEA + REAL(KIND=JPRD), DIMENSION(:,:,:) :: BARRAY + INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIM) :: STRIDEB + REAL(KIND=JPRD) :: BETA + REAL(KIND=JPRD), DIMENSION(:) :: CARRAY + INTEGER(KIND=JPIM) :: LDC + INTEGER(KIND=JPIM) :: STRIDEC + INTEGER(KIND=JPIM) :: BATCHCOUNT + + CALL HIP_DGEMM_BATCHED( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + END SUBROUTINE CUDA_DGEMM_BATCHED_1D_OVERLOAD + + SUBROUTINE CUDA_SGEMM_BATCHED_1D_OVERLOAD( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + CHARACTER, INTENT(IN) :: TRANSA + CHARACTER, INTENT(IN) :: TRANSB + INTEGER(KIND=JPIM) :: M + INTEGER(KIND=JPIM) :: N + INTEGER(KIND=JPIM) :: K + REAL(KIND=JPRM) :: ALPHA + REAL(KIND=JPRM), DIMENSION(:) :: AARRAY + INTEGER(KIND=JPIM) :: LDA + INTEGER(KIND=JPIM) :: STRIDEA + REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY + INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIM) :: STRIDEB + REAL(KIND=JPRM) :: BETA + REAL(KIND=JPRM), DIMENSION(:) :: CARRAY + INTEGER(KIND=JPIM) :: LDC + INTEGER(KIND=JPIM) :: STRIDEC + INTEGER(KIND=JPIM) :: BATCHCOUNT + + CALL HIP_SGEMM_BATCHED( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + END SUBROUTINE CUDA_SGEMM_BATCHED_1D_OVERLOAD + + SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_1D_OVERLOAD( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + CHARACTER, INTENT(IN) :: TRANSA + CHARACTER, INTENT(IN) :: TRANSB + INTEGER(KIND=JPIM) :: M + INTEGER(KIND=JPIM) :: N + INTEGER(KIND=JPIM) :: K + REAL(KIND=JPRM) :: ALPHA + REAL(KIND=JPRM), DIMENSION(:) :: AARRAY + INTEGER(KIND=JPIM) :: LDA + INTEGER(KIND=JPIB) :: STRIDEA + REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY + INTEGER(KIND=JPIM) :: LDB + INTEGER(KIND=JPIB) :: STRIDEB + REAL(KIND=JPRM) :: BETA + REAL(KIND=JPRM), DIMENSION(:) :: CARRAY + INTEGER(KIND=JPIM) :: LDC + INTEGER(KIND=JPIB) :: STRIDEC + INTEGER(KIND=JPIM) :: BATCHCOUNT + + CALL HIP_SGEMM_STRIDED_BATCHED( & + & TRANSA, TRANSB, & + & M, N, K, & + & ALPHA, & + & AARRAY, LDA, STRIDEA, & + & BARRAY, LDB, STRIDEB, & + & BETA, & + & CARRAY, LDC, STRIDEC, & + & BATCHCOUNT) + END SUBROUTINE CUDA_SGEMM_STRIDED_BATCHED_1D_OVERLOAD + +END MODULE CUDA_GEMM_BATCHED_MOD diff --git a/src/trans/gpu/internal/ledir_mod.F90 
b/src/trans/gpu/internal/ledir_mod.F90 index 8f61ffd70..63e122cb4 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. -! +! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. ! In applying this licence, ECMWF does not waive the privileges and immunities @@ -37,7 +37,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ! Method. ! ------- use butterfly or dgemm -! Externals. +! Externals. ! ---------- ! Reference. @@ -68,11 +68,15 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) USE TPM_GEN ,ONLY : NOUT USE TPM_FLT USE BUTTERFLY_ALG_MOD -USE HIPBLAS_MOD ,ONLY : HIP_DGEMM_BATCHED, HIP_SGEMM_BATCHED +#ifdef HIPGPU +USE HIPBLAS_MOD ,ONLY : SGEMM_BATCHED => HIP_SGEMM_BATCHED, DGEMM_BATCHED => HIP_DGEMM_BATCHED +#elif defined(CUDAGPU) +USE CUBLAS_MOD ,ONLY : SGEMM_BATCHED => CUDA_SGEMM_BATCHED, DGEMM_BATCHED => CUDA_DGEMM_BATCHED +#endif #ifdef TRANS_SINGLE -#define HIP_GEMM_BATCHED HIP_SGEMM_BATCHED +#define GEMM_BATCHED SGEMM_BATCHED #else -#define HIP_GEMM_BATCHED HIP_DGEMM_BATCHED +#define GEMM_BATCHED DGEMM_BATCHED #endif USE, INTRINSIC :: ISO_C_BINDING USE IEEE_ARITHMETIC @@ -152,12 +156,12 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) DO KMLOC=1,D_NUMP DO J=1,R_NDGNH DO JK=1,KFC - - KM = D_MYMS(KMLOC) + + KM = D_MYMS(KMLOC) KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) IF (J .LE. KDGLU) THEN ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - + IF(KM == 0)THEN ISKIP = 2 ELSE @@ -181,7 +185,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(ZAA,DZBST,DZCAT) #endif -CALL HIP_GEMM_BATCHED( & +CALL GEMM_BATCHED( & & 'N', 'N', & & DTDZBA, TDZAA, DLDZBA, & & 1.0_JPRBT, & @@ -216,7 +220,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ELSE ISKIP = 1 ENDIF - + IF (MOD((JK-1),ISKIP) .EQ. 0) THEN ILA = (R_NTMAX-KM+2)/2 IA = 1+MOD(R_NTMAX-KM+2,2) @@ -266,7 +270,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(ZAA0,DZBST0,DZCAT0) #endif - CALL HIP_DGEMM_BATCHED( & + CALL DGEMM_BATCHED( & & 'N','N', & & DTDZBA, TDZAA, DLDZBA, & & 1.0_JPRD, & @@ -320,13 +324,13 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC & PRESENT(D_NUMP,R_NDGNH,D_MYMS,G_NDGLU,DZBST,PAIA,F_RW) #endif DO KMLOC=1,D_NUMP - DO J=1,R_NDGNH + DO J=1,R_NDGNH DO JK=1,KFC - KM = D_MYMS(KMLOC) + KM = D_MYMS(KMLOC) KDGLU = MIN(R_NDGNH,G_NDGLU(KM)) IF (J .LE. KDGLU) THEN ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) - + IF(KM == 0)THEN ISKIP = 2 ELSE @@ -349,7 +353,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(ZAS,DZBST,DZCST) #endif -CALL HIP_GEMM_BATCHED( & +CALL GEMM_BATCHED( & & 'N', 'N', & & DTDZBS, TDZAS, DLDZBS, & & 1.0_JPRBT, & @@ -384,12 +388,12 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) ELSE ISKIP = 1 ENDIF - + IF (MOD((JK-1),ISKIP) .EQ. 0) THEN ILS = (R_NTMAX-KM+3)/2 IF (J .LE. ILS) THEN IS = 1+MOD(R_NTMAX-KM+1,2) - POA1(JK,IS+(J-1)*2,KMLOC) = DZCST((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*DLDZCS)*DTDZCS) + POA1(JK,IS+(J-1)*2,KMLOC) = DZCST((JK-1)/ISKIP+1+(J-1+(KMLOC-1)*DLDZCS)*DTDZCS) END IF END IF ENDDO @@ -407,7 +411,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) !$ACC& COPYIN(KFC,DTDZBS,KMLOC0,ISKIP) & !$ACC& PRESENT(R_NDGNH,G_NDGLU,DZBST0,PAIA,F_RW) #endif - DO J=1,R_NDGNH + DO J=1,R_NDGNH DO JK=1,KFC KDGLU = MIN(R_NDGNH,G_NDGLU(0)) IF (J .LE. 
KDGLU) THEN @@ -429,7 +433,7 @@ SUBROUTINE LEDIR(KF_FS,KLED2,PAIA,POA1,KMODE) #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAS0,DZBST0,DZCST0) #endif - CALL HIP_DGEMM_BATCHED('N','N',& + CALL DGEMM_BATCHED('N','N',& & DTDZBS,TDZAS,DLDZBS,& & 1.0_JPRD,DZBST0,DTDZBS,DLDZBS,& & ZAS0,LDZAS,TDZAS,& diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 48eebd089..d18ac89b5 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. -! +! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. ! In applying this licence, ECMWF does not waive the privileges and immunities @@ -53,7 +53,7 @@ SUBROUTINE LEINV(KFC,KSTA,KF_OUT_LT,PAOA1,PSOA1) ! J.Hague : Oct 2012 DR_HOOK round calls to DGEMM: ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ - + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R, R_NDGNH,R_NSMAX @@ -62,15 +62,19 @@ SUBROUTINE LEINV(KFC,KSTA,KF_OUT_LT,PAOA1,PSOA1) & ZAA,ZAS,LDZAA,LDZAS,TDZAA,TDZAS,& & IZBS,ILDZBA,ILDZBS,ITDZBA,ITDZBS,& & IZCS,IZCST,ILDZCA,ILDZCS,ITDZCA,ITDZCS,& - & TDZAS, IF_FS_INV, ZAMAX, ZSMAX + & TDZAS, IF_FS_INV, ZAMAX, ZSMAX USE TPM_DISTR ,ONLY : D_NUMP,D_MYMS, MYPROC USE TPM_GEN ,ONLY : NOUT USE TPM_FLT -USE HIPBLAS_MOD ,ONLY : HIP_SGEMM_BATCHED, HIP_DGEMM_BATCHED +#ifdef HIPGPU +USE HIPBLAS_MOD ,ONLY : SGEMM_BATCHED => HIP_SGEMM_BATCHED, DGEMM_BATCHED => HIP_DGEMM_BATCHED +#elif defined(CUDAGPU) +USE CUBLAS_MOD ,ONLY : SGEMM_BATCHED => CUDA_SGEMM_BATCHED, DGEMM_BATCHED => CUDA_DGEMM_BATCHED +#endif #ifdef TRANS_SINGLE -#define HIP_GEMM_BATCHED HIP_SGEMM_BATCHED +#define GEMM_BATCHED SGEMM_BATCHED #else -#define HIP_GEMM_BATCHED HIP_DGEMM_BATCHED +#define GEMM_BATCHED DGEMM_BATCHED #endif ! issue ? address error !USE HIP_GEMM_BATCHED_MOD @@ -98,7 +102,7 @@ SUBROUTINE LEINV(KFC,KSTA,KF_OUT_LT,PAOA1,PSOA1) INTEGER(KIND=JPIM) :: ISTAT REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - + !* 1.1 PREPARATIONS. IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) ! ------------------------------------------------------------------ @@ -132,7 +136,7 @@ SUBROUTINE LEINV(KFC,KSTA,KF_OUT_LT,PAOA1,PSOA1) DO KMLOC=1,D_NUMP DO JGL=1,R_NDGNH DO J1=2,KFC,2 - + KM = D_MYMS(KMLOC) IF(KM == 0)THEN PSOA1(J1,JGL,KMLOC) = 0.0_JPRBT @@ -154,14 +158,14 @@ SUBROUTINE LEINV(KFC,KSTA,KF_OUT_LT,PAOA1,PSOA1) DO KMLOC=1,D_NUMP DO J=1,(R_NSMAX+2)/2 DO JK=1,KFC - + KM = D_MYMS(KMLOC) IF (KM == 0) THEN ISKIP = 2 ELSE ISKIP = 1 ENDIF - + IF (MOD((JK-1),ISKIP) .EQ. 0) THEN ILA = (R_NSMAX-KM+2)/2 IF (J .LE. ILA) THEN @@ -186,7 +190,7 @@ SUBROUTINE LEINV(KFC,KSTA,KF_OUT_LT,PAOA1,PSOA1) #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAA,IZBS,IZCST) #endif -CALL HIP_GEMM_BATCHED( & +CALL GEMM_BATCHED( & & 'N', 'T', & & ITDZCA, ILDZCA, ILDZBA, & & 1.0_JPRBT, & @@ -221,7 +225,7 @@ SUBROUTINE LEINV(KFC,KSTA,KF_OUT_LT,PAOA1,PSOA1) ELSE ISKIP = 1 END IF - + ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) IF (MOD((JK-1),ISKIP) .EQ. 0) THEN PAOA1(JK,ISL+JI-1,KMLOC) = IZCST((JK-1)/ISKIP+1+(JI-1)*ITDZCA+(KMLOC-1)*ILDZCA*ITDZCA) @@ -250,7 +254,7 @@ SUBROUTINE LEINV(KFC,KSTA,KF_OUT_LT,PAOA1,PSOA1) ELSE ISKIP = 1 ENDIF - + IF (MOD((JK-1),ISKIP) .EQ. 0) THEN ILS = (R_NSMAX-KM+3)/2 IF (J .LE. 
ILS) THEN @@ -271,7 +275,7 @@ SUBROUTINE LEINV(KFC,KSTA,KF_OUT_LT,PAOA1,PSOA1) #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(ZAS,IZBS,IZCST) #endif -CALL HIP_GEMM_BATCHED( & +CALL GEMM_BATCHED( & & 'N', 'T', & & ITDZCS, ILDZCS, ILDZBS, & & 1.0_JPRBT, & @@ -307,7 +311,7 @@ SUBROUTINE LEINV(KFC,KSTA,KF_OUT_LT,PAOA1,PSOA1) ELSE ISKIP = 1 END IF - + ISL = MAX(R_NDGNH-G_NDGLU(KM)+1,1) IF (MOD((JK-1),ISKIP) .EQ. 0) THEN PSOA1(JK,ISL+JI-1,KMLOC) = IZCST((JK-1)/ISKIP+1+(JI-1)*ITDZCS+(KMLOC-1)*ITDZCS*ILDZCS) diff --git a/src/trans/gpu/internal/set_resol_mod.F90 b/src/trans/gpu/internal/set_resol_mod.F90 index 6598ac3e8..5753ae41d 100755 --- a/src/trans/gpu/internal/set_resol_mod.F90 +++ b/src/trans/gpu/internal/set_resol_mod.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. -! +! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. ! In applying this licence, ECMWF does not waive the privileges and immunities @@ -20,7 +20,11 @@ SUBROUTINE SET_RESOL(KRESOL,LDSETUP) USE TPM_GEOMETRY ,ONLY : G, GEOM_RESOL USE TPM_FIELDS ,ONLY : F, FIELDS_RESOL USE TPM_FFT ,ONLY : T, FFT_RESOL -USE TPM_FFTH ,ONLY : TC, FFTH_RESOL +#ifdef HIPGPU +USE TPM_FFTH ,ONLY : TC, GPU_FFT_RESOL => FFTH_RESOL +#elif defined(CUDAGPU) +USE TPM_FFTC ,ONLY : TC, GPU_FFT_RESOL => FFTC_RESOL +#endif USE TPM_FLT USE TPM_CTL ,ONLY : C, CTL_RESOL USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS @@ -63,7 +67,7 @@ SUBROUTINE SET_RESOL(KRESOL,LDSETUP) G => GEOM_RESOL(NCUR_RESOL) D => DISTR_RESOL(NCUR_RESOL) T => FFT_RESOL(NCUR_RESOL) - TC => FFTH_RESOL(NCUR_RESOL) + TC => GPU_FFT_RESOL(NCUR_RESOL) S => FLT_RESOL(NCUR_RESOL) C => CTL_RESOL(NCUR_RESOL) ENDIF diff --git a/src/trans/gpu/internal/sufft_mod.F90 b/src/trans/gpu/internal/sufft_mod.F90 index 0267b4644..1b834c2bb 100755 --- a/src/trans/gpu/internal/sufft_mod.F90 +++ b/src/trans/gpu/internal/sufft_mod.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. -! +! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. ! In applying this licence, ECMWF does not waive the privileges and immunities @@ -11,37 +11,41 @@ MODULE SUFFT_MOD CONTAINS SUBROUTINE SUFFT - + USE PARKIND1 ,ONLY : JPIM - + USE TPM_DIM ,ONLY : R USE TPM_GEN ,ONLY : NOUT, NPRINTLEV USE TPM_DISTR ,ONLY : D, MYSETW USE TPM_GEOMETRY ,ONLY : G USE TPM_FFT ,ONLY : T + #ifdef HIPGPU USE TPM_FFTH ,ONLY : TC, INIT_PLANS_FFT + #elif defined CUDAGPU + USE TPM_FFTC ,ONLY : TC, INIT_PLANS_FFT + #endif ! - + IMPLICIT NONE - + INTEGER(KIND=JPIM) :: JGL,IGLG LOGICAL :: LLP1,LLP2 - + ! ------------------------------------------------------------------ - + IF(.NOT.D%LGRIDONLY) THEN - + LLP1 = NPRINTLEV>0 LLP2 = NPRINTLEV>1 IF(LLP1) WRITE(NOUT,*) '=== ENTER ROUTINE SUFFT ===' CALL INIT_PLANS_FFT(R%NDLON) - + ENDIF - + ! ------------------------------------------------------------------ - + 9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) - + END SUBROUTINE SUFFT -END MODULE SUFFT_MOD +END MODULE SUFFT_MOD diff --git a/src/trans/gpu/internal/tpm_fftc.F90 b/src/trans/gpu/internal/tpm_fftc.F90 new file mode 100755 index 000000000..ef9956d0f --- /dev/null +++ b/src/trans/gpu/internal/tpm_fftc.F90 @@ -0,0 +1,210 @@ +! (C) Copyright 2014- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! 
In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. +! + +MODULE TPM_FFTC + +! Author. +! ------- +! George Mozdzynski +! +! Modifications. +! -------------- +! Original October 2014 + +USE, INTRINSIC :: ISO_C_BINDING + +USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT +USE MPL_MODULE, ONLY : MPL_MYRANK + +IMPLICIT NONE + +SAVE + +PRIVATE +PUBLIC CREATE_PLAN_FFT, DESTROY_PLAN_FFT, DESTROY_ALL_PLANS_FFT, INIT_PLANS_FFT, EXECUTE_PLAN_FFT, & + & FFTC_RESOL, TC + +TYPE FFTC_TYPE + INTEGER(KIND=JPIM),POINTER :: N_PLANS(:) + TYPE(FFTC_PLAN),POINTER :: FFTC_PLANS(:) + INTEGER(KIND=JPIM) :: N_MAX=0 + INTEGER(KIND=JPIM) :: N_MAX_PLANS=8 +END TYPE FFTC_TYPE + + +TYPE FFTC_PLAN + INTEGER(KIND=JPIM) :: NPLAN_ID=123456 + INTEGER(KIND=JPIM) :: NPLAN + INTEGER(KIND=JPIM) :: NLOT + INTEGER(KIND=JPIM) :: NTYPE + TYPE(FFTC_PLAN),POINTER :: NEXT_PLAN => NULL() +END TYPE FFTC_PLAN + +TYPE(FFTC_TYPE),ALLOCATABLE,TARGET :: FFTC_RESOL(:) +TYPE(FFTC_TYPE),POINTER :: TC + + + +! ------------------------------------------------------------------ +CONTAINS +! ------------------------------------------------------------------ + + +SUBROUTINE INIT_PLANS_FFT(KDLON) +INTEGER(KIND=JPIM),INTENT(IN) :: KDLON + +TC%N_MAX=KDLON +ALLOCATE(TC%FFTC_PLANS(TC%N_MAX)) +ALLOCATE(TC%N_PLANS(TC%N_MAX)) +TC%N_PLANS(:)=0 +RETURN +END SUBROUTINE INIT_PLANS_FFT + + +SUBROUTINE CREATE_PLAN_FFT(KPLAN,KTYPE,KN,KLOT) +INTEGER(KIND=JPIM),INTENT(OUT) :: KPLAN +INTEGER(KIND=JPIM),INTENT(IN) :: KTYPE,KN,KLOT + +INTEGER(KIND=JPIM) :: IPLAN +INTEGER(KIND=JPIM) :: IRANK, ISTRIDE +INTEGER(KIND=JPIM) :: JL, JN +INTEGER(KIND=JPIM) :: IRDIST,ICDIST,IN(1),IEMBED(1) +LOGICAL :: LLFOUND +LOGICAL :: LLRESTRICT_PLANS=.TRUE. +TYPE(FFTC_PLAN),POINTER :: CURR_FFTC_PLAN,START_FFTC_PLAN +INTERFACE + SUBROUTINE CREATE_PLAN_FFTC(KPLAN,KTYPE,KN,KLOT) BIND(C,NAME="create_plan_fftc_") + USE, INTRINSIC :: ISO_C_BINDING + INTEGER(C_INT) :: KPLAN + INTEGER(C_INT) :: KTYPE,KN,KLOT + END SUBROUTINE CREATE_PLAN_FFTC +END INTERFACE + +IF( KN > TC%N_MAX )THEN + CALL ABOR1('CREATE_PLAN_FFT: KN > N_MAX THAT WAS INITIALISED IN INIT_PLANS_FFTC') +ENDIF + +IRANK=1 +ISTRIDE=1 +IN(1)=KN +IEMBED(1)=IN(1) +ICDIST=KN/2+1 +IRDIST=ICDIST*2 + +!!$OMP CRITICAL +LLFOUND=.FALSE. +IF( TC%FFTC_PLANS(KN)%NPLAN_ID /= 123456 )THEN + WRITE(*,'("CREATE_PLAN_FFT.1: PLAN_ID=",I10)')TC%FFTC_PLANS(KN)%NPLAN_ID + CALL ABOR1('CREATE_PLAN_FFT.1: NPLAN_ID /= 123456') +ENDIF +CURR_FFTC_PLAN=>TC%FFTC_PLANS(KN) +IF( CURR_FFTC_PLAN%NPLAN_ID /= 123456 )THEN + WRITE(*,'("CREATE_PLAN_FFT.2: PLAN_ID=",I10)')CURR_FFTC_PLAN%NPLAN_ID + CALL ABOR1('CREATE_PLAN_FFT.2: NPLAN_ID /= 123456') +ENDIF +! search for plan in existing plans +DO JL=1,TC%N_PLANS(KN) + IF( KLOT == CURR_FFTC_PLAN%NLOT .AND. KTYPE == CURR_FFTC_PLAN%NTYPE )THEN + LLFOUND=.TRUE. + IPLAN=CURR_FFTC_PLAN%NPLAN + EXIT + ELSEIF( JL /= TC%N_PLANS(KN) )THEN + CURR_FFTC_PLAN=>CURR_FFTC_PLAN%NEXT_PLAN + IF( CURR_FFTC_PLAN%NPLAN_ID /= 123456 )THEN + WRITE(*,'("CREATE_PLAN_FFT.3: PLAN_ID=",I10)')CURR_FFTC_PLAN%NPLAN_ID + CALL ABOR1('CREATE_PLAN_FFT.3: NPLAN_ID /= 123456') + ENDIF + ENDIF +ENDDO +IF( .NOT.LLFOUND )THEN + IF( LLRESTRICT_PLANS )THEN + IF( TC%N_PLANS(KN) == TC%N_MAX_PLANS )THEN + ! destroy the plan at the start of the list +! 
WRITE(*,'("CREATE_PLAN_FFT: BEG: DESTROYING A PLAN AT THE START OF THE LIST")') + CALL DESTROY_PLAN_FFT(TC%FFTC_PLANS(KN)%NPLAN) + TC%FFTC_PLANS(KN)%NPLAN_ID=999999 + START_FFTC_PLAN=>TC%FFTC_PLANS(KN) + TC%FFTC_PLANS(KN)=TC%FFTC_PLANS(KN)%NEXT_PLAN + ! DEALLOCATE(START_FFTC_PLAN) + TC%N_PLANS(KN)=TC%N_PLANS(KN)-1 +! WRITE(*,'("CREATE_PLAN_FFT: END: DESTROYING A PLAN AT THE START OF THE LIST")') + ENDIF + ENDIF + CALL CREATE_PLAN_FFTC(IPLAN,KTYPE,KN,KLOT) + KPLAN=IPLAN + TC%N_PLANS(KN)=TC%N_PLANS(KN)+1 + IF( TC%N_PLANS(KN) /= 1 )THEN + ALLOCATE(CURR_FFTC_PLAN%NEXT_PLAN) + CURR_FFTC_PLAN=>CURR_FFTC_PLAN%NEXT_PLAN + ENDIF + IF( CURR_FFTC_PLAN%NPLAN_ID /= 123456 )THEN + WRITE(*,'("CREATE_PLAN_FFT.4: PLAN_ID=",I10)')CURR_FFTC_PLAN%NPLAN_ID + CALL ABOR1('CREATE_PLAN_FFT.4: NPLAN_ID /= 123456') + ENDIF + CURR_FFTC_PLAN%NPLAN=IPLAN + CURR_FFTC_PLAN%NLOT=KLOT + CURR_FFTC_PLAN%NTYPE=KTYPE + CURR_FFTC_PLAN%NEXT_PLAN=>NULL() +! write(*,'("CREATE_PLAN_FFT: KN=",I5," NPLANS=",I3," KLOT=",I6," KTYPE=",I2,& +! & " NEW IPLAN=",Z16)')KN,TC%N_PLANS(KN),KLOT,KTYPE,IPLAN +ELSE + KPLAN=IPLAN +! write(*,'("CREATE_PLAN_FFT: KN=",I5," NPLANS=",I3," KLOT=",I6," KTYPE=",I2,& +! & " CUR IPLAN=",Z16)')KN,TC%N_PLANS(KN),KLOT,KTYPE,IPLAN +ENDIF +!!$OMP END CRITICAL + +RETURN +END SUBROUTINE CREATE_PLAN_FFT + + +SUBROUTINE DESTROY_PLAN_FFT(KPLAN) +INTEGER(KIND=JPIM),INTENT(IN) :: KPLAN +CALL DESTROY_PLAN_FFTC(KPLAN) +RETURN +END SUBROUTINE DESTROY_PLAN_FFT + + +SUBROUTINE DESTROY_ALL_PLANS_FFT +INTEGER(KIND=JPIM) :: JL, JN +TYPE(FFTC_PLAN),POINTER :: CURR_FFTC_PLAN +DO JN=1,TC%N_MAX + CURR_FFTC_PLAN=>TC%FFTC_PLANS(JN) +ENDDO +WRITE(*,'("DESTROY_ALL_PLANS_FFTC: MPL_RANK=",I6," SUM(TC%N_PLANS(:))=",I10)')& + & MPL_MYRANK(), SUM(TC%N_PLANS(:)) +DEALLOCATE(TC%FFTC_PLANS) +DEALLOCATE(TC%N_PLANS) +RETURN +END SUBROUTINE DESTROY_ALL_PLANS_FFT + +SUBROUTINE EXECUTE_PLAN_FFT(KN,N,X_IN,X_OUT,PLAN_PTR) +INTEGER(KIND=JPIM) :: PLAN_PTR +INTEGER(KIND=C_INT) :: KN +INTEGER(KIND=C_INT) :: N +REAL(KIND=JPRBT), TARGET :: X_IN(*) +REAL(KIND=JPRBT), TARGET :: X_OUT(*) + +#ifdef OMPGPU +!$OMP TARGET DATA USE_DEVICE_PTR(X_IN,X_OUT) +#endif +#ifdef ACCGPU +!$ACC HOST_DATA USE_DEVICE(X_IN,X_OUT) +#endif +CALL EXECUTE_PLAN_FFTC(PLAN_PTR,KN,X_IN,X_OUT) +#ifdef ACCGPU +!$ACC END HOST_DATA +#endif +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif + +END SUBROUTINE EXECUTE_PLAN_FFT + +END MODULE TPM_FFTC diff --git a/src/trans/gpu/internal/tpm_ffth.F90 b/src/trans/gpu/internal/tpm_ffth.F90 index 9426693f1..1f2bbde94 100755 --- a/src/trans/gpu/internal/tpm_ffth.F90 +++ b/src/trans/gpu/internal/tpm_ffth.F90 @@ -1,5 +1,5 @@ ! (C) Copyright 2014- ECMWF. -! +! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. ! In applying this licence, ECMWF does not waive the privileges and immunities @@ -14,30 +14,30 @@ MODULE TPM_FFTH ! George Mozdzynski ! ! Modifications. - ! -------------- + ! -------------- ! 
Original October 2014 - + USE, INTRINSIC :: ISO_C_BINDING - + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT - !USE MPL_MODULE ,ONLY : MPL_MYRANK - + USE MPL_MODULE ,ONLY : MPL_MYRANK + IMPLICIT NONE - + SAVE - + PRIVATE PUBLIC CREATE_PLAN_FFT, DESTROY_PLAN_FFT, DESTROY_ALL_PLANS_FFT, INIT_PLANS_FFT, EXECUTE_PLAN_FFT, & & FFTH_RESOL, TC - + TYPE FFTH_TYPE INTEGER(KIND=C_INT),POINTER :: N_PLANS(:) TYPE(FFTH_PLAN),POINTER :: FFTH_PLANS(:) INTEGER(KIND=C_INT) :: N_MAX=0 INTEGER(KIND=C_INT) :: N_MAX_PLANS=8 END TYPE FFTH_TYPE - - + + TYPE FFTH_PLAN INTEGER(KIND=C_INT) :: NPLAN_ID=123456 TYPE(C_PTR) :: NPLAN @@ -45,32 +45,32 @@ MODULE TPM_FFTH INTEGER(KIND=C_INT) :: NTYPE TYPE(FFTH_PLAN),POINTER :: NEXT_PLAN => NULL() END TYPE FFTH_PLAN - + TYPE(FFTH_TYPE),ALLOCATABLE,TARGET :: FFTH_RESOL(:) TYPE(FFTH_TYPE),POINTER :: TC - - - + + + ! ------------------------------------------------------------------ CONTAINS ! ------------------------------------------------------------------ - - + + SUBROUTINE INIT_PLANS_FFT(KDLON) INTEGER(KIND=C_INT),INTENT(IN) :: KDLON - + TC%N_MAX=KDLON ALLOCATE(TC%FFTH_PLANS(TC%N_MAX)) ALLOCATE(TC%N_PLANS(TC%N_MAX)) TC%N_PLANS(:)=0 - RETURN + RETURN END SUBROUTINE INIT_PLANS_FFT - - + + SUBROUTINE CREATE_PLAN_FFT(KPLAN,KTYPE,KN,KLOT) TYPE(C_PTR),INTENT(OUT) :: KPLAN INTEGER(KIND=C_INT),INTENT(IN) :: KTYPE,KN,KLOT - + TYPE(C_PTR) :: IPLAN INTEGER(KIND=C_INT) :: IRANK, ISTRIDE INTEGER(KIND=C_INT) :: JL, JN @@ -85,18 +85,18 @@ SUBROUTINE CREATE_PLAN_FFTH(KPLAN,KTYPE,KN,KLOT) BIND(C,NAME="create_plan_ffth_" INTEGER(C_INT) :: KTYPE,KN,KLOT END SUBROUTINE CREATE_PLAN_FFTH END INTERFACE - + IF( KN > TC%N_MAX )THEN stop 'CREATE_PLAN_FFT: KN > N_MAX THAT WAS INITIALISED IN INIT_PLANS_FFTH' ENDIF - + IRANK=1 ISTRIDE=1 IN(1)=KN IEMBED(1)=IN(1) ICDIST=KN/2+1 IRDIST=ICDIST*2 - + !!$OMP CRITICAL LLFOUND=.FALSE. IF( TC%FFTH_PLANS(KN)%NPLAN_ID /= 123456 )THEN @@ -159,38 +159,38 @@ END SUBROUTINE CREATE_PLAN_FFTH ! & " CUR IPLAN=",Z16)')KN,TC%N_PLANS(KN),KLOT,KTYPE,IPLAN ENDIF !!$OMP END CRITICAL - + RETURN END SUBROUTINE CREATE_PLAN_FFT - - + + SUBROUTINE DESTROY_PLAN_FFT(KPLAN) TYPE(C_PTR),INTENT(IN) :: KPLAN CALL DESTROY_PLAN_FFTH(KPLAN) RETURN END SUBROUTINE DESTROY_PLAN_FFT - - + + SUBROUTINE DESTROY_ALL_PLANS_FFT INTEGER(KIND=C_INT) :: JL, JN TYPE(FFTH_PLAN),POINTER :: CURR_FFTH_PLAN DO JN=1,TC%N_MAX CURR_FFTH_PLAN=>TC%FFTH_PLANS(JN) ENDDO - !WRITE(*,'("DESTROY_ALL_PLANS_FFTH: MPL_RANK=",I6," SUM(TC%N_PLANS(:))=",I10)')& - ! & MPL_MYRANK(), SUM(TC%N_PLANS(:)) + WRITE(*,'("DESTROY_ALL_PLANS_FFTH: MPL_RANK=",I6," SUM(TC%N_PLANS(:))=",I10)')& + & MPL_MYRANK(), SUM(TC%N_PLANS(:)) DEALLOCATE(TC%FFTH_PLANS) DEALLOCATE(TC%N_PLANS) RETURN END SUBROUTINE DESTROY_ALL_PLANS_FFT - + SUBROUTINE EXECUTE_PLAN_FFT(KN,N,X_IN,X_OUT,PLAN_PTR) TYPE(C_PTR) :: PLAN_PTR INTEGER(KIND=C_INT) :: KN INTEGER(KIND=C_INT) :: N REAL(KIND=JPRBT), TARGET :: X_IN REAL(KIND=JPRBT), TARGET :: X_OUT - + INTERFACE SUBROUTINE EXECUTE_PLAN_FFTH_C (KN, N, X_IN_PTR, X_OUT_PTR, PLAN_PTR) & & BIND(C,NAME="execute_plan_ffth_c_") @@ -215,8 +215,7 @@ END SUBROUTINE EXECUTE_PLAN_FFTH_C #ifdef OMPGPU !$OMP END TARGET DATA #endif - + END SUBROUTINE EXECUTE_PLAN_FFT - + END MODULE TPM_FFTH - \ No newline at end of file diff --git a/src/trans/gpu/internal_reducedmem/ftdir_mod.F90 b/src/trans/gpu/internal_reducedmem/ftdir_mod.F90 index 7cbe71037..1a4519a06 100755 --- a/src/trans/gpu/internal_reducedmem/ftdir_mod.F90 +++ b/src/trans/gpu/internal_reducedmem/ftdir_mod.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! 
(C) Copyright 2000- Meteo-France. -! +! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. ! In applying this licence, ECMWF does not waive the privileges and immunities @@ -52,7 +52,11 @@ SUBROUTINE FTDIR(KFIELDS) USE TPM_TRANS ,ONLY : ZGTF, ZGTFTMP USE TPM_GEOMETRY ,ONLY : G,G_NMEN,G_NMEN_MAX,G_NLOEN,G_NLOEN_MAX USE TPM_FFT ,ONLY : T +#ifdef HIPGPU USE TPM_FFTH ,ONLY : CREATE_PLAN_FFT, EXECUTE_PLAN_FFT +#elif defined(CUDAGPU) +USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT, EXECUTE_PLAN_FFT +#endif USE TPM_DIM ,ONLY : R,R_NNOEXTZL USE CUDA_DEVICE_MOD ! diff --git a/src/trans/gpu/internal_reducedmem/ftinv_mod.F90 b/src/trans/gpu/internal_reducedmem/ftinv_mod.F90 index b17e977db..3c7ca5acd 100755 --- a/src/trans/gpu/internal_reducedmem/ftinv_mod.F90 +++ b/src/trans/gpu/internal_reducedmem/ftinv_mod.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. -! +! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. ! In applying this licence, ECMWF does not waive the privileges and immunities @@ -22,7 +22,7 @@ SUBROUTINE FTINV(PREELIN,PREELOUT,KFIELDS) ! CALL FTINV(..) ! Explicit arguments : PREELTMP - in Fourier/grid-point array -! PREEL - out Fourier/grid-point array +! PREEL - out Fourier/grid-point array ! -------------------- KFIELDS - number of fields ! Method. @@ -51,7 +51,11 @@ SUBROUTINE FTINV(PREELIN,PREELOUT,KFIELDS) USE TPM_GEOMETRY ,ONLY : G USE TPM_GEN ,ONLY : NOUT USE TPM_FFT ,ONLY : T -USE TPM_FFTH ,ONLY : CREATE_PLAN_FFT, DESTROY_PLAN_FFT, EXECUTE_PLAN_FFT +#ifdef HIPGPU +USE TPM_FFTH ,ONLY : CREATE_PLAN_FFT, EXECUTE_PLAN_FFT, DESTROY_PLAN_FFT +#elif defined(CUDAGPU) +USE TPM_FFTC ,ONLY : CREATE_PLAN_FFT, EXECUTE_PLAN_FFT, DESTROY_PLAN_FFT +#endif USE TPM_DIM ,ONLY : R USE CUDA_DEVICE_MOD @@ -153,7 +157,7 @@ SUBROUTINE FTINV(PREELIN,PREELOUT,KFIELDS) !ENDIF END DO !!$OMP END PARALLEL DO -ISTAT = CUDA_SYNCHRONIZE() +ISTAT = CUDA_SYNCHRONIZE() ! ------------------------------------------------------------------
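
A minimal sketch of the backend-selection pattern that recurs in every hunk above, assuming the patched ectrans modules (TPM_FFTH, TPM_FFTC, HIPBLAS_MOD, CUBLAS_MOD) are on the module path and the file is preprocessed with -DHIPGPU or -DCUDAGPU, optionally with -DTRANS_SINGLE. Callers keep one generic name per entry point (GEMM_BATCHED, CREATE_PLAN_FFT, EXECUTE_PLAN_FFT, GPU_FFT_RESOL) and the HIP or CUDA implementation is bound to that name through a rename on the USE statement plus a preprocessor alias for the precision variant. GPU_BACKEND_DEMO, ITYPE and its value are placeholders introduced here for illustration; they are not part of the patch.

SUBROUTINE GPU_BACKEND_DEMO(KN, KLOT)

! Minimal sketch of the HIPGPU/CUDAGPU selection pattern used in LEDIR, LEINV,
! SET_RESOL, SUFFT, FTDIR and FTINV; illustrative only, not part of the patch.

USE PARKIND_ECTRANS ,ONLY : JPIM
USE, INTRINSIC :: ISO_C_BINDING

! One generic name per entry point; the backend is fixed at compile time.
#ifdef HIPGPU
USE TPM_FFTH    ,ONLY : CREATE_PLAN_FFT, EXECUTE_PLAN_FFT
USE HIPBLAS_MOD ,ONLY : SGEMM_BATCHED => HIP_SGEMM_BATCHED, &
 &                      DGEMM_BATCHED => HIP_DGEMM_BATCHED
#elif defined(CUDAGPU)
USE TPM_FFTC    ,ONLY : CREATE_PLAN_FFT, EXECUTE_PLAN_FFT
USE CUBLAS_MOD  ,ONLY : SGEMM_BATCHED => CUDA_SGEMM_BATCHED, &
 &                      DGEMM_BATCHED => CUDA_DGEMM_BATCHED
#endif

! Precision variant resolved exactly as in ledir_mod.F90 / leinv_mod.F90
! (shown for completeness; not called below).
#ifdef TRANS_SINGLE
#define GEMM_BATCHED SGEMM_BATCHED
#else
#define GEMM_BATCHED DGEMM_BATCHED
#endif

IMPLICIT NONE

INTEGER(KIND=JPIM), INTENT(IN) :: KN, KLOT

! The two FFT modules declare the plan handle differently:
! TPM_FFTH stores a TYPE(C_PTR), TPM_FFTC an INTEGER(KIND=JPIM).
#ifdef HIPGPU
TYPE(C_PTR)        :: IPLAN
#elif defined(CUDAGPU)
INTEGER(KIND=JPIM) :: IPLAN
#endif
INTEGER(KIND=JPIM) :: ITYPE

! Placeholder plan type; real values follow the FTDIR/FTINV call sites.
ITYPE = 1

! Identical call in both builds; each module caches and reuses its own plans.
CALL CREATE_PLAN_FFT(IPLAN, ITYPE, KN, KLOT)

END SUBROUTINE GPU_BACKEND_DEMO

Because TPM_FFTH and TPM_FFTC export the same public names (INIT_PLANS_FFT, CREATE_PLAN_FFT, EXECUTE_PLAN_FFT, DESTROY_PLAN_FFT, DESTROY_ALL_PLANS_FFT, and the per-resolution array bound to GPU_FFT_RESOL in SET_RESOL), the callers above only change their USE lines; the one divergence still visible to callers is the plan handle type, which the sketch mirrors with the conditional declaration of IPLAN.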