diff --git a/src/programs/ectrans-benchmark-ifs.F90 b/src/programs/ectrans-benchmark-ifs.F90 index fd9fcc8d..5187553f 100644 --- a/src/programs/ectrans-benchmark-ifs.F90 +++ b/src/programs/ectrans-benchmark-ifs.F90 @@ -1094,6 +1094,17 @@ subroutine parse_grid(cgrid,ndgl,nloen) !=================================================================================================== +subroutine str2int(str, int, stat) + + character(len=*), intent(in) :: str + integer, intent(out) :: int + integer, intent(out) :: stat + read(str, *, iostat=stat) int + +end subroutine str2int + +!=================================================================================================== + function get_int_value(cname, iarg) result(value) integer :: value @@ -1130,6 +1141,78 @@ function get_str_value(cname, iarg) result(value) !=================================================================================================== +subroutine print_help(unit) + + integer, optional :: unit + integer :: nout = 6 + if (present(unit)) then + nout = unit + endif + + write(nout, "(a)") "" + + if (jprb == jprd) then + write(nout, "(a)") "NAME ectrans-benchmark-dp" + else + write(nout, "(a)") "NAME ectrans-benchmark-sp" + end if + write(nout, "(a)") "" + + write(nout, "(a)") "DESCRIPTION" + write(nout, "(a)") " This program tests ecTrans by transforming fields back and forth& + & between spectral " + if (jprb == jprd) then + write(nout, "(a)") " space and grid-point space (double-precision version)" + else + write(nout, "(a)") " space and grid-point space (single-precision version)" + end if + write(nout, "(a)") "" + + write(nout, "(a)") "USAGE" + if (jprb == jprd) then + write(nout, "(a)") " ectrans-benchmark-dp [options]" + else + write(nout, "(a)") " ectrans-benchmark-sp [options]" + end if + write(nout, "(a)") "" + + write(nout, "(a)") "OPTIONS" + write(nout, "(a)") " -h, --help Print this message" + write(nout, "(a)") " -v Run with verbose output" + write(nout, "(a)") " -t, --truncation T Run with this triangular spectral truncation& + & (default = 79)" + write(nout, "(a)") " -g, --grid GRID Run with this grid. Possible values: O, F" + write(nout, "(a)") " If not specified, O is used with N=truncation+1& + & (cubic relation)" + write(nout, "(a)") " -n, --niter NITER Run for this many inverse/direct transform& + & iterations (default = 10)" + write(nout, "(a)") " -f, --nfld NFLD Number of scalar fields (default = 1)" + write(nout, "(a)") " -l, --nlev NLEV Number of vertical levels (default = 1)" + write(nout, "(a)") " --vordiv Also transform vorticity-divergence to wind" + write(nout, "(a)") " --scders Compute scalar derivatives (default off)" + write(nout, "(a)") " --uvders Compute uv East-West derivatives (default off). Only& + & when also --vordiv is given" + write(nout, "(a)") " --flt Run with fast Legendre transforms (default off)" + write(nout, "(a)") " --nproma NPROMA Run with NPROMA (default no blocking: NPROMA=ngptot)" + write(nout, "(a)") " --norms Calculate and print spectral norms of transformed& + & fields" + write(nout, "(a)") " The computation of spectral norms will skew overall& + & timings" + write(nout, "(a)") " --meminfo Show diagnostic information from FIAT's ec_meminfo& + & subroutine on memory usage, thread-binding etc." + write(nout, "(a)") " --nprtrv Size of V set in spectral decomposition" + write(nout, "(a)") " --nprtrw Size of W set in spectral decomposition" + write(nout, "(a)") " -c, --check VALUE The multiplier of the machine epsilon used as a& + & tolerance for correctness checking" + write(nout, "(a)") "" + write(nout, "(a)") "DEBUGGING" + write(nout, "(a)") " --dump-values Output gridpoint fields in unformatted binary file" + write(nout, "(a)") "" + +end subroutine print_help + +!=================================================================================================== + subroutine parsing_failed(message) character(len=*), intent(in) :: message @@ -1240,17 +1323,6 @@ function cubic_octahedral_gaussian_grid(nsmax) result(cgrid) !=================================================================================================== -subroutine str2int(str, int, stat) - - character(len=*), intent(in) :: str - integer, intent(out) :: int - integer, intent(out) :: stat - read(str, *, iostat=stat) int - -end subroutine str2int - -!=================================================================================================== - subroutine sort(a, n) real(kind=jprd), intent(inout) :: a(n) @@ -1275,78 +1347,6 @@ end subroutine sort !=================================================================================================== -subroutine print_help(unit) - - integer, optional :: unit - integer :: nout = 6 - if (present(unit)) then - nout = unit - endif - - write(nout, "(a)") "" - - if (jprb == jprd) then - write(nout, "(a)") "NAME ectrans-benchmark-dp" - else - write(nout, "(a)") "NAME ectrans-benchmark-sp" - end if - write(nout, "(a)") "" - - write(nout, "(a)") "DESCRIPTION" - write(nout, "(a)") " This program tests ecTrans by transforming fields back and forth& - & between spectral " - if (jprb == jprd) then - write(nout, "(a)") " space and grid-point space (double-precision version)" - else - write(nout, "(a)") " space and grid-point space (single-precision version)" - end if - write(nout, "(a)") "" - - write(nout, "(a)") "USAGE" - if (jprb == jprd) then - write(nout, "(a)") " ectrans-benchmark-dp [options]" - else - write(nout, "(a)") " ectrans-benchmark-sp [options]" - end if - write(nout, "(a)") "" - - write(nout, "(a)") "OPTIONS" - write(nout, "(a)") " -h, --help Print this message" - write(nout, "(a)") " -v Run with verbose output" - write(nout, "(a)") " -t, --truncation T Run with this triangular spectral truncation& - & (default = 79)" - write(nout, "(a)") " -g, --grid GRID Run with this grid. Possible values: O, F" - write(nout, "(a)") " If not specified, O is used with N=truncation+1& - & (cubic relation)" - write(nout, "(a)") " -n, --niter NITER Run for this many inverse/direct transform& - & iterations (default = 10)" - write(nout, "(a)") " -f, --nfld NFLD Number of scalar fields (default = 1)" - write(nout, "(a)") " -l, --nlev NLEV Number of vertical levels (default = 1)" - write(nout, "(a)") " --vordiv Also transform vorticity-divergence to wind" - write(nout, "(a)") " --scders Compute scalar derivatives (default off)" - write(nout, "(a)") " --uvders Compute uv East-West derivatives (default off). Only& - & when also --vordiv is given" - write(nout, "(a)") " --flt Run with fast Legendre transforms (default off)" - write(nout, "(a)") " --nproma NPROMA Run with NPROMA (default no blocking: NPROMA=ngptot)" - write(nout, "(a)") " --norms Calculate and print spectral norms of transformed& - & fields" - write(nout, "(a)") " The computation of spectral norms will skew overall& - & timings" - write(nout, "(a)") " --meminfo Show diagnostic information from FIAT's ec_meminfo& - & subroutine on memory usage, thread-binding etc." - write(nout, "(a)") " --nprtrv Size of V set in spectral decomposition" - write(nout, "(a)") " --nprtrw Size of W set in spectral decomposition" - write(nout, "(a)") " -c, --check VALUE The multiplier of the machine epsilon used as a& - & tolerance for correctness checking" - write(nout, "(a)") "" - write(nout, "(a)") "DEBUGGING" - write(nout, "(a)") " --dump-values Output gridpoint fields in unformatted binary file" - write(nout, "(a)") "" - -end subroutine print_help - -!=================================================================================================== - subroutine initialize_spectral_arrays(nsmax, zsp, sp3d) integer, intent(in) :: nsmax ! Spectral truncation diff --git a/src/trans/common/internal/tpm_dim.F90 b/src/trans/common/internal/tpm_dim.F90 index 236d08d0..59287add 100755 --- a/src/trans/common/internal/tpm_dim.F90 +++ b/src/trans/common/internal/tpm_dim.F90 @@ -49,10 +49,4 @@ MODULE TPM_DIM TYPE(DIM_TYPE),ALLOCATABLE,TARGET :: DIM_RESOL(:) TYPE(DIM_TYPE),POINTER :: R -! flat copies of above -INTEGER(KIND=JPIM) :: R_NSMAX ! Truncation order -INTEGER(KIND=JPIM) :: R_NTMAX ! Truncation order for tendencies -INTEGER(KIND=JPIM) :: R_NDGNH ! Number of rows in northern hemisphere -INTEGER(KIND=JPIM) :: R_NDGL ! Number of rows of latitudes - END MODULE TPM_DIM diff --git a/src/trans/common/internal/tpm_distr.F90 b/src/trans/common/internal/tpm_distr.F90 index 6a151192..dc62c94c 100755 --- a/src/trans/common/internal/tpm_distr.F90 +++ b/src/trans/common/internal/tpm_distr.F90 @@ -178,29 +178,5 @@ MODULE TPM_DISTR TYPE(DISTR_TYPE),ALLOCATABLE,TARGET :: DISTR_RESOL(:) TYPE(DISTR_TYPE),POINTER :: D -!flat versions of the above -INTEGER(KIND=JPIM) :: D_NUMP ! No. of spectral waves handled by this processor -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MYMS(:) ! Wave numbers handled by this PE -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT0B(:) ! Start adresses for segments within buffer - ! (according to processors to whom data - ! is going to be sent) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT1B(:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCL(:) ! Process responsible for each lat. (F.S) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB1(:,:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NASM0(:) ! Address in a spectral array of (m, n=m) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGTF(:) ! Offset for specific latitude in -INTEGER(KIND=JPIM) :: D_NDGL_FS ! Number of rows of latitudes for which this process is - ! performing Fourier Space calculations -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MSTABF(:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB0(:,:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCM(:) ! Process that does the calc. for certain -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPTRLS(:) ! Pointer to first lat. (F.S) - - -! The offsets in the input and output arrays to the gemms. -! (1) are the offsets in the "inputs" of dirtrans ("outputs" invtrans) -! (2) are the offsets in the "outputs" of invtrans ("inputs" dirtrans) -INTEGER(KIND=JPIM), POINTER :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:) - END MODULE TPM_DISTR diff --git a/src/trans/common/internal/tpm_geometry.F90 b/src/trans/common/internal/tpm_geometry.F90 index 1ff1a1be..48454a37 100644 --- a/src/trans/common/internal/tpm_geometry.F90 +++ b/src/trans/common/internal/tpm_geometry.F90 @@ -34,11 +34,4 @@ MODULE TPM_GEOMETRY TYPE(GEOM_TYPE),ALLOCATABLE,TARGET :: GEOM_RESOL(:) TYPE(GEOM_TYPE),POINTER :: G -!flat copies of the above -INTEGER(KIND=JPIM),ALLOCATABLE :: G_NDGLU(:) ! NUMBER OF HEMISPERIC LATITUDES -INTEGER(KIND=JPIM),ALLOCATABLE :: G_NMEN(:) ! ASSOCIATED CUT-OFF WAVE NUMBER -INTEGER(KIND=JPIM) :: G_NMEN_MAX -INTEGER(KIND=JPIM),ALLOCATABLE :: G_NLOEN(:) ! NUMBER OF POINTS ON A PARALLEL -INTEGER(KIND=JPIM) :: G_NLOEN_MAX - END MODULE TPM_GEOMETRY diff --git a/src/trans/gpu/algor/growing_allocator_mod.F90 b/src/trans/gpu/algor/growing_allocator_mod.F90 index f8de0fc9..db869e48 100644 --- a/src/trans/gpu/algor/growing_allocator_mod.F90 +++ b/src/trans/gpu/algor/growing_allocator_mod.F90 @@ -5,6 +5,7 @@ MODULE GROWING_ALLOCATOR_MOD PRIVATE PUBLIC :: GROWING_ALLOCATION_TYPE PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION + PUBLIC :: DESTROY_GROWING_ALLOCATOR ABSTRACT INTERFACE SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C) @@ -32,19 +33,12 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ) USE TPM_GEN, ONLY: NOUT IMPLICIT NONE TYPE(GROWING_ALLOCATION_TYPE), INTENT(INOUT) :: ALLOC - INTEGER(C_SIZE_T) :: SZ - INTEGER :: I + INTEGER(C_SIZE_T), INTENT(IN) :: SZ ! Deallocate existing pointer IF (ASSOCIATED(ALLOC%PTR) .AND. SZ > SIZE(ALLOC%PTR, 1, C_SIZE_T)) THEN WRITE(NOUT,*) "WARNING: REALLOCATING GROWING POINTER CAUSING GRAPH REINSTANTIATION" - DO I = 1, ALLOC%FREE_FUNCS_SZ - CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, & - SIZE(ALLOC%PTR, 1, C_SIZE_T)) - ENDDO - !$ACC EXIT DATA DELETE(ALLOC%PTR) - DEALLOCATE(ALLOC%PTR) - NULLIFY(ALLOC%PTR) + CALL DESTROY_GROWING_ALLOCATOR(ALLOC) ENDIF IF (.NOT. ASSOCIATED(ALLOC%PTR)) THEN @@ -89,4 +83,20 @@ SUBROUTINE REGISTER_FREE_C(ALLOC_C, FREE_FUNC_C) BIND(C, NAME="growing_allocator END SUBROUTINE + SUBROUTINE DESTROY_GROWING_ALLOCATOR(ALLOC) + USE ISO_C_BINDING, ONLY: C_SIZE_T + IMPLICIT NONE + TYPE(GROWING_ALLOCATION_TYPE) :: ALLOC + INTEGER :: I + IF (ASSOCIATED(ALLOC%PTR)) THEN + DO I = 1, ALLOC%FREE_FUNCS_SZ + CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, & + SIZE(ALLOC%PTR, 1, C_SIZE_T)) + ENDDO + !$ACC EXIT DATA DELETE(ALLOC%PTR) + DEALLOCATE(ALLOC%PTR) + NULLIFY(ALLOC%PTR) + ENDIF + END SUBROUTINE + END MODULE diff --git a/src/trans/gpu/algor/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/hicblas_cutlass.cuda.h index 7a842a80..a9148e93 100644 --- a/src/trans/gpu/algor/hicblas_cutlass.cuda.h +++ b/src/trans/gpu/algor/hicblas_cutlass.cuda.h @@ -72,6 +72,7 @@ class cutlass_sgemm_grouped { static constexpr int sz_align = 8; public: + using real_type = float; void operator()(cudaStream_t stream, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) const { @@ -129,6 +130,7 @@ class cutlass_sgemm_grouped { static constexpr int sz_align = 1; public: + using real_type = float; void operator()(cudaStream_t stream, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) const { @@ -149,7 +151,7 @@ class cutlass_sgemm_grouped { } // namespace detail template -void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, +void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, float *C, @@ -165,18 +167,18 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, if (capability_major >= 8 && use_3xtf32) run_group_graph(cutlass_sgemm_grouped(), - m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, blas_id, growing_allocator); else run_group_graph(cutlass_sgemm_grouped(), - m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, blas_id, growing_allocator); } -void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb, +void cutlass_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, @@ -186,19 +188,19 @@ void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb, if (transa == 'N' && transb == 'N') cutlass_sgemm_wrapper_grouped_op( - blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, growing_allocator); else if (transa == 'N' && transb == 'T') cutlass_sgemm_wrapper_grouped_op( - blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, growing_allocator); else if (transa == 'T' && transb == 'N') cutlass_sgemm_wrapper_grouped_op( - blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, growing_allocator); else if (transa == 'T' && transb == 'T') cutlass_sgemm_wrapper_grouped_op( - blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, growing_allocator); else assert(false); diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp index 9e4b67c4..2f228aa3 100644 --- a/src/trans/gpu/algor/hicblas_gemm.hip.cpp +++ b/src/trans/gpu/algor/hicblas_gemm.hip.cpp @@ -21,63 +21,88 @@ #include "growing_allocator.h" +bool hip_alreadyAllocated_sgemm = false; +bool hip_alreadyAllocated_sgemm_handle = false; -bool hip_alreadyAllocated_sgemm=false; -bool hip_alreadyAllocated_sgemm_handle=false; - -bool hip_alreadyAllocated_dsgemm=false; -bool hip_alreadyAllocated_dgemm_handle=false; +bool hip_alreadyAllocated_dsgemm = false; +bool hip_alreadyAllocated_dgemm_handle = false; hipblasHandle_t handle_hip_sgemm; hipblasHandle_t handle_hip_dgemm; - namespace { -namespace detail { -struct pair_hash { - std::size_t operator()(const std::pair &p) const { - return p.first * 10000 + p.second; +struct cache_key { + int resol_id; + int m; + int blas_id; + + bool operator==(const cache_key &other) const { + return resol_id == other.resol_id && m == other.m && + blas_id == other.blas_id; + } + cache_key(int resol_id_, int m_, int blas_id_) + : resol_id(resol_id_), m(m_), blas_id(blas_id_) {} +}; +} // namespace +template <> struct std::hash { + std::size_t operator()(const cache_key &k) const { + return k.blas_id * 1000000 + k.resol_id * 10000 + k.m; } }; -} // namespace detail -template auto &get_graph_cache() { - // we store at most one graph per "m" (# fields) and "blas id" - static std::unordered_map, hipGraphExec_t, - detail::pair_hash> +namespace { +template auto &get_graph_cache() { + // we store at most one graph per "m" (# fields) and "blas id" and resolution + static std::unordered_map> graphCache; return graphCache; } -template auto &get_ptr_cache() { +template auto &get_ptr_cache() { + using real_t = typename Gemm::real_type; static std::unordered_map< - std::pair, std::tuple, - detail::pair_hash> + cache_key, std::tuple> ptrCache; return ptrCache; } -template void free_gemm_cache(float *, size_t) { - get_graph_cache().clear(); - get_ptr_cache().clear(); +template void free_gemm_graph_cache(float *, size_t) { + get_graph_cache().clear(); + get_ptr_cache().clear(); +} +template +void erase_resol_from_cache(Cache &cache, int resol_id) { + // Note that in C++20 this could also be std::erase_if + int erased = 0; + for (auto it = cache.begin(); it != cache.end();) { + if (it->first.resol_id == resol_id) { + it = cache.erase(it); + ++erased; + } else + ++it; + } +} +template void erase_from_caches(int resol_id) { + erase_resol_from_cache(get_graph_cache(), resol_id); + erase_resol_from_cache(get_ptr_cache(), resol_id); } // this version is using graphs and caches the graphs template -void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, - const Real *A, int lda, int *offsetsA, const Real *B, - int ldb, int *offsetsB, Real beta, Real *C, int ldc, - int *offsetsC, int batchCount, hipStream_t stream, +void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k, + Real alpha, const Real *A, int lda, int *offsetsA, + const Real *B, int ldb, int *offsetsB, Real beta, Real *C, + int ldc, int *offsetsC, int batchCount, hipStream_t stream, int blas_id, void *growing_allocator) { growing_allocator_register_free_c(growing_allocator, - free_gemm_cache); + free_gemm_graph_cache); // we store at most one graph per "m" (# fields) and "blas id" - auto &graphCache = get_graph_cache(); + auto &graphCache = get_graph_cache(); // we also store A, B, and C and recreate the graph if they change - auto &ptrCache = get_ptr_cache(); + auto &ptrCache = get_ptr_cache(); - auto key = std::make_pair(m, blas_id); + auto key = cache_key{resol_id, m, blas_id}; auto ptrs = ptrCache.find(key); if (ptrs != ptrCache.end() && @@ -86,13 +111,14 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, // the plan is cached, but the pointers are not correct. we remove and // delete the graph, but we keep the hipblas handles, if this happens more // often, we should cache this... - std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow." << std::endl; + std::cout + << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow." + << std::endl; std::cout << "We have an entry with key {m=" << m << ", blas_id=" << blas_id - << "}\n"; + << ", resol_id=" << resol_id << "}\n"; std::cout << "Pointers: " << std::get<0>(ptrs->second) << ", " << std::get<1>(ptrs->second) << ", " << std::get<2>(ptrs->second) << " vs. " << A << ", " << B << ", " << C << std::endl; - HIC_CHECK(hipGraphExecDestroy(graphCache[key])); graphCache.erase(key); ptrCache.erase(key); } @@ -115,32 +141,36 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, hipGraph_t my_graph; HIC_CHECK(hipStreamEndCapture(stream, &my_graph)); hipGraphNode_t my_node; - HIC_CHECK(hipGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, - my_graph)); + HIC_CHECK( + hipGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, my_graph)); } hipGraphExec_t instance; HIC_CHECK(hipGraphInstantiate(&instance, new_graph, NULL, NULL, 0)); HIC_CHECK(hipStreamDestroy(stream)); HIC_CHECK(hipGraphDestroy(new_graph)); - graphCache.insert({key, instance}); + graphCache.insert({key, std::shared_ptr( + new hipGraphExec_t{instance}, [](auto ptr) { + HIC_CHECK(hipGraphExecDestroy(*ptr)); + delete ptr; + })}); ptrCache.insert({key, std::make_tuple(A, B, C)}); } - HIC_CHECK(hipGraphLaunch(graphCache.at(key), stream)); + HIC_CHECK(hipGraphLaunch(*graphCache.at(key), stream)); } // stupid simple gemm calls template -void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, - int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB, - Real beta, Real *C, int ldc, int *offsetsC, int batchCount, - hipStream_t stream, int = -1) { +void run_group(Gemm &&gemm, int resol_id, int m, int *n, int *k, Real alpha, + const Real *A, int lda, int *offsetsA, const Real *B, int ldb, + int *offsetsB, Real beta, Real *C, int ldc, int *offsetsC, + int batchCount, hipStream_t stream, int = -1) { for (int i = 0; i < batchCount; ++i) { if (m == 0 || n[i] == 0 || k[i] == 0) continue; - gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb, - beta, C + offsetsC[i], ldc); + gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], + ldb, beta, C + offsetsC[i], ldc); } } @@ -148,7 +178,6 @@ void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, #include "hicblas_cutlass.cuda.h" #endif -namespace detail { hipblasHandle_t get_hipblas_handle() { static hipblasHandle_t handle; if (!handle) @@ -157,6 +186,7 @@ hipblasHandle_t get_hipblas_handle() { } template struct hipblas_gemm_grouped { public: + using real_type = Real; hipblas_gemm_grouped(hipblasOperation_t transa, hipblasOperation_t transb) : transa_(transa), transb_(transb) { // we need to get the hipblas handle here, otherwise this could be created @@ -171,156 +201,166 @@ template struct hipblas_gemm_grouped { if constexpr (std::is_same::value) HICBLAS_CHECK(hipblasSgemm(handle, transa_, transb_, m, n, k, &alpha, A, - lda, B, ldb, &beta, C, ldc)); + lda, B, ldb, &beta, C, ldc)); if constexpr (std::is_same::value) HICBLAS_CHECK(hipblasDgemm(handle, transa_, transb_, m, n, k, &alpha, A, - lda, B, ldb, &beta, C, ldc)); + lda, B, ldb, &beta, C, ldc)); } private: hipblasOperation_t transa_, transb_; }; -} // namespace detail #ifndef USE_CUTLASS -void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, - int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, - float *C, int ldc, int *offsetsC, - int batchCount, hipStream_t stream, - void *growing_allocator) { - - hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; - if (transa=='T' || transa=='t') - op_t1=HIPBLAS_OP_T; - if (transb=='T' || transb=='t') - op_t2=HIPBLAS_OP_T; - - using namespace detail; +void hipblas_sgemm_wrapper_grouped( + int resol_id, int blas_id, char transa, char transb, int m, int *n, int *k, + float alpha, const float *A, int lda, int *offsetsA, const float *B, + int ldb, int *offsetsB, float beta, float *C, int ldc, int *offsetsC, + int batchCount, hipStream_t stream, void *growing_allocator) { + + hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; + if (transa == 'T' || transa == 't') + op_t1 = HIPBLAS_OP_T; + if (transb == 'T' || transb == 't') + op_t2 = HIPBLAS_OP_T; + #ifdef USE_GRAPHS_GEMM - run_group_graph(hipblas_gemm_grouped(op_t1, op_t2), m, n, k, alpha, A, - lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, - batchCount, stream, blas_id, growing_allocator); + run_group_graph(hipblas_gemm_grouped(op_t1, op_t2), resol_id, m, n, k, + alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, + offsetsC, batchCount, stream, blas_id, growing_allocator); #else - run_group(hipblas_gemm_grouped(op_t1, op_t2), m, n, k, alpha, A, - lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, + run_group(hipblas_gemm_grouped(op_t1, op_t2), resol_id, m, n, k, alpha, + A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream); #endif } #endif -void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, - int m, int *n, int *k, - double alpha, - const double *A, int lda, int *offsetsA, - const double *B, int ldb, int *offsetsB, - double beta, - double *C, int ldc, int *offsetsC, - int batchCount, hipStream_t stream, void *) { - - hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; - if (transa=='T' || transa=='t') - op_t1=HIPBLAS_OP_T; - if (transb=='T' || transb=='t') - op_t2=HIPBLAS_OP_T; - - using namespace detail; - run_group(hipblas_gemm_grouped(op_t1, op_t2), m, n, k, alpha, - A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, - batchCount, stream, blas_id); +void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa, + char transb, int m, int *n, int *k, + double alpha, const double *A, int lda, + int *offsetsA, const double *B, int ldb, + int *offsetsB, double beta, double *C, + int ldc, int *offsetsC, int batchCount, + hipStream_t stream, void *) { + + hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; + if (transa == 'T' || transa == 't') + op_t1 = HIPBLAS_OP_T; + if (transb == 'T' || transb == 't') + op_t2 = HIPBLAS_OP_T; + + run_group(hipblas_gemm_grouped(op_t1, op_t2), resol_id, m, n, k, + alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, + batchCount, stream, blas_id); } } // namespace extern "C" { -void hipblas_dgemm_wrapper (char transa, char transb, - int m, int n,int k, double alpha, - const double *A, int lda, int tda, - const double *B, int ldb, int tdb, double beta, - double *C, int ldc, int tdc, int batchCount, - size_t stream, - void *growing_allocator) { - - hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; - - if (transa=='T' || transa=='t') - op_t1=HIPBLAS_OP_T; - if (transb=='T' || transb=='t') - op_t2=HIPBLAS_OP_T; - - if (!hip_alreadyAllocated_dgemm_handle){ +void hipblas_dgemm_wrapper(char transa, char transb, int m, int n, int k, + double alpha, const double *A, int lda, int tda, + const double *B, int ldb, int tdb, double beta, + double *C, int ldc, int tdc, int batchCount, + size_t stream, void *growing_allocator) { + + hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; + + if (transa == 'T' || transa == 't') + op_t1 = HIPBLAS_OP_T; + if (transb == 'T' || transb == 't') + op_t2 = HIPBLAS_OP_T; + + if (!hip_alreadyAllocated_dgemm_handle) { HICBLAS_CHECK(hipblasCreate(&handle_hip_dgemm)); - hip_alreadyAllocated_dgemm_handle=true; + hip_alreadyAllocated_dgemm_handle = true; } - hipblasHandle_t handle = detail::get_hipblas_handle(); - HICBLAS_CHECK( - hipblasSetStream(handle, *(hipStream_t*)stream)); - - HICBLAS_CHECK(hipblasDgemmStridedBatched(handle,op_t1,op_t2,m,n,k, - &alpha,(const double *) A,lda,tda, (const double *) B,ldb,tdb, - &beta,(double *) C,ldc,tdc,batchCount)); + hipblasHandle_t handle = get_hipblas_handle(); + HICBLAS_CHECK(hipblasSetStream(handle, *(hipStream_t *)stream)); + HICBLAS_CHECK(hipblasDgemmStridedBatched( + handle, op_t1, op_t2, m, n, k, &alpha, (const double *)A, lda, tda, + (const double *)B, ldb, tdb, &beta, (double *)C, ldc, tdc, batchCount)); } -void hipblas_sgemm_wrapper (char transa, char transb, - int m, int n,int k, float alpha, - const float *A, int lda, int tda, - const float *B, int ldb, int tdb, float beta, - float *C, int ldc, int tdc, - int batchCount, - void *growing_allocator) { +void hipblas_sgemm_wrapper(char transa, char transb, int m, int n, int k, + float alpha, const float *A, int lda, int tda, + const float *B, int ldb, int tdb, float beta, + float *C, int ldc, int tdc, int batchCount, + void *growing_allocator) { - hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; + hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; - if (transa=='T' || transa=='t') - op_t1=HIPBLAS_OP_T; - if (transb=='T' || transb=='t') - op_t2=HIPBLAS_OP_T; + if (transa == 'T' || transa == 't') + op_t1 = HIPBLAS_OP_T; + if (transb == 'T' || transb == 't') + op_t2 = HIPBLAS_OP_T; - if (!hip_alreadyAllocated_sgemm_handle){ + if (!hip_alreadyAllocated_sgemm_handle) { HICBLAS_CHECK(hipblasCreate(&handle_hip_sgemm)); - hip_alreadyAllocated_sgemm_handle=true; + hip_alreadyAllocated_sgemm_handle = true; } - HICBLAS_CHECK(hipblasSgemmStridedBatched(handle_hip_sgemm,op_t1,op_t2,m,n,k, - &alpha,(const float *) A,lda,tda, (const float *) B,ldb,tdb, - &beta,(float*) C,ldc,tdc,batchCount)); - + HICBLAS_CHECK(hipblasSgemmStridedBatched( + handle_hip_sgemm, op_t1, op_t2, m, n, k, &alpha, (const float *)A, lda, + tda, (const float *)B, ldb, tdb, &beta, (float *)C, ldc, tdc, + batchCount)); } -void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, - int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, - float *C, int ldc, int *offsetsC, - int batchCount, size_t stream, - void *growing_allocator) { +void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, + char transb, int m, int *n, int *k, + float alpha, const float *A, int lda, + int *offsetsA, const float *B, int ldb, + int *offsetsB, float beta, float *C, int ldc, + int *offsetsC, int batchCount, size_t stream, + void *growing_allocator) { #ifdef USE_CUTLASS - cutlass_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, - B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, - *(hipStream_t*)stream, - growing_allocator); + cutlass_sgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k, + alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, + C, ldc, offsetsC, batchCount, + *(hipStream_t *)stream, growing_allocator); #else - hipblas_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, - offsetsA, B, ldb, offsetsB, beta, C, ldc, - offsetsC, batchCount, - *(hipStream_t*)stream, - growing_allocator); + hipblas_sgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k, + alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, + C, ldc, offsetsC, batchCount, + *(hipStream_t *)stream, growing_allocator); #endif } -void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, - int m, int *n, int *k, double alpha, - const double *A, int lda, int *offsetsA, - const double *B, int ldb, int *offsetsB, double beta, - double *C, int ldc, int *offsetsC, - int batchCount, size_t stream, - void *growing_allocator) { - hipblas_dgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, - ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, - *(hipStream_t*)stream, - growing_allocator); +void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa, + char transb, int m, int *n, int *k, + double alpha, const double *A, int lda, + int *offsetsA, const double *B, int ldb, + int *offsetsB, double beta, double *C, + int ldc, int *offsetsC, int batchCount, + size_t stream, void *growing_allocator) { + hipblas_dgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k, + alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, + C, ldc, offsetsC, batchCount, + *(hipStream_t *)stream, growing_allocator); +} + +void clean_gemm(int resol_id) { + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); +#ifdef USE_CUTLASS + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); +#endif } } diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index 988e1b3e..5341cff2 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -63,10 +63,16 @@ SUBROUTINE HIP_SGEMM_BATCHED( & TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC END SUBROUTINE HIP_SGEMM_BATCHED END INTERFACE +INTERFACE + SUBROUTINE CLEAN_GEMM(RESOL_ID) BIND(C, NAME="clean_gemm") + USE ISO_C_BINDING + INTEGER(KIND=C_INT), INTENT(IN), VALUE :: RESOL_ID + END SUBROUTINE +END INTERFACE INTERFACE SUBROUTINE HIP_DGEMM_GROUPED( & - & BLAS_ID, CTA, CTB, & + & RESOL_ID, BLAS_ID, CTA, CTB, & & M, N, K, & & ALPHA, & & A, LDA, OFFSETA, & @@ -77,7 +83,7 @@ SUBROUTINE HIP_DGEMM_GROUPED( & &) BIND(C, NAME='hipblas_dgemm_wrapper_grouped') USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT + INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_DOUBLE), VALUE :: ALPHA,BETA REAL(C_DOUBLE) :: A(*), B(*), C(*) @@ -86,7 +92,7 @@ SUBROUTINE HIP_DGEMM_GROUPED( & END SUBROUTINE HIP_DGEMM_GROUPED SUBROUTINE HIP_SGEMM_GROUPED( & - & BLAS_ID, CTA, CTB, & + & RESOL_ID, BLAS_ID, CTA, CTB, & & M, N, K, & & ALPHA, & & A, LDA, OFFSETA, & @@ -97,7 +103,7 @@ SUBROUTINE HIP_SGEMM_GROUPED( & &) BIND(C, NAME='hipblas_sgemm_wrapper_grouped') USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT + INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) @@ -203,7 +209,7 @@ SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( & END SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & - & BLAS_ID, TRANSA, TRANSB, & + & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & @@ -212,6 +218,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & & CARRAY, LDC, OFFSETC, & & BATCHCOUNT, STREAM, ALLOC) USE ISO_C_BINDING, ONLY: C_INT, C_CHAR, C_LONG, C_LOC + INTEGER(KIND=C_INT), INTENT(IN) :: RESOL_ID INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB INTEGER(KIND=JPIM) :: M @@ -237,7 +244,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) CALL HIP_DGEMM_GROUPED( & - & BLAS_ID, TRANSA, TRANSB, & + & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & @@ -249,7 +256,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & END SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& - & BLAS_ID, TRANSA, TRANSB, & + & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & @@ -258,6 +265,7 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& & CARRAY, LDC, OFFSETC, & & BATCHCOUNT, STREAM, ALLOC) USE ISO_C_BINDING, ONLY: C_INT, C_CHAR, C_LONG, C_LOC + INTEGER(KIND=C_INT), INTENT(IN) :: RESOL_ID INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB INTEGER(KIND=JPIM) :: M @@ -286,7 +294,7 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) #endif CALL HIP_SGEMM_GROUPED( & - & BLAS_ID, TRANSA, TRANSB, & + & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & diff --git a/src/trans/gpu/algor/hicfft.hip.cpp b/src/trans/gpu/algor/hicfft.hip.cpp index 8278d9a4..488610d7 100644 --- a/src/trans/gpu/algor/hicfft.hip.cpp +++ b/src/trans/gpu/algor/hicfft.hip.cpp @@ -1,5 +1,8 @@ #include "hicfft.h" +#include +#include + #include "growing_allocator.h" #define fftSafeCall(err) __fftSafeCall(err, __FILE__, __LINE__) @@ -43,57 +46,100 @@ template class hicfft_plan { real *data_real_l = &data_real[offset]; cmplx *data_complex_l = &data_complex[offset / 2]; if constexpr (Direction == HIPFFT_R2C) - fftSafeCall(hipfftExecR2C(handle, data_real_l, data_complex_l)); + fftSafeCall(hipfftExecR2C(*handle_ptr, data_real_l, data_complex_l)); else if constexpr (Direction == HIPFFT_C2R) - fftSafeCall(hipfftExecC2R(handle, data_complex_l, data_real_l)); + fftSafeCall(hipfftExecC2R(*handle_ptr, data_complex_l, data_real_l)); else if constexpr (Direction == HIPFFT_D2Z) - fftSafeCall(hipfftExecD2Z(handle, data_real_l, data_complex_l)); + fftSafeCall(hipfftExecD2Z(*handle_ptr, data_real_l, data_complex_l)); else if constexpr (Direction == HIPFFT_Z2D) - fftSafeCall(hipfftExecZ2D(handle, data_complex_l, data_real_l)); + fftSafeCall(hipfftExecZ2D(*handle_ptr, data_complex_l, data_real_l)); } void set_stream(hipStream_t stream) { - fftSafeCall(hipfftSetStream(handle, stream)); + fftSafeCall(hipfftSetStream(*handle_ptr, stream)); } hicfft_plan(hipfftHandle handle_, int offset_) - : handle(handle_), offset(offset_) {} + : handle_ptr(new hipfftHandle{handle_}, + [](auto ptr) { + fftSafeCall(hipfftDestroy(*ptr)); + delete ptr; + }), + offset(offset_) {} private: - hipfftHandle handle; + std::shared_ptr handle_ptr; int offset; }; +struct cache_key { + int resol_id; + int kfield; + bool operator==(const cache_key &other) const { + return resol_id == other.resol_id && kfield == other.kfield; + } + cache_key(int resol_id_, int kfield_) + : resol_id(resol_id_), kfield(kfield_) {} +}; +} // namespace + +template <> struct std::hash { + std::size_t operator()(const cache_key &k) const { + return k.resol_id * 10000 + k.kfield; + } +}; + +namespace { // kfield -> handles template auto &get_fft_plan_cache() { - static std::unordered_map>> + static std::unordered_map>> fftPlansCache; return fftPlansCache; } // kfield -> graphs template auto &get_graph_cache() { - static std::unordered_map graphCache; + static std::unordered_map> + graphCache; return graphCache; } // kfield -> ptrs template auto &get_ptr_cache() { using real = typename Type::real; using cmplx = typename Type::cmplx; - static std::unordered_map> ptrCache; + static std::unordered_map> ptrCache; return ptrCache; } template -void free_fft_cache(float *, size_t) { +void free_fft_graph_cache(float *, size_t) { get_graph_cache().clear(); get_ptr_cache().clear(); } +template +void erase_resol_from_cache(Cache &cache, int resol_id) { + // Note that in C++20 this could also be std::erase_if + int erased = 0; + for (auto it = cache.begin(); it != cache.end();) { + if (it->first.resol_id == resol_id) { + it = cache.erase(it); + ++erased; + } else + ++it; + } +} +template +void erase_from_caches(int resol_id) { + erase_resol_from_cache(get_fft_plan_cache(), resol_id); + erase_resol_from_cache(get_graph_cache(), resol_id); + erase_resol_from_cache(get_ptr_cache(), resol_id); +} template -std::vector> plan_all(int kfield, int *loens, - int nfft, int *offsets) { +std::vector> +plan_all(int resol_id, int kfield, int *loens, int nfft, int *offsets) { static constexpr bool is_forward = Direction == HIPFFT_R2C || Direction == HIPFFT_D2Z; - auto key = kfield; + auto key = cache_key{resol_id, kfield}; auto &fftPlansCache = get_fft_plan_cache(); auto fftPlans = fftPlansCache.find(key); if (fftPlans == fftPlansCache.end()) { @@ -104,7 +150,6 @@ std::vector> plan_all(int kfield, int *loens, int nloen = loens[i]; hipfftHandle plan; - fftSafeCall(hipfftCreate(&plan)); int dist = offsets[i + 1] - offsets[i]; int embed[] = {1}; fftSafeCall(hipfftPlanMany( @@ -119,17 +164,18 @@ std::vector> plan_all(int kfield, int *loens, template void run_group_graph(typename Type::real *data_real, - typename Type::cmplx *data_complex, int kfield, int *loens, - int *offsets, int nfft, void *growing_allocator) { + typename Type::cmplx *data_complex, int resol_id, + int kfield, int *loens, int *offsets, int nfft, + void *growing_allocator) { growing_allocator_register_free_c(growing_allocator, - free_fft_cache); + free_fft_graph_cache); // if the pointers are changed, we need to update the graph auto &ptrCache = get_ptr_cache(); // kfield -> ptrs auto &graphCache = get_graph_cache(); // kfield -> graphs - auto key = kfield; + auto key = cache_key{resol_id, kfield}; auto ptrs = ptrCache.find(key); if (ptrs != ptrCache.end() && (ptrs->second.first != data_real || ptrs->second.second != data_complex)) { @@ -138,7 +184,6 @@ void run_group_graph(typename Type::real *data_real, // we should cache this... std::cout << "WARNING FFT: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; - HIC_CHECK(hipGraphExecDestroy(graphCache[key])); graphCache.erase(key); ptrCache.erase(key); } @@ -146,7 +191,8 @@ void run_group_graph(typename Type::real *data_real, auto graph = graphCache.find(key); if (graph == graphCache.end()) { // this graph does not exist yet - auto plans = plan_all(kfield, loens, nfft, offsets); + auto plans = + plan_all(resol_id, kfield, loens, nfft, offsets); // create a temporary stream hipStream_t stream; @@ -172,19 +218,24 @@ void run_group_graph(typename Type::real *data_real, HIC_CHECK(hipStreamDestroy(stream)); HIC_CHECK(hipGraphDestroy(new_graph)); - graphCache.insert({key, instance}); + graphCache.insert({key, std::shared_ptr( + new hipGraphExec_t{instance}, [](auto ptr) { + HIC_CHECK(hipGraphExecDestroy(*ptr)); + delete ptr; + })}); ptrCache.insert({key, std::make_pair(data_real, data_complex)}); } - HIC_CHECK(hipGraphLaunch(graphCache.at(key), 0)); + HIC_CHECK(hipGraphLaunch(*graphCache.at(key), 0)); HIC_CHECK(hipDeviceSynchronize()); } template void run_group(typename Type::real *data_real, - typename Type::cmplx *data_complex, int kfield, int *loens, - int *offsets, int nfft, void *growing_allocator) { - auto plans = plan_all(kfield, loens, nfft, offsets); + typename Type::cmplx *data_complex, int resol_id, int kfield, + int *loens, int *offsets, int nfft, void *growing_allocator) { + auto plans = + plan_all(resol_id, kfield, loens, nfft, offsets); for (auto &plan : plans) plan.exec(data_real, data_complex); @@ -199,29 +250,37 @@ extern "C" { #define RUN run_group #endif void execute_dir_fft_float(float *data_real, hipfftComplex *data_complex, - int kfield, int *loens, int *offsets, int nfft, - void *growing_allocator) { - RUN(data_real, data_complex, kfield, loens, offsets, nfft, - growing_allocator); + int resol_id, int kfield, int *loens, int *offsets, + int nfft, void *growing_allocator) { + RUN(data_real, data_complex, resol_id, kfield, loens, + offsets, nfft, growing_allocator); } void execute_inv_fft_float(hipfftComplex *data_complex, float *data_real, - int kfield, int *loens, int *offsets, int nfft, - void *growing_allocator) { - RUN(data_real, data_complex, kfield, loens, offsets, nfft, - growing_allocator); + int resol_id, int kfield, int *loens, int *offsets, + int nfft, void *growing_allocator) { + RUN(data_real, data_complex, resol_id, kfield, loens, + offsets, nfft, growing_allocator); } void execute_dir_fft_double(double *data_real, - hipfftDoubleComplex *data_complex, int kfield, - int *loens, int *offsets, int nfft, + hipfftDoubleComplex *data_complex, int resol_id, + int kfield, int *loens, int *offsets, int nfft, void *growing_allocator) { - RUN(data_real, data_complex, kfield, loens, offsets, nfft, - growing_allocator); + RUN(data_real, data_complex, resol_id, kfield, loens, + offsets, nfft, growing_allocator); } void execute_inv_fft_double(hipfftDoubleComplex *data_complex, - double *data_real, int kfield, int *loens, - int *offsets, int nfft, void *growing_allocator) { - RUN(data_real, data_complex, kfield, loens, offsets, nfft, - growing_allocator); + double *data_real, int resol_id, int kfield, + int *loens, int *offsets, int nfft, + void *growing_allocator) { + RUN(data_real, data_complex, resol_id, kfield, loens, + offsets, nfft, growing_allocator); } #undef RUN + +void clean_fft(int resol_id) { + erase_from_caches(resol_id); + erase_from_caches(resol_id); + erase_from_caches(resol_id); + erase_from_caches(resol_id); +} } diff --git a/src/trans/gpu/external/gpnorm_trans.F90 b/src/trans/gpu/external/gpnorm_trans.F90 index 7a691a53..711cd7f2 100755 --- a/src/trans/gpu/external/gpnorm_trans.F90 +++ b/src/trans/gpu/external/gpnorm_trans.F90 @@ -59,11 +59,9 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) USE TPM_GEN, ONLY: NOUT USE TPM_DIM, ONLY: R USE TPM_TRANS, ONLY: LGPNORM, NGPBLKS, NPROMA -USE TPM_DISTR, ONLY: D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, D_NSTAGTF, & - & D_NPTRLS, MYPROC -USE TPM_GEOMETRY, ONLY: G, G_NLOEN +USE TPM_DISTR, ONLY: D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, MYPROC +USE TPM_GEOMETRY, ONLY: G USE TPM_FIELDS, ONLY: F -USE TPM_FIELDS_FLAT, ONLY: F_RW USE SET_RESOL_MOD, ONLY: SET_RESOL USE SET2PE_MOD, ONLY: SET2PE USE MPL_MODULE, ONLY: MPL_RECV, MPL_SEND, JP_BLOCKING_STANDARD @@ -142,6 +140,7 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) CALL ABORT_TRANS('GPNORM_TRANS:THIRD DIMENSION OF PGP TOO SMALL ') ENDIF +ASSOCIATE(F_RW=>F%RW, D_NSTAGTF=>D%NSTAGTF, D_NPTRLS=>D%NPTRLS, G_NLOEN=>G%NLOEN) IF_GP=KFIELDS IF_SCALARS_G=KFIELDS @@ -253,6 +252,8 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) !$ACC end data CALL GSTATS(1429,1) +END ASSOCIATE + ! from here rest on CPU ! IT IS IMPORTANT THAT SUMS ARE NOW DONE IN LATITUDE ORDER diff --git a/src/trans/gpu/external/gpnorm_trans_gpu.F90 b/src/trans/gpu/external/gpnorm_trans_gpu.F90 index d72b7a1b..e6b280ca 100755 --- a/src/trans/gpu/external/gpnorm_trans_gpu.F90 +++ b/src/trans/gpu/external/gpnorm_trans_gpu.F90 @@ -58,9 +58,9 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) USE TPM_GEN, ONLY: NOUT USE TPM_DIM, ONLY: R USE TPM_TRANS, ONLY: LGPNORM, NGPBLKS, NPROMA -USE TPM_DISTR, ONLY: D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, D_NSTAGTF, D_NPTRLS -USE TPM_GEOMETRY, ONLY: G, G_NLOEN, G_NLOEN_MAX -USE TPM_FIELDS_FLAT, ONLY: F_RW +USE TPM_DISTR, ONLY: D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC +USE TPM_GEOMETRY, ONLY: G +USE TPM_FIELDS, ONLY: F USE SET_RESOL_MOD, ONLY: SET_RESOL USE TRGTOL_MOD, ONLY: TRGTOL USE SET2PE_MOD, ONLY: SET2PE @@ -197,17 +197,18 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) IBEG=1 IEND=D%NDGL_FS +ASSOCIATE(D_NSTAGTF=>D%NSTAGTF,D_NPTRLS=>D%NPTRLS,G_NLOEN=>G%NLOEN,F_RW=>F%RW) + CALL GSTATS(1429,0) IF( IF_FS > 0 )THEN #ifdef ACCGPU !$ACC DATA & - !$ACC& COPY(F_RW) & - !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN,G_NLOEN_MAX) & + !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN,F,F_RW) & !$ACC& PRESENT(ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) #endif #ifdef OMPGPU - !$OMP TARGET DATA MAP(TO:F_RW,D,D_NSTAGTF,D_NPTRLS,G_NLOEN,G_NLOEN_MAX) & + !$OMP TARGET DATA MAP(TO:F,D,D_NSTAGTF,D_NPTRLS,G_NLOEN) & !$OMP& MAP(PRESENT,ALLOC:ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO #endif @@ -274,7 +275,7 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) IGL = D_NPTRLS(MYSETW) + JGL - 1 DO JF=1,IF_FS ZAVE(JF,JGL)=ZAVE(JF,JGL)*F_RW(IGL)/G_NLOEN(IGL) - !write(iunit,*) 'aver inside ',JF,IF_FS,IGL,ZAVE(JF,JGL), F_RW(IGL), G_NLOEN(IGL),ZMINGPN(JF),ZMAXGPN(JF) + !write(iunit,*) 'aver inside ',JF,IF_FS,IGL,ZAVE(JF,JGL), F%RW(IGL), G_NLOEN(IGL),ZMINGPN(JF),ZMAXGPN(JF) ENDDO ENDDO #ifdef ACCGPU @@ -316,6 +317,8 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ENDIF CALL GSTATS(1429,1) +END ASSOCIATE + ! from here rest on CPU ! IT IS IMPORTANT THAT SUMS ARE NOW DONE IN LATITUDE ORDER diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index bf1cec9d..82670fea 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -103,20 +103,15 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !ifndef INTERFACE -USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_INT, C_ASSOCIATED, C_SIZE_T +USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_INT, C_ASSOCIATED, C_SIZE_T, C_SIZEOF USE EC_ENV_MOD, ONLY: EC_GETENV USE TPM_GEN, ONLY: NOUT, MSETUP0, NCUR_RESOL, NDEF_RESOL, & & NMAX_RESOL, NPRINTLEV, LENABLED, NERR -USE TPM_DIM, ONLY: R, DIM_RESOL, R_NSMAX,R_NTMAX, R_NDGNH, R_NDGL -USE TPM_DISTR, ONLY: D, DISTR_RESOL, NPROC, NPRTRV, D_NUMP, D_NDGL_FS, D_MYMS, & - & D_NSTAGT0B, D_NSTAGT1B, D_NPROCL, D_NPNTGTB1, D_NASM0, & - & D_NSTAGTF, D_MSTABF, D_NPNTGTB0, D_NPROCM, D_NPTRLS, & - & MYPROC, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 -USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX, G_NLOEN, & - & G_NLOEN_MAX +USE TPM_DIM, ONLY: R, DIM_RESOL +USE TPM_DISTR, ONLY: D, DISTR_RESOL, NPROC, NPRTRV, MYPROC +USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL USE TPM_FIELDS, ONLY: FIELDS_RESOL, F -USE TPM_FIELDS_FLAT, ONLY: F_RW, F_RLAPIN, F_RACTHE, ZEPSNM, & - & ZAA, ZAS, ZAA0, ZAS0, KMLOC0 +USE TPM_FIELDS_GPU, ONLY: FIELDS_GPU_RESOL, FG USE TPM_FLT, ONLY: FLT_RESOL, S USE TPM_CTL, ONLY: CTL_RESOL, C USE SET_RESOL_MOD, ONLY: SET_RESOL @@ -173,7 +168,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& INTEGER(KIND=JPIM) :: JGL,JRES,IDEF_RESOL INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, I, J -INTEGER(KIND=JPIM) :: IPROC, IPROCS, ISTAN, ISTAS, ISL, IGLS, JFLD +INTEGER(KIND=JPIM) :: IPROC, IPROCS, ISTAN, ISTAS, ISL, IGLS, JFLD, IMLOC0(1) LOGICAL :: LLP1,LLP2, LLSPSETUPONLY REAL(KIND=JPRD) :: ZTIME0,ZTIME1,ZTIME2 @@ -232,6 +227,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IDEF_RESOL = 1 ALLOCATE(DIM_RESOL(NMAX_RESOL)) ALLOCATE(FIELDS_RESOL(NMAX_RESOL)) + ALLOCATE(FIELDS_GPU_RESOL(NMAX_RESOL)) ALLOCATE(GEOM_RESOL(NMAX_RESOL)) ALLOCATE(DISTR_RESOL(NMAX_RESOL)) ALLOCATE(FLT_RESOL(NMAX_RESOL)) @@ -457,15 +453,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IF( .NOT.D%LGRIDONLY ) THEN -!allocating arrays for the GPU: -!! CALL EC_GETENV("ECTRANS_GPU_NFLEV",CENV) -!! IF(LEN_TRIM(CENV)>0) THEN -!! WRITE(NOUT,'(2A)') "Using temporary solution for buffer allocation using ${ECTRANS_GPU_NFLEV}=",CENV -!! READ(CENV,*) NFLEV0 -!! ELSE -!! NFLEV0 = ceiling(REAL(IMAXFLD)/NPRTRV) -!! ENDIF - IUNIT=300+MYPROC #ifdef ACCGPU @@ -481,33 +468,13 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& WRITE(NOUT,*) 'R%NTMAX=',R%NTMAX WRITE(NOUT,*) 'R%NSMAX=',R%NSMAX -#ifdef ACCGPU -!$ACC ENTER DATA COPYIN(F,S,D,R,G) -!$ACC ENTER DATA & -!$ACC& COPYIN(F%RN,F%RLAPIN) & -!$ACC& COPYIN(S%FA,S%ITHRESHOLD) & -!$ACC& COPYIN(D%NUMP,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM,D%NPTRLS,D%MSTABF) & -!$ACC& COPYIN(R%NDGNH,R%NSMAX) & -!$ACC& COPYIN(G%NDGLU,G%NMEN,G%NLOEN) - -#endif -#ifdef OMPGPU -!$OMP TARGET ENTER DATA MAP(ALLOC:ZAA,ZAS) -!$OMP TARGET ENTER DATA MAP(TO:F,S,D,D_NUMP,D_MYMS,R,R_NDGNH,R_NSMAX,G,G_NDGLU) -!$OMP TARGET ENTER DATA MAP(TO:D_NPNTGTB0,D_NPNTGTB1,D_NSTAGT0B,D_NSTAGT1B,D_NSTAGTF,G_NMEN,D_NPROCM,D_NPTRLS,G,G_NLOEN,D_MSTABF) -#endif - ! Initialize A arrays -ALLOCATE(ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) -ALLOCATE(ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) +ALLOCATE(FG%ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) +ALLOCATE(FG%ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) -WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP -WRITE(NOUT,*)'ZAS:',size(ZAS) -WRITE(NOUT,*)'ZAA:',size(ZAA) - -ZAA(:,:,:) = 0._JPRBT -ZAS(:,:,:) = 0._JPRBT +FG%ZAA(:,:,:) = 0._JPRBT +FG%ZAS(:,:,:) = 0._JPRBT DO JMLOC=1,D%NUMP KM = D%MYMS(JMLOC) @@ -515,203 +482,62 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ILA = (R%NSMAX-KM+2)/2 ILS = (R%NSMAX-KM+3)/2 - ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) - ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + FG%ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) + FG%ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) ENDDO -! permanent copy of Legendre polynomials into device - -#ifdef ACCGPU -!$ACC ENTER DATA COPYIN(ZAA,ZAS) -#endif -#ifdef OMPGPU -#endif +! arrays for m=0 in ledir_mod: +IMLOC0 = FINDLOC(D%MYMS,0) +IF(IMLOC0(1) > 0) THEN + ALLOCATE(FG%ZAA0(SIZE(FG%ZAA,1),SIZE(FG%ZAA,2))) + ALLOCATE(FG%ZAS0(SIZE(FG%ZAS,1),SIZE(FG%ZAS,2))) + FG%ZAA0 = FG%ZAA(:,:,IMLOC0(1)) + FG%ZAS0 = FG%ZAS(:,:,IMLOC0(1)) +ENDIF -ALLOCATE(ZEPSNM(D%NUMP,0:R%NTMAX+2)) -WRITE(NOUT,*)'ZEPSNM :',SIZE(ZEPSNM) +ALLOCATE(FG%ZEPSNM(D%NUMP,0:R%NTMAX+2)) +FG%ZEPSNM = 0._JPRBT +CALL PREPSNM !Initialize on the host -ZEPSNM = 0._JPRBT -! on the host -CALL PREPSNM +WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP #ifdef ACCGPU -!$ACC ENTER DATA COPYIN(ZEPSNM) + WRITE(NOUT,*) 'Using OpenACC' #endif - -! TODO: I guess tose might be needed again -! add arrays for GPNORM1 -!ALLOCATE(ZAVE(IF_FS,R%NDGL)) -!ALLOCATE(ZMINGL(IF_FS,R%NDGL)) -!ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) -!ALLOCATE(ZMINGPN(IF_FS)) -!ALLOCATE(ZMAXGPN(IF_FS)) -! -!ZAVE = 0._JPRBT -!ZMINGL = 0._JPRBT -!ZMAXGL = 0._JPRBT -!ZMINGPN = 0._JPRBT -!ZMAXGPN = 0._JPRBT -!#ifdef ACCGPU -!!$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) -!#endif - -!set up flat copies of constant data -R_NSMAX=R%NSMAX -R_NTMAX=R%NTMAX -R_NDGNH=R%NDGNH -R_NDGL=R%NDGL - - -ALLOCATE(D_NSTAGT0B(SIZE(D%NSTAGT0B))) -ALLOCATE(D_NSTAGT1B(SIZE(D%NSTAGT1B))) -ALLOCATE(D_NPNTGTB0(0:SIZE(D%NPNTGTB0,1)-1,SIZE(D%NPNTGTB0,2))) -ALLOCATE(D_NPNTGTB1(SIZE(D%NPNTGTB1,1),SIZE(D%NPNTGTB1,2))) -ALLOCATE(D_MYMS(SIZE(D%MYMS))) -ALLOCATE(D_NPROCL(SIZE(D%NPROCL))) -ALLOCATE(D_NASM0(0:SIZE(D%NASM0)-1)) -ALLOCATE(D_NSTAGTF(SIZE(D%NSTAGTF))) -ALLOCATE(D_MSTABF(SIZE(D%MSTABF))) -ALLOCATE(D_NPROCM(0:SIZE(D%NPROCM)-1)) -ALLOCATE(D_NPTRLS(SIZE(D%NPTRLS))) - -ALLOCATE(G_NDGLU(0:SIZE(G%NDGLU)-1)) -ALLOCATE(G_NMEN(SIZE(G%NMEN))) -ALLOCATE(G_NLOEN(SIZE(G%NLOEN))) - -ALLOCATE(F_RW(SIZE(F%RW))) -ALLOCATE(F_RLAPIN(-1:SIZE(F%RLAPIN)-2)) -ALLOCATE(F_RACTHE(SIZE(F%RACTHE))) - - -DO I=0,SIZE(G%NDGLU)-1 - G_NDGLU(I)=G%NDGLU(I) -END DO - -G_NMEN_MAX=0 -DO I=1,SIZE(G%NMEN) - G_NMEN(I)=G%NMEN(I) - IF (G_NMEN(I) .GT. G_NMEN_MAX) G_NMEN_MAX=G_NMEN(I) -END DO - -G_NLOEN_MAX=0 -DO I=1,SIZE(G%NLOEN) - G_NLOEN(I)=G%NLOEN(I) - IF (G_NLOEN(I) .GT. G_NLOEN_MAX) G_NLOEN_MAX=G_NLOEN(I) -END DO - -DO I=1,SIZE(D%NSTAGT0B) - D_NSTAGT0B(I)=D%NSTAGT0B(I) -END DO - -DO I=1,SIZE(D%NSTAGT1B) - D_NSTAGT1B(I)=D%NSTAGT1B(I) -END DO - -DO I=1,SIZE(D%NPROCL) - D_NPROCL(I)=D%NPROCL(I) -END DO - -DO I=0,SIZE(D%NASM0)-1 - D_NASM0(I)=D%NASM0(I) -END DO - -DO I=1,SIZE(D%NSTAGTF) - D_NSTAGTF(I)=D%NSTAGTF(I) -END DO - -DO I=1,SIZE(D%MSTABF) - D_MSTABF(I)=D%MSTABF(I) -END DO - -DO I=0,SIZE(D%NPROCM)-1 - D_NPROCM(I)=D%NPROCM(I) -END DO - -DO I=1,SIZE(D%NPTRLS) - D_NPTRLS(I)=D%NPTRLS(I) -END DO - -DO I=1,SIZE(D%NPNTGTB0,2) - DO J=0,SIZE(D%NPNTGTB0,1)-1 - D_NPNTGTB0(J,I)=D%NPNTGTB0(J,I) - END DO -END DO - -DO I=1,SIZE(D%NPNTGTB1,2) - DO J=1,SIZE(D%NPNTGTB1,1) - D_NPNTGTB1(J,I)=D%NPNTGTB1(J,I) - END DO -END DO - -D_OFFSETS_GEMM1 => D%OFFSETS_GEMM1 -D_OFFSETS_GEMM2 => D%OFFSETS_GEMM2 #ifdef OMPGPU + WRITE(NOUT,*) 'Using OpenMP offloading' #endif -#ifdef ACCGPU -!$ACC ENTER DATA COPYIN(D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) -#endif - -D_NUMP=D%NUMP -D_NDGL_FS=D%NDGL_FS +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1,1,1))*SIZE(FG%ZAS) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1,1,1))*SIZE(FG%ZAA) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM) -KMLOC0 = -1 -DO I=1,SIZE(D%MYMS) - D_MYMS(I)=D%MYMS(I) - IF(D_MYMS(I) == 0) KMLOC0 = I -end DO - -! arrays for m=0 in ledir_mod: -IF(KMLOC0 >= 0) THEN - ALLOCATE(ZAA0(SIZE(ZAA,1),SIZE(ZAA,2))) - ALLOCATE(ZAS0(SIZE(ZAS,1),SIZE(ZAS,2))) - ZAA0 = ZAA(:,:,KMLOC0) - ZAS0 = ZAS(:,:,KMLOC0) +IF (IMLOC0(1) > 0) THEN #ifdef ACCGPU - !$ACC ENTER DATA COPYIN(ZAA0,ZAS0) -#endif -#ifdef OMPGPU - !$OMP TARGET ENTER DATA MAP(TO:ZAA0,ZAS0) -#endif - WRITE(NOUT,*) 'GPU arrays for m=0 successfully allocated' -#ifdef ACCGPU - WRITE(NOUT,*) 'Using OpenACC' + !$ACC ENTER DATA COPYIN(FG%ZAA0,FG%ZAS0) ASYNC(1) #endif #ifdef OMPGPU - WRITE(NOUT,*) 'Using OpenMP offloading' + !$OMP TARGET ENTER DATA MAP(TO:FG%ZAA0,FG%ZAS0) #endif ENDIF - -DO I=1,SIZE(F%RW) - F_RW(I)=F%RW(I) -END DO -DO I=-1,SIZE(F%RLAPIN)-2 - F_RLAPIN(I)=REAL(F%RLAPIN(I),JPRBT) -END DO -DO I=1,SIZE(F%RACTHE) - F_RACTHE(I)=F%RACTHE(I) -END DO - #ifdef ACCGPU -!$ACC ENTER DATA COPYIN(R_NSMAX,R_NTMAX,R_NDGL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B,& -!$ACC& D_NPNTGTB1,D_NPROCL,D_NUMP,D_NDGL_FS,D_MYMS,D_NASM0,D_NSTAGTF,D_MSTABF,& -!$ACC& D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN,& -!$ACC& G_NLOEN_MAX,F_RW,F_RLAPIN,F_RACTHE) +!$ACC ENTER DATA COPYIN(R) ASYNC(1) +!$ACC ENTER DATA COPYIN(F,F%RLAPIN,F%RACTHE,F%RW) ASYNC(1) +!$ACC ENTER DATA COPYIN(FG,FG%ZAA,FG%ZAS,FG%ZEPSNM) ASYNC(1) +!$ACC ENTER DATA COPYIN(D,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM)& +!$ACC& COPYIN(D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,D%OFFSETS_GEMM2) ASYNC(1) +!$ACC ENTER DATA COPYIN(G,G%NDGLU,G%NMEN,G%NLOEN) ASYNC(1) +!$ACC WAIT(1) #endif #ifdef OMPGPU -!$OMP TARGET ENTER DATA MAP(TO:R_NSMAX,R_NTMAX,R_NDGL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B) -!$OMP TARGET ENTER DATA MAP(TO:D_NPNTGTB1,D_NPROCL,D_NUMP,D_NDGL_FS,D_MYMS,D_NASM0,D_NSTAGTF,D_MSTABF) -!$OMP TARGET ENTER DATA MAP(TO:D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN) -!$OMP TARGET ENTER DATA MAP(TO:G_NLOEN_MAX,F_RW,F_RLAPIN,F_RACTHE) +!$OMP TARGET ENTER DATA MAP(ALLOC:FG%ZAA,FG%ZAS) +!$OMP TARGET ENTER DATA MAP(TO:FG,F,S,D,R,G) +!$OMP BARRIER #endif WRITE(NOUT,*) '===GPU arrays successfully allocated' -#ifdef ACCGPU -!$ACC wait -#endif -#ifdef OMPGPU -!$OMP BARRIER -#endif -! free memory +! TODO: This might be good idea - those polynomials are not needed !DO JMLOC=1,D%NUMP ! DEALLOCATE(S%FA(JMLOC)%RPNMA) ! DEALLOCATE(S%FA(JMLOC)%RPNMS) @@ -719,6 +545,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !endif INTERFACE -ENDIF +ENDIF ! D%LGRIDONLY END SUBROUTINE SETUP_TRANS diff --git a/src/trans/gpu/external/trans_end.F90 b/src/trans/gpu/external/trans_end.F90 index 5329c859..366c759a 100755 --- a/src/trans/gpu/external/trans_end.F90 +++ b/src/trans/gpu/external/trans_end.F90 @@ -47,16 +47,15 @@ SUBROUTINE TRANS_END(CDMODE) !ifndef INTERFACE USE TPM_GEN, ONLY: MSETUP0, NCUR_RESOL, NMAX_RESOL, LENABLED, NDEF_RESOL -USE TPM_DIM, ONLY: R, DIM_RESOL, R_NSMAX, R_NTMAX, R_NDGNH, R_NDGL -USE TPM_DISTR, ONLY: D, DISTR_RESOL, NPRCIDS, D_NUMP, D_MYMS, D_NSTAGT0B, D_NSTAGT1B, & - & D_NPROCL, D_NPNTGTB1, D_NASM0, D_NSTAGTF, D_MSTABF, D_NPNTGTB0, & - & D_NPROCM, D_NPTRLS -USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX +USE TPM_DIM, ONLY: R, DIM_RESOL +USE TPM_DISTR, ONLY: D, DISTR_RESOL, NPRCIDS +USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL USE TPM_FIELDS, ONLY: F, FIELDS_RESOL -USE TPM_FIELDS_FLAT, ONLY: F_RW, ZEPSNM, ZAA, ZAS, ZAA0, ZAS0 +USE TPM_FIELDS_GPU, ONLY: FG, FIELDS_GPU_RESOL USE TPM_CTL, ONLY: C, CTL_RESOL USE TPM_FLT, ONLY: S, FLT_RESOL -USE TPM_TRANS, ONLY: FOUBUF, FOUBUF_IN +USE TPM_TRANS, ONLY: GROWING_ALLOCATION +USE GROWING_ALLOCATOR_MOD,ONLY: DESTROY_GROWING_ALLOCATOR USE EQ_REGIONS_MOD, ONLY: N_REGIONS USE SET_RESOL_MOD, ONLY: SET_RESOL USE DEALLOC_RESOL_MOD, ONLY: DEALLOC_RESOL @@ -67,32 +66,12 @@ SUBROUTINE TRANS_END(CDMODE) ! Local variables INTEGER(KIND=JPIM) :: JRES CHARACTER*5 :: CLMODE + ! ------------------------------------------------------------------ CLMODE='FINAL' IF (PRESENT(CDMODE)) CLMODE=CDMODE IF (CLMODE == 'FINAL') THEN -#ifdef ACCGPU - !$ACC EXIT DATA DELETE(ZAA0,ZAS0,ZEPSNM,ZAA,ZAS) -#endif -#ifdef OMPGPU -#endif - DEALLOCATE(ZAA0) - DEALLOCATE(ZAS0) - DEALLOCATE(ZEPSNM) - DEALLOCATE(ZAA) - DEALLOCATE(ZAS) - - - DEALLOCATE(D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_MYMS,D_NPROCL,D_NASM0,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NLOEN,F_RW) -#ifdef ACCGPU - !$ACC EXIT DATA DELETE(R_NSMAX,R_NTMAX,R_NDGL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_NPROCL, D_NUMP,D_MYMS, & - !$ACC& G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN,G_NLOEN_MAX,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,D_NASM0,F_RW) - -#endif -#ifdef OMPGPU - !$OMP TARGET EXIT DATA MAP(DELETE: ) -#endif !CALL HIP_DGEMM_BATCHED_FINALIZE() IF( ALLOCATED( LENABLED ) ) THEN @@ -104,6 +83,8 @@ SUBROUTINE TRANS_END(CDMODE) DEALLOCATE(LENABLED) ENDIF + CALL DESTROY_GROWING_ALLOCATOR(GROWING_ALLOCATION) + NULLIFY(R) IF( ALLOCATED(DIM_RESOL) ) DEALLOCATE(DIM_RESOL) @@ -122,15 +103,14 @@ SUBROUTINE TRANS_END(CDMODE) NULLIFY(F) IF( ALLOCATED(FIELDS_RESOL) ) DEALLOCATE(FIELDS_RESOL) + !TPM_FIELDS_GPU + NULLIFY(FG) + IF( ALLOCATED(FIELDS_GPU_RESOL) ) DEALLOCATE(FIELDS_GPU_RESOL) !TPM_GEOMETRY NULLIFY(G) IF( ALLOCATED(GEOM_RESOL) ) DEALLOCATE(GEOM_RESOL) - !TPM_TRANS - IF(ALLOCATED(FOUBUF_IN)) DEALLOCATE(FOUBUF_IN) - IF(ALLOCATED(FOUBUF)) DEALLOCATE(FOUBUF) - MSETUP0 = 0 NMAX_RESOL = 0 NCUR_RESOL = 0 diff --git a/src/trans/gpu/internal/cdmap_mod.F90 b/src/trans/gpu/internal/cdmap_mod.F90 index e0e7d0b5..98aac868 100755 --- a/src/trans/gpu/internal/cdmap_mod.F90 +++ b/src/trans/gpu/internal/cdmap_mod.F90 @@ -17,8 +17,9 @@ SUBROUTINE CDMAP(KM,KMLOC,KSL,KSLO,PEPSNM, KDIR, KDGNH, KDGNHD,& USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK USE TPM_FLT, ONLY: S USE TPM_DISTR, ONLY: D -USE TPM_TRANS, ONLY: FOUBUF_IN, FOUBUF +!USE TPM_TRANS, ONLY: FOUBUF_IN, FOUBUF USE SEEFMM_MIX, ONLY: SEEFMM_MULM +USE MPL_MODULE, ONLY: MPL_ABORT !**** *CDMAP* - REMAP ROOTS ! @@ -80,6 +81,8 @@ SUBROUTINE CDMAP(KM,KMLOC,KSL,KSLO,PEPSNM, KDIR, KDGNH, KDGNHD,& ! ------------------------------------------------------------------ +CALL MPL_ABORT("CDMAP not yet supported in ecTrans GPU version") + !* 1. PERFORM LEGENDRE TRANFORM. ! -------------------------- @@ -121,10 +124,10 @@ SUBROUTINE CDMAP(KM,KMLOC,KSL,KSLO,PEPSNM, KDIR, KDGNH, KDGNHD,& DO IGL=KSLO,KDGNHD IGLS = 2*KDGNHD+1-IGL DO JF=1,KFIELDS - FOUBUF_IN(ISTN(IGL)+JF) = S%FA(KMLOC)%RPNMWO(IGL-KSLO+1,1)*ZALL1(JF,IGL) & - & - S%FA(KMLOC)%RPNMWO(IGL-KSLO+1,2)*ZALL(JF,IGL) - FOUBUF_IN(ISTS(IGL)+JF) = S%FA(KMLOC)%RPNMWO(IGLS-KSLO+1,1)*ZALL1(JF,IGLS) & - & - S%FA(KMLOC)%RPNMWO(IGLS-KSLO+1,2)*ZALL(JF,IGLS) + !FOUBUF_IN(ISTN(IGL)+JF) = S%FA(KMLOC)%RPNMWO(IGL-KSLO+1,1)*ZALL1(JF,IGL) & + ! & - S%FA(KMLOC)%RPNMWO(IGL-KSLO+1,2)*ZALL(JF,IGL) + !FOUBUF_IN(ISTS(IGL)+JF) = S%FA(KMLOC)%RPNMWO(IGLS-KSLO+1,1)*ZALL1(JF,IGLS) & + ! & - S%FA(KMLOC)%RPNMWO(IGLS-KSLO+1,2)*ZALL(JF,IGLS) ENDDO ENDDO DEALLOCATE(ZALL1) @@ -154,8 +157,8 @@ SUBROUTINE CDMAP(KM,KMLOC,KSL,KSLO,PEPSNM, KDIR, KDGNH, KDGNHD,& DO JGL=KSLO, KDGNHD IGLS = 2*KDGNHD+1-JGL DO JF=1,KFIELDS - ZQX(JF,JGL)=FOUBUF(ISTN(JGL)+JF) - ZQX(JF,IGLS)=FOUBUF(ISTS(JGL)+JF) + !ZQX(JF,JGL)=FOUBUF(ISTN(JGL)+JF) + !ZQX(JF,IGLS)=FOUBUF(ISTS(JGL)+JF) ENDDO ENDDO diff --git a/src/trans/gpu/internal/dealloc_resol_mod.F90 b/src/trans/gpu/internal/dealloc_resol_mod.F90 index 8c872650..1d10cc96 100755 --- a/src/trans/gpu/internal/dealloc_resol_mod.F90 +++ b/src/trans/gpu/internal/dealloc_resol_mod.F90 @@ -1,5 +1,6 @@ ! (C) Copyright 2013- ECMWF. ! (C) Copyright 2013- Meteo-France. +! (C) Copyright 2024- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -42,12 +43,15 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) ! ------------------------------------------------------------------ USE PARKIND_ECTRANS, ONLY: JPIM -USE TPM_DIM, ONLY: R +USE TPM_DIM, ONLY: R, DIM_TYPE USE TPM_GEN, ONLY: LENABLED, NOUT, NDEF_RESOL -USE TPM_DISTR, ONLY: D, NPRTRV -USE TPM_GEOMETRY, ONLY: G -USE TPM_FIELDS, ONLY: F -USE TPM_FLT, ONLY: S +USE TPM_DISTR, ONLY: D, DISTR_TYPE, NPRTRV +USE TPM_GEOMETRY, ONLY: G, GEOM_TYPE +USE TPM_FIELDS, ONLY: F, FIELDS_TYPE +USE TPM_FIELDS_GPU, ONLY: FG, FIELDS_GPU_TYPE +USE TPM_HICFFT, ONLY: CLEAN_FFT +USE HICBLAS_MOD, ONLY: CLEAN_GEMM +USE TPM_FLT, ONLY: S, FLT_TYPE_WRAP USE TPM_CTL, ONLY: C USE SEEFMM_MIX, ONLY: FREE_SEEFMM USE SET_RESOL_MOD, ONLY: SET_RESOL @@ -57,6 +61,12 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) INTEGER(KIND=JPIM), INTENT(IN) :: KRESOL INTEGER(KIND=JPIM) :: JMLOC,IPRTRV,JSETV,IMLOC,IM,ILA,ILS, JRESOL +TYPE(DIM_TYPE) :: R_ +TYPE(DISTR_TYPE) :: D_ +TYPE(GEOM_TYPE) :: G_ +TYPE(FIELDS_TYPE) :: F_ +TYPE(FIELDS_GPU_TYPE) :: FG_ +TYPE(FLT_TYPE_WRAP) :: S_ ! ------------------------------------------------------------------ @@ -68,7 +78,20 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) CALL SET_RESOL(KRESOL) - !TPM_FLT +#ifdef ACCGPU +!$ACC EXIT DATA DELETE(R) ASYNC(1) +!$ACC EXIT DATA DELETE(FG,FG%ZAA0,FG%ZAS0) IF(ALLOCATED(FG%ZAA0)) ASYNC(1) +!$ACC EXIT DATA DELETE(FG,FG%ZAA,FG%ZAS,FG%ZEPSNM) ASYNC(1) +!$ACC EXIT DATA DELETE(F,F%RLAPIN,F%RACTHE,F%RW) ASYNC(1) +!$ACC EXIT DATA DELETE(D,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM)& +!$ACC& DELETE(D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,D%OFFSETS_GEMM2) ASYNC(1) +!$ACC EXIT DATA DELETE(G,G%NDGLU,G%NMEN,G%NLOEN) ASYNC(1) +!$ACC WAIT(1) +#endif +#ifdef OMPGPU +#endif + + ! TPM_FLD is more complex because it has pointers IF( ALLOCATED(S%FA) ) THEN DO JMLOC=1,D%NUMP,NPRTRV ! +++++++++++++++++++++ JMLOC LOOP ++++++++++ IPRTRV=MIN(NPRTRV,D%NUMP-JMLOC+1) @@ -91,70 +114,17 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) CALL FREE_SEEFMM(S%FMM_INTI) IF(ASSOCIATED(S%FMM_INTI)) DEALLOCATE(S%FMM_INTI) ENDIF + S = S_ - !TPM_DISTR - IF(ALLOCATED(D%NFRSTLAT)) DEALLOCATE(D%NFRSTLAT) - IF(ALLOCATED(D%NLSTLAT)) DEALLOCATE(D%NLSTLAT) - IF(ALLOCATED(D%NPTRLAT)) DEALLOCATE(D%NPTRLAT) - IF(ALLOCATED(D%NPTRFRSTLAT)) DEALLOCATE(D%NPTRFRSTLAT) - IF(ALLOCATED(D%NPTRLSTLAT)) DEALLOCATE(D%NPTRLSTLAT) - IF(ALLOCATED(D%LSPLITLAT)) DEALLOCATE(D%LSPLITLAT) - IF(ALLOCATED(D%NSTA)) DEALLOCATE(D%NSTA) - IF(ALLOCATED(D%NONL)) DEALLOCATE(D%NONL) - IF(ALLOCATED(D%NGPTOTL)) DEALLOCATE(D%NGPTOTL) - IF(ALLOCATED(D%NPROCA_GP)) DEALLOCATE(D%NPROCA_GP) - - IF(D%LWEIGHTED_DISTR) THEN - IF(ALLOCATED(D%RWEIGHT)) DEALLOCATE(D%RWEIGHT) - ENDIF - - IF(ALLOCATED(D%MYMS)) DEALLOCATE(D%MYMS) - IF(ALLOCATED(D%NUMPP)) DEALLOCATE(D%NUMPP) - IF(ALLOCATED(D%NPOSSP)) DEALLOCATE(D%NPOSSP) - IF(ALLOCATED(D%NPROCM)) DEALLOCATE(D%NPROCM) - IF(ALLOCATED(D%NDIM0G)) DEALLOCATE(D%NDIM0G) - IF(ALLOCATED(D%NASM0)) DEALLOCATE(D%NASM0) - IF(ALLOCATED(D%NATM0)) DEALLOCATE(D%NATM0) - IF(ALLOCATED(D%NLATLS)) DEALLOCATE(D%NLATLS) - IF(ALLOCATED(D%NLATLE)) DEALLOCATE(D%NLATLE) - IF(ALLOCATED(D%NPMT)) DEALLOCATE(D%NPMT) - IF(ALLOCATED(D%NPMS)) DEALLOCATE(D%NPMS) - IF(ALLOCATED(D%NPMG)) DEALLOCATE(D%NPMG) - IF(ALLOCATED(D%NULTPP)) DEALLOCATE(D%NULTPP) - IF(ALLOCATED(D%NPROCL)) DEALLOCATE(D%NPROCL) - IF(ALLOCATED(D%NPTRLS)) DEALLOCATE(D%NPTRLS) - IF(ALLOCATED(D%NALLMS)) DEALLOCATE(D%NALLMS) - IF(ALLOCATED(D%NPTRMS)) DEALLOCATE(D%NPTRMS) - IF(ALLOCATED(D%NSTAGT0B)) DEALLOCATE(D%NSTAGT0B) - IF(ALLOCATED(D%NSTAGT1B)) DEALLOCATE(D%NSTAGT1B) - IF(ALLOCATED(D%NPNTGTB0)) DEALLOCATE(D%NPNTGTB0) - IF(ALLOCATED(D%NPNTGTB1)) DEALLOCATE(D%NPNTGTB1) - IF(ALLOCATED(D%NLTSFTB)) DEALLOCATE(D%NLTSFTB) - IF(ALLOCATED(D%NLTSGTB)) DEALLOCATE(D%NLTSGTB) - IF(ALLOCATED(D%MSTABF)) DEALLOCATE(D%MSTABF) - IF(ALLOCATED(D%NSTAGTF)) DEALLOCATE(D%NSTAGTF) - - !TPM_FIELDS - IF(ALLOCATED(F%RMU)) DEALLOCATE(F%RMU) - IF(ALLOCATED(F%RW)) DEALLOCATE(F%RW) - IF(ALLOCATED(F%R1MU2)) DEALLOCATE(F%R1MU2) - IF(ALLOCATED(F%RACTHE)) DEALLOCATE(F%RACTHE) - IF(ALLOCATED(F%REPSNM)) DEALLOCATE(F%REPSNM) - IF(ALLOCATED(F%RN)) DEALLOCATE(F%RN) - IF(ALLOCATED(F%RLAPIN)) DEALLOCATE(F%RLAPIN) - IF(ALLOCATED(F%NLTN)) DEALLOCATE(F%NLTN) - IF( S%LKEEPRPNM ) THEN - IF(ALLOCATED(F%RPNM)) DEALLOCATE(F%RPNM) - ENDIF - IF( S%LDLL ) THEN - IF(ALLOCATED(F%RMU2)) DEALLOCATE(F%RMU2) - IF(ALLOCATED(F%RACTHE2)) DEALLOCATE(F%RACTHE2) - ENDIF + ! Empty all fields (none of them has pointers; allocatable arrays implicitly deallocate) + D = D_ + F = F_ + FG = FG_ + R = R_ + G = G_ - !TPM_GEOMETRY - IF(ALLOCATED(G%NMEN)) DEALLOCATE(G%NMEN) - IF(ALLOCATED(G%NDGLU)) DEALLOCATE(G%NDGLU) - IF(ALLOCATED(G%NLOEN)) DEALLOCATE(G%NLOEN) + CALL CLEAN_FFT(KRESOL) + CALL CLEAN_GEMM(KRESOL) LENABLED(KRESOL)=.FALSE. NDEF_RESOL = COUNT(LENABLED) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 2254af6f..65c3a6bf 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -65,11 +65,11 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE ! ------------------------------------------------------------------ USE TPM_TRANS, ONLY: LATLON -USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D_NUMP, D_NPTRLS, D_NSTAGTF -USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN, G_NLOEN_MAX -USE TPM_FIELDS_FLAT, ONLY: F_RACTHE +USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D +USE TPM_GEOMETRY, ONLY: G +USE TPM_FIELDS, ONLY: F USE TPM_GEN, ONLY: NOUT -USE TPM_DIM, ONLY: R_NSMAX +USE TPM_DIM, ONLY: R ! IMPLICIT NONE @@ -83,13 +83,14 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE REAL(KIND=JPRBT) :: ZACHTE2 REAL(KIND=JPRBT) :: ZAMP, ZPHASE -INTEGER(KIND=JPIM) :: IOFF_LAT,OFFSET_VAR +INTEGER(KIND=JPIM) :: IOFF_LAT,OFFSET_VAR,ILOEN_MAX INTEGER(KIND=JPIM) :: IOFF_SCALARS,IOFF_SCALARS_EWDER,IOFF_UV,IOFF_UV_EWDER,IOFF_KSCALARS_NSDER INTEGER(KIND=JPIM) :: JF,IGLG,II,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX - +ASSOCIATE(D_NUMP=>D%NUMP, D_NPTRLS=>D%NPTRLS, D_NSTAGTF=>D%NSTAGTF, G_NMEN=>G%NMEN, & + & G_NLOEN=>G%NLOEN, F_RACTHE=>F%RACTHE, R_NSMAX=>R%NSMAX) ! ------------------------------------------------------------------ IF(MYPROC > NPROC/2)THEN @@ -104,7 +105,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE #ifdef ACCGPU !$ACC DATA & -!$ACC& PRESENT(D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F_RACTHE,G_NMEN,G_NLOEN, G_NLOEN_MAX, R_NSMAX) +!$ACC& PRESENT(D,D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F,F_RACTHE,G,G_NMEN,G_NLOEN,R,R_NSMAX) #endif #ifdef OMPGPU !$OMP TARGET DATA MAP(PRESENT,ALLOC:ZGTF) & @@ -182,17 +183,18 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !* 2.1 U AND V. +ILOEN_MAX = MAXVAL(G_NLOEN) IF (KUV_EWDER_OFFSET >= 0) THEN #ifdef OMPGPU !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(KF_UV,PUVDERS,ZACHTE2,PUV) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_UV,IOFF_UV_EWDER,RET_REAL,RET_COMPLEX,ZACHTE2,JM,JF,KGL) & - !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_EWDER_OFFSET,KUV_OFFSET,KF_FS) ASYNC(1) + !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_EWDER_OFFSET,KUV_OFFSET,KF_FS,ILOEN_MAX) ASYNC(1) #endif DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV - DO JM=0,G_NLOEN_MAX/2 + DO JM=0,ILOEN_MAX/2 IGLG = OFFSET_VAR+KGL-1 ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have ! to fill those floor(NLON/2)+1 values. @@ -229,11 +231,11 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2,RET_REAL,RET_COMPLEX) & - !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,KF_SCALARS,OFFSET_VAR,KSCALARS_EWDER_OFFSET,KSCALARS_OFFSET,KF_FS) ASYNC(1) + !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,KF_SCALARS,OFFSET_VAR,KSCALARS_EWDER_OFFSET,KSCALARS_OFFSET,KF_FS,ILOEN_MAX) ASYNC(1) #endif DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS - DO JM=0,G_NLOEN_MAX/2 + DO JM=0,ILOEN_MAX/2 IGLG = OFFSET_VAR+KGL-1 ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have ! to fill those floor(NLON/2)+1 values. @@ -272,6 +274,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !$OMP END TARGET DATA #endif ! ------------------------------------------------------------------ +END ASSOCIATE END SUBROUTINE FSC END MODULE FSC_MOD diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index b182a7e3..03ebc5ff 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -73,11 +73,10 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW ! ------------------------------------------------------------------ - USE TPM_GEN, ONLY: LSYNC_TRANS + USE TPM_GEN, ONLY: LSYNC_TRANS, NCUR_RESOL USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT - USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D_NSTAGT0B, D_NSTAGTF,D_NPTRLS, & - & D_NPNTGTB0, D_NPROCM, D_NDGL_FS, D - USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN + USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D + USE TPM_GEOMETRY, ONLY: G USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE TPM_HICFFT, ONLY: EXECUTE_DIR_FFT USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM @@ -101,6 +100,10 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) & 1_C_SIZE_T, INT(KFIELD*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_COMPLEX(1))) #endif + ASSOCIATE(D_NDGL_FS=>D%NDGL_FS, D_NSTAGT0B=>D%NSTAGT0B, D_NSTAGTF=>D%NSTAGTF, & + & D_NPTRLS=>D%NPTRLS, D_NPNTGTB0=>D%NPNTGTB0, D_NPROCM=>D%NPROCM, & + & G_NMEN=>G%NMEN, G_NLOEN=>G%NLOEN) + #ifdef ACCGPU !$ACC DATA PRESENT(PREEL_REAL, PREEL_COMPLEX, & !$ACC& D_NSTAGTF,D_NSTAGT0B,D_NPTRLS,D_NPROCM,D_NPNTGTB0,G_NMEN,G_NLOEN) @@ -115,7 +118,7 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) CALL GSTATS(430,1) ENDIF CALL GSTATS(413,0) - CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),KFIELD, & + CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),NCUR_RESOL,KFIELD, & & LOENS=G_NLOEN(D_NPTRLS(MYSETW):D_NPTRLS(MYSETW)+D_NDGL_FS-1), & & OFFSETS=D_NSTAGTF(1:D_NDGL_FS+1),ALLOC=ALLOCATOR%PTR) @@ -134,6 +137,7 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) #endif NULLIFY(PREEL_REAL) + END ASSOCIATE ! ------------------------------------------------------------------ END SUBROUTINE FTDIR diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index b6bb0a11..2ef8d868 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -72,13 +72,13 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW ! ------------------------------------------------------------------ - USE TPM_GEN, ONLY: LSYNC_TRANS - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT - USE TPM_DISTR, ONLY: MYSETW, D_NPTRLS, D_NDGL_FS, D_NSTAGTF, D - USE TPM_GEOMETRY, ONLY: G_NLOEN - USE TPM_HICFFT, ONLY: EXECUTE_INV_FFT - USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM - USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX + USE TPM_GEN, ONLY: LSYNC_TRANS, NCUR_RESOL + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: MYSETW, D + USE TPM_GEOMETRY, ONLY: G + USE TPM_HICFFT, ONLY: EXECUTE_INV_FFT + USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM + USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE BUFFERED_ALLOCATOR_MOD, ONLY: ASSIGN_PTR, GET_ALLOCATION USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -92,6 +92,8 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) INTEGER(KIND=JPIM) :: KGL + ASSOCIATE(D_NDGL_FS=>D%NDGL_FS, D_NPTRLS=>D%NPTRLS, D_NSTAGTF=>D%NSTAGTF, G_NLOEN=>G%NLOEN) + #ifdef IN_PLACE_FFT PREEL_REAL => PREEL_COMPLEX #else @@ -111,7 +113,7 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) CALL GSTATS(440,1) ENDIF CALL GSTATS(423,0) - CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),KFIELD, & + CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),NCUR_RESOL,KFIELD, & & LOENS=G_NLOEN(D_NPTRLS(MYSETW):D_NPTRLS(MYSETW)+D_NDGL_FS-1), & & OFFSETS=D_NSTAGTF(1:D_NDGL_FS),ALLOC=ALLOCATOR%PTR) @@ -130,6 +132,8 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) NULLIFY(PREEL_COMPLEX) + END ASSOCIATE + ! ------------------------------------------------------------------ END SUBROUTINE FTINV END MODULE FTINV_MOD diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index e12f89af..0c1ee9e3 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -23,7 +23,7 @@ MODULE LEDIR_MOD SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) USE TPM_DIM, ONLY: R - USE TPM_DISTR, ONLY: D, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 + USE TPM_DISTR, ONLY: D IMPLICIT NONE @@ -34,6 +34,8 @@ SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE + ASSOCIATE(D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, D_OFFSETS_GEMM2=>D%OFFSETS_GEMM2) + IF (PRESENT(IOUT_STRIDES0)) & IOUT_STRIDES0 = ALIGN(2*KF_FS,A) IF (PRESENT(IOUT_SIZE)) & @@ -50,6 +52,8 @@ SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IIN0_STRIDES0 = ALIGN(KF_FS,A) IF (PRESENT(IIN0_SIZE)) & IIN0_SIZE = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) + + END ASSOCIATE END SUBROUTINE SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) @@ -94,12 +98,12 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ - USE TPM_GEN, ONLY: LSYNC_TRANS, NOUT + USE TPM_GEN, ONLY: LSYNC_TRANS, NOUT, NCUR_RESOL USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE TPM_DIM, ONLY: R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL - USE TPM_GEOMETRY, ONLY: G_NDGLU - USE TPM_FIELDS_FLAT, ONLY: ZAA,ZAS,ZAA0,ZAS0,KMLOC0 - USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 + USE TPM_DIM, ONLY: R + USE TPM_GEOMETRY, ONLY: G + USE TPM_FIELDS_GPU, ONLY: FG + USE TPM_DISTR, ONLY: D USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED_OVERLOAD, & & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM @@ -126,8 +130,8 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) ! LOCAL VARIABLES INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC - INTEGER(KIND=JPIM) :: IA, IS, ISL, J - INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + INTEGER(KIND=JPIM) :: IA, IS, ISL, J, IMLOC0(1) + INTEGER(KIND=JPIM) :: KS(D%NUMP), NS(D%NUMP), AOFFSETS(D%NUMP), BOFFSETS(D%NUMP), COFFSETS(D%NUMP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2 @@ -140,6 +144,10 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS + ASSOCIATE(D_NUMP=>D%NUMP, R_NSMAX=>R%NSMAX, R_NTMAX=>R%NTMAX, G_NDGLU=>G%NDGLU, & + & D_MYMS=>D%MYMS, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, & + & D_OFFSETS_GEMM2=>D%OFFSETS_GEMM2, & + & ZAA=>FG%ZAA, ZAS=>FG%ZAS, ZAA0=>FG%ZAA0, ZAS0=>FG%ZAS0) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& @@ -150,7 +158,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) #ifdef ACCGPU !$ACC DATA & !$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) & - !$ACC& PRESENT(D_MYMS,D_NUMP,R_NTMAX,R_NSMAX,G_NDGLU) & + !$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NTMAX,R_NSMAX) & !$ACC& PRESENT(ZAA,ZAS,POA1,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) #endif @@ -164,7 +172,9 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) ENDIF CALL GSTATS(414,0) - IF(KMLOC0 > 0) THEN + ! anti-symmetric + IMLOC0 = FINDLOC(D_MYMS,0) + IF(IMLOC0(1) > 0) THEN ! compute m=0 in double precision: #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAA0,ZINPA0,ZOUT0) @@ -199,9 +209,9 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO - IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 + IF(IMLOC0(1) > 0) THEN + NS(IMLOC0(1)) = 0 + KS(IMLOC0(1)) = 0 ENDIF #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAA,ZINPA,ZOUT) @@ -210,7 +220,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC HOST_DATA USE_DEVICE(ZAA,ZINPA,ZOUT) #endif CALL HIP_GEMM( & - & 21, & ! unique identifier + & NCUR_RESOL, 21, & ! unique identifier & 'N', 'N', & & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & @@ -278,7 +288,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) ENDIF CALL GSTATS(414,0) - IF(KMLOC0 > 0) THEN + IF(IMLOC0(1) > 0) THEN #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAS0,ZINPS0,ZOUT0) #endif @@ -314,9 +324,9 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO - IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 + IF(IMLOC0(1) > 0) THEN + NS(IMLOC0(1)) = 0 + KS(IMLOC0(1)) = 0 ENDIF #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAS,ZINPS,ZOUT) @@ -325,7 +335,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC HOST_DATA USE_DEVICE(ZAS,ZINPS,ZOUT) #endif CALL HIP_GEMM( & - & 22, & ! unique identifier + & NCUR_RESOL, 22, & ! unique identifier & 'N', 'N', & & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & @@ -390,5 +400,6 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ + END ASSOCIATE END SUBROUTINE LEDIR END MODULE LEDIR_MOD diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 70a729ac..7d7d378c 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -24,7 +24,7 @@ MODULE LEINV_MOD SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) USE TPM_DIM, ONLY: R - USE TPM_DISTR, ONLY: D, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 + USE TPM_DISTR, ONLY: D IMPLICIT NONE @@ -35,6 +35,8 @@ SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE + ASSOCIATE(D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, D_OFFSETS_GEMM2=>D%OFFSETS_GEMM2) + IF (PRESENT(IOUT_STRIDES0)) & IOUT0_STRIDES0 = ALIGN(KF_LEG,A) @@ -52,6 +54,8 @@ SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IIN0_STRIDES0 = ALIGN(KF_LEG,A) IF (PRESENT(IIN0_SIZE)) & IIN0_SIZE = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + + END ASSOCIATE END SUBROUTINE LEINV_STRIDES SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) @@ -93,12 +97,12 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ - USE TPM_GEN, ONLY: LSYNC_TRANS, NOUT + USE TPM_GEN, ONLY: LSYNC_TRANS, NOUT, NCUR_RESOL USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE TPM_DIM, ONLY: R_NDGNH, R_NSMAX, R_NDGL - USE TPM_GEOMETRY, ONLY: G_NDGLU - USE TPM_FIELDS_FLAT, ONLY: ZAA, ZAS, ZAA0, ZAS0, KMLOC0 - USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, MYPROC, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 + USE TPM_DIM, ONLY: R + USE TPM_GEOMETRY, ONLY: G + USE TPM_FIELDS_GPU, ONLY: FG + USE TPM_DISTR, ONLY: D USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED_OVERLOAD, & & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT @@ -119,8 +123,8 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR ! LOCAL - INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) - INTEGER(KIND=JPIM) :: KM, KMLOC, IA, IS, ISL, J1, JGL, JK, J + INTEGER(KIND=JPIM) :: KS(D%NUMP), NS(D%NUMP), AOFFSETS(D%NUMP), BOFFSETS(D%NUMP), COFFSETS(D%NUMP) + INTEGER(KIND=JPIM) :: KM, KMLOC, IA, IS, ISL, J1, JGL, JK, J, IMLOC0(1) INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE @@ -128,6 +132,9 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + ASSOCIATE(D_NUMP=>D%NUMP, R_NSMAX=>R%NSMAX, G_NDGLU=>G%NDGLU, D_MYMS=>D%MYMS, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1,& + D_OFFSETS_GEMM2=>D%OFFSETS_GEMM2, & + ZAA=>FG%ZAA, ZAS=>FG%ZAS, ZAA0=>FG%ZAA0, ZAS0=>FG%ZAS0) !* 1.1 PREPARATIONS. IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) @@ -146,10 +153,10 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) #ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC DATA PRESENT(D_MYMS,D_NUMP,G_NDGLU) & + !$ACC DATA PRESENT(D,D_MYMS,D_NUMP) & !$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & !$ACC& PRESENT(ZAA,ZAS,PIA) & - !$ACC& PRESENT(R_NSMAX,G_NDGLU,D_OFFSETS_GEMM2) + !$ACC& PRESENT(R,R_NSMAX,D_OFFSETS_GEMM2) #endif ! READ 2:NSMAX+3 @@ -220,7 +227,8 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ENDIF CALL GSTATS(424,0) - IF (KMLOC0 > 0) THEN + IMLOC0 = FINDLOC(D_MYMS,0) + IF (IMLOC0(1) > 0) THEN ! compute m=0 in double precision #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAA0,ZINP0,ZOUTA0) @@ -255,9 +263,9 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO - IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 + IF(IMLOC0(1) > 0) THEN + NS(IMLOC0(1)) = 0 + KS(IMLOC0(1)) = 0 ENDIF #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAA,ZINP,ZOUTA) @@ -266,7 +274,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUTA) #endif CALL HIP_GEMM( & - & 11, & ! unique identifier + & NCUR_RESOL, 11, & ! unique identifier & 'N', 'T', & & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & @@ -357,7 +365,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ENDIF CALL GSTATS(424,0) - IF (KMLOC0 > 0) THEN + IF (IMLOC0(1) > 0) THEN #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAS0,ZINP0,ZOUTS0) #endif @@ -389,9 +397,9 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO - IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 + IF(IMLOC0(1) > 0) THEN + NS(IMLOC0(1)) = 0 + KS(IMLOC0(1)) = 0 ENDIF #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAS,ZINP,ZOUTS) @@ -400,7 +408,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUTS) #endif CALL HIP_GEMM( & - & 12, & ! unique identifier + & NCUR_RESOL, 12, & ! unique identifier & 'N', 'T', & & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & @@ -435,5 +443,6 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ + END ASSOCIATE END SUBROUTINE LEINV END MODULE LEINV_MOD diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 11ed079a..b95eeeab 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -113,8 +113,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& USE SPNSDE_MOD, ONLY: SPNSDE USE LEINV_MOD, ONLY: LEINV_STRIDES, LEINV USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS - USE TPM_FIELDS, ONLY: F - USE TPM_FIELDS_FLAT, ONLY: ZEPSNM + USE TPM_FIELDS_GPU, ONLY: FG USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM USE TPM_GEN, ONLY: LSYNC_TRANS USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX @@ -204,6 +203,8 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& REAL(KIND=JPRBT), POINTER :: ZINP(:) REAL(KIND=JPRD), POINTER :: ZINP0(:) + ASSOCIATE(ZEPSNM=>FG%ZEPSNM) + ! ------------------------------------------------------------------ !* 1. PERFORM LEGENDRE TRANFORM. @@ -402,6 +403,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& CALL LEINV(ALLOCATOR,PIA(2*(IF_READIN-IF_LEG)+1:IF_READIN,:,:),ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,IF_LEG) IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) + END ASSOCIATE ! ------------------------------------------------------------------ END SUBROUTINE LTINV END MODULE LTINV_MOD diff --git a/src/trans/gpu/internal/prepsnm_mod.F90 b/src/trans/gpu/internal/prepsnm_mod.F90 index 71841e4e..351e693e 100755 --- a/src/trans/gpu/internal/prepsnm_mod.F90 +++ b/src/trans/gpu/internal/prepsnm_mod.F90 @@ -53,7 +53,7 @@ SUBROUTINE PREPSNM USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DIM, ONLY: R USE TPM_FIELDS, ONLY: F - USE TPM_FIELDS_FLAT, ONLY: ZEPSNM + USE TPM_FIELDS_GPU, ONLY: FG USE TPM_DISTR, ONLY: D ! @@ -85,12 +85,12 @@ SUBROUTINE PREPSNM !$ACC loop #endif DO JN=0,KM-1 - ZEPSNM(KMLOC,JN) = 0.0_JPRBT + FG%ZEPSNM(KMLOC,JN) = 0.0_JPRBT ENDDO ENDIF DO JN=KM,R%NTMAX+2 - ZEPSNM(KMLOC,JN) =REAL(F%REPSNM(D%NPMT(KM)+KMLOC-KM+JN),JPRBT) + FG%ZEPSNM(KMLOC,JN) = REAL(F%REPSNM(D%NPMT(KM)+KMLOC-KM+JN),JPRBT) ENDDO ! end loop over wavenumber ENDDO diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index 00a2985b..83b7b8b9 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -14,8 +14,8 @@ MODULE PRFI1B_MOD SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) USE PARKIND1, ONLY: JPIM, JPRB - USE TPM_DIM, ONLY: R, R_NSMAX - USE TPM_DISTR, ONLY: D, D_NUMP, D_MYMS, D_NASM0 + USE TPM_DIM, ONLY: R + USE TPM_DISTR, ONLY: D USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS !**** *PRFI1* - Prepare spectral fields for inverse Legendre transform @@ -77,10 +77,11 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) !* 1. EXTRACT FIELDS FROM SPECTRAL ARRAYS. ! -------------------------------------------------- + ASSOCIATE(D_NUMP=>D%NUMP, D_MYMS=>D%MYMS, D_NASM0=>D%NASM0, R_NSMAX=>R%NSMAX) #ifdef ACCGPU !$ACC DATA & - !$ACC& PRESENT(D_NUMP,R_NSMAX,D_MYMS,D_NASM0) & + !$ACC& PRESENT(D,D_NUMP,R,R_NSMAX,D_MYMS,D_NASM0) & !$ACC& PRESENT(PIA) & !$ACC& PRESENT(PSPEC) ASYNC(1) #endif @@ -188,6 +189,7 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) #endif ! ------------------------------------------------------------------ + END ASSOCIATE END SUBROUTINE PRFI1B END MODULE PRFI1B_MOD diff --git a/src/trans/gpu/internal/set_resol_mod.F90 b/src/trans/gpu/internal/set_resol_mod.F90 index c7367bdd..afd7cbca 100755 --- a/src/trans/gpu/internal/set_resol_mod.F90 +++ b/src/trans/gpu/internal/set_resol_mod.F90 @@ -18,6 +18,7 @@ SUBROUTINE SET_RESOL(KRESOL,LDSETUP) USE TPM_DISTR, ONLY: D, DISTR_RESOL USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL USE TPM_FIELDS, ONLY: F, FIELDS_RESOL +USE TPM_FIELDS_GPU, ONLY: FG, FIELDS_GPU_RESOL USE TPM_FLT, ONLY: S, FLT_RESOL USE TPM_CTL, ONLY: C, CTL_RESOL USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS @@ -57,6 +58,7 @@ SUBROUTINE SET_RESOL(KRESOL,LDSETUP) NCUR_RESOL = IRESOL R => DIM_RESOL(NCUR_RESOL) F => FIELDS_RESOL(NCUR_RESOL) + FG => FIELDS_GPU_RESOL(NCUR_RESOL) G => GEOM_RESOL(NCUR_RESOL) D => DISTR_RESOL(NCUR_RESOL) S => FLT_RESOL(NCUR_RESOL) diff --git a/src/trans/gpu/internal/spnsde_mod.F90 b/src/trans/gpu/internal/spnsde_mod.F90 index 10acc5dc..e7c7283b 100755 --- a/src/trans/gpu/internal/spnsde_mod.F90 +++ b/src/trans/gpu/internal/spnsde_mod.F90 @@ -14,9 +14,8 @@ MODULE SPNSDE_MOD SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT -USE TPM_DIM, ONLY: R, R_NTMAX -USE TPM_DISTR, ONLY: D, D_MYMS, D_NUMP -USE TPM_FIELDS_FLAT, ONLY: ZEPSNM +USE TPM_DIM, ONLY: R +USE TPM_DISTR, ONLY: D !**** *SPNSDE* - Compute North-South derivative in spectral space @@ -82,14 +81,16 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: IJ, ISKIP, J, JN, JI, IR, II +ASSOCIATE(D_NUMP=>D%NUMP, R_NTMAX=>R%NTMAX, D_MYMS=>D%MYMS) + #ifdef ACCGPU !$ACC DATA & -!$ACC& PRESENT (R_NTMAX, D_MYMS) & +!$ACC& PRESENT (R,R_NTMAX, D,D_MYMS) & !$ACC& PRESENT (D_NUMP,PEPSNM, PF, PNSD) ASYNC(1) #endif #ifdef OMPGPU !$OMP TARGET DATA & -!$OMP& MAP(PRESENT,ALLOC:ZN) +!$OMP& MAP(PRESENT,PEPSNM,ALLOC:ZN) #endif ! ------------------------------------------------------------------ @@ -103,7 +104,7 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) #ifdef OMPGPU !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO !! DEFAULT(NONE) PRIVATE(IJ) & - !!$OMP& SHARED(KM,F,ZN,ZEPSNM,KMLOC) + !!$OMP& SHARED(KM,ZN,KMLOC) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IR,II,JI) & @@ -142,6 +143,7 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) #endif ! ------------------------------------------------------------------ +END ASSOCIATE END SUBROUTINE SPNSDE END MODULE SPNSDE_MOD diff --git a/src/trans/gpu/internal/tpm_fields_flat.F90 b/src/trans/gpu/internal/tpm_fields_gpu.F90 old mode 100755 new mode 100644 similarity index 62% rename from src/trans/gpu/internal/tpm_fields_flat.F90 rename to src/trans/gpu/internal/tpm_fields_gpu.F90 index 780d0629..09baef27 --- a/src/trans/gpu/internal/tpm_fields_flat.F90 +++ b/src/trans/gpu/internal/tpm_fields_gpu.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. -! (C) Copyright 2022- NVIDIA. +! (C) Copyright 2024- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -9,29 +9,26 @@ ! nor does it submit to any jurisdiction. ! -MODULE TPM_FIELDS_FLAT +MODULE TPM_FIELDS_GPU -USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD +USE EC_PARKIND, ONLY: JPRD, JPRBT IMPLICIT NONE SAVE -! flat copies of the fields defined in TPM_FIELDS -REAL(KIND=JPRD) ,ALLOCATABLE :: F_RW(:) ! Weights of the Gaussian quadrature -REAL(KIND=JPRBT) ,ALLOCATABLE :: F_RLAPIN(:) ! eigen-values of the inverse Laplace operator -REAL(KIND=JPRD) ,ALLOCATABLE :: F_RACTHE(:) ! eigen-values of the inverse Laplace operator - +TYPE FIELDS_GPU_TYPE ! scratch arrays for ltinv and ltdir and associated dimension variables - REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:,:,:) !! JPRL for 1/2 REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:,:,:) !! JPRL for 1/2 ! for m=0 in ledir_mod: REAL(KIND=JPRD),ALLOCATABLE :: ZAA0(:,:) REAL(KIND=JPRD),ALLOCATABLE :: ZAS0(:,:) -INTEGER(KIND=JPIM) :: KMLOC0 - REAL(KIND=JPRBT),ALLOCATABLE :: ZEPSNM(:,:) +END TYPE FIELDS_GPU_TYPE + +TYPE(FIELDS_GPU_TYPE),ALLOCATABLE,TARGET :: FIELDS_GPU_RESOL(:) +TYPE(FIELDS_GPU_TYPE),POINTER :: FG -END MODULE TPM_FIELDS_FLAT +END MODULE TPM_FIELDS_GPU diff --git a/src/trans/gpu/internal/tpm_hicfft.F90 b/src/trans/gpu/internal/tpm_hicfft.F90 index 9ae63df0..abb06a9f 100755 --- a/src/trans/gpu/internal/tpm_hicfft.F90 +++ b/src/trans/gpu/internal/tpm_hicfft.F90 @@ -28,6 +28,7 @@ MODULE TPM_HICFFT PRIVATE PUBLIC EXECUTE_DIR_FFT, EXECUTE_INV_FFT + PUBLIC CLEAN_FFT INTERFACE EXECUTE_DIR_FFT MODULE PROCEDURE EXECUTE_DIR_FFT_FLOAT,EXECUTE_DIR_FFT_DOUBLE @@ -37,6 +38,13 @@ MODULE TPM_HICFFT MODULE PROCEDURE EXECUTE_INV_FFT_FLOAT,EXECUTE_INV_FFT_DOUBLE END INTERFACE +INTERFACE + SUBROUTINE CLEAN_FFT(RESOL_ID) BIND(C, NAME="clean_fft") + USE ISO_C_BINDING + INTEGER(KIND=C_INT), INTENT(IN), VALUE :: RESOL_ID + END SUBROUTINE +END INTERFACE + ! ------------------------------------------------------------------ @@ -44,22 +52,24 @@ MODULE TPM_HICFFT ! ------------------------------------------------------------------ -SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,ALLOC) +SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,ALLOC) USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:) + INTEGER(KIND=JPIM),INTENT(IN) :: RESOL_ID INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE - SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_dir_fft_float") USE ISO_C_BINDING REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(*) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: RESOL_ID INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -70,28 +80,30 @@ SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) #endif - CALL EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) + CALL EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) #ifdef ACCGPU !$ACC END HOST_DATA #endif END SUBROUTINE EXECUTE_DIR_FFT_FLOAT -SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,ALLOC) +SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,ALLOC) USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:) + INTEGER(KIND=JPIM),INTENT(IN) :: RESOL_ID INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE - SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_dir_fft_double") USE ISO_C_BINDING, ONLY: C_DOUBLE, C_INT, C_PTR REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(*) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: RESOL_ID INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -102,29 +114,31 @@ SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSET #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) #endif - CALL EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) + CALL EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) #ifdef ACCGPU !$ACC END HOST_DATA #endif END SUBROUTINE EXECUTE_DIR_FFT_DOUBLE -SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,ALLOC) +SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,ALLOC) USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:) + INTEGER(KIND=JPIM),INTENT(IN) :: RESOL_ID INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE - SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_inv_fft_float") USE ISO_C_BINDING, ONLY: C_FLOAT, C_INT, C_PTR REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(*) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: RESOL_ID INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -135,28 +149,30 @@ SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) #endif - CALL EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) + CALL EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) #ifdef ACCGPU !$ACC END HOST_DATA #endif END SUBROUTINE -SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,ALLOC) +SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,ALLOC) USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:) + INTEGER(KIND=JPIM),INTENT(IN) :: RESOL_ID INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE - SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_inv_fft_double") USE ISO_C_BINDING, ONLY: C_DOUBLE, C_INT, C_PTR REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(*) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: RESOL_ID INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -167,7 +183,7 @@ SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSET #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) #endif - CALL EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) + CALL EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) #ifdef ACCGPU !$ACC END HOST_DATA #endif diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 index 9d1262d1..c1d933b7 100755 --- a/src/trans/gpu/internal/tpm_trans.F90 +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -49,9 +49,6 @@ MODULE TPM_TRANS !INTEGER_M :: NF_UV_G ! Global version of NF_UV (grid-point space) !INTEGER_M :: NF_SCALARS_G ! Global version of NF_SCALARS (grid-point space) -REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF_IN(:) ! Fourier buffer -REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF(:) ! Fourier buffer - INTEGER(KIND=JPIM) :: NPROMA ! Blocking factor for gridpoint input/output INTEGER(KIND=JPIM) :: NGPBLKS ! Number of NPROMA blocks diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index d77da0f8..f0937afa 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -71,9 +71,9 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT - USE TPM_DISTR, ONLY: D, MYSETW, D_NSTAGTF, D_NPNTGTB0, D_NPTRLS, D_NDGL_FS - USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN - USE TPM_DIM, ONLY: R_NSMAX + USE TPM_DISTR, ONLY: D, MYSETW + USE TPM_GEOMETRY, ONLY: G + USE TPM_DIM, ONLY: R USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF ! @@ -89,13 +89,16 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) REAL(KIND=JPRBT) :: SCAL + ASSOCIATE(D_NSTAGTF=>D%NSTAGTF, D_NPNTGTB0=>D%NPNTGTB0, D_NPTRLS=>D%NPTRLS, & + & D_NDGL_FS=>D%NDGL_FS, G_NMEN=>G%NMEN, G_NLOEN=>G%NLOEN, R_NSMAX=>R%NSMAX) + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),& & 1_C_SIZE_T, INT(D%NLENGT0B*KF_FS*2,KIND=C_SIZE_T)*C_SIZEOF(FOUBUF_IN(1))) #ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC DATA PRESENT(G_NMEN,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS,G_NLOEN, R_NSMAX) ASYNC(1) + !$ACC DATA PRESENT(G,G_NMEN,D,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS,G_NLOEN, R,R_NSMAX) ASYNC(1) #endif ! scale results and move into next transformation buffer @@ -131,6 +134,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) !$ACC WAIT(1) #endif + END ASSOCIATE END SUBROUTINE TRLTOM_PACK FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) @@ -166,11 +170,11 @@ END FUNCTION PREPARE_TRLTOM_UNPACK SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD - USE TPM_DIM, ONLY: R_NDGNH, R_NDGL - USE TPM_GEOMETRY, ONLY: G_NDGLU + USE TPM_DIM, ONLY: R + USE TPM_GEOMETRY, ONLY: G USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION - USE TPM_FIELDS_FLAT, ONLY: F_RW, F_RACTHE - USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, D_NPNTGTB1, D_OFFSETS_GEMM1 + USE TPM_FIELDS, ONLY: F + USE TPM_DISTR, ONLY: D USE LEDIR_MOD, ONLY: LEDIR_STRIDES USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -195,6 +199,9 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP REAL(KIND=JPRBT) :: PAIA, PAIS + ASSOCIATE(D_NUMP=>D%NUMP, R_NDGNH=>R%NDGNH, R_NDGL=>R%NDGL, F_RW=>F%RW, F_RACTHE=>F%RACTHE, & + & D_MYMS=>D%MYMS, D_NPNTGTB1=>D%NPNTGTB1, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, & + & G_NDGLU=>G%NDGLU) CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,& IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) @@ -225,8 +232,8 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP #ifdef ACCGPU !$ACC DATA & !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & - !$ACC& PRESENT(F_RW,F_RACTHE) & - !$ACC& PRESENT(D_MYMS,D_NUMP,R_NDGNH,R_NDGL,G_NDGLU) & + !$ACC& PRESENT(F,F_RW,F_RACTHE) & + !$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NDGNH,R_NDGL,G,G_NDGLU) & !$ACC& PRESENT(D_NPNTGTB1,D_OFFSETS_GEMM1,FOUBUF) !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) & @@ -290,6 +297,7 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP !$ACC END DATA #endif + END ASSOCIATE END SUBROUTINE TRLTOM_UNPACK END MODULE TRLTOM_PACK_UNPACK diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index 4468af60..0dd9d3a8 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -86,9 +86,9 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPRD USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE TPM_DIM, ONLY: R_NDGNH, R_NDGL - USE TPM_GEOMETRY, ONLY: G_NDGLU - USE TPM_DISTR, ONLY: D, D_NUMP, D_MYMS, D_NPNTGTB1, D_OFFSETS_GEMM1 + USE TPM_DIM, ONLY: R + USE TPM_GEOMETRY, ONLY: G + USE TPM_DISTR, ONLY: D USE LEINV_MOD, ONLY: LEINV_STRIDES USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -113,6 +113,9 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + ASSOCIATE(D_NUMP=>D%NUMP, R_NDGNH=>R%NDGNH, R_NDGL=>R%NDGL, G_NDGLU=>G%NDGLU, & + & D_MYMS=>D%MYMS, D_NPNTGTB1=>D%NPNTGTB1, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1) + IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',0,ZHOOK_HANDLE) CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& @@ -124,7 +127,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I #ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC DATA PRESENT(D_MYMS,D_NPNTGTB1,D_NUMP,G_NDGLU,R_NDGNH,R_NDGL) & + !$ACC DATA PRESENT(D,D_MYMS,D_NPNTGTB1,D_NUMP,G,G_NDGLU,R,R_NDGNH,R_NDGL) & !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,D_OFFSETS_GEMM1) #endif @@ -174,6 +177,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',1,ZHOOK_HANDLE) + END ASSOCIATE END SUBROUTINE TRMTOL_PACK FUNCTION PREPARE_TRMTOL_UNPACK(ALLOCATOR,KF_FS) RESULT(HTRMTOL_UNPACK) @@ -224,8 +228,8 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN ! ------------------------------------------------------------------ USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT -USE TPM_DISTR, ONLY: D, MYSETW, D_NSTAGTF, D_NPNTGTB0, D_NPTRLS, D_NDGL_FS -USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN, G_NLOEN_MAX +USE TPM_DISTR, ONLY: D, MYSETW +USE TPM_GEOMETRY, ONLY: G USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF ! @@ -238,29 +242,33 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(TRMTOL_UNPACK_HANDLE), INTENT(IN) :: HTRMTOL_UNPACK -INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL +INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL,ILOEN_MAX REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX +ASSOCIATE(D_NDGL_FS=>D%NDGL_FS, D_NSTAGTF=>D%NSTAGTF, D_NPNTGTB0=>D%NPNTGTB0, D_NPTRLS=>D%NPTRLS, & + & G_NLOEN=>G%NLOEN, G_NMEN=>G%NMEN) + CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HREEL),& & 1_C_SIZE_T, INT(KF_TOTAL*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_COMPLEX(1))) #ifdef OMPGPU #endif #ifdef ACCGPU -!$ACC DATA PRESENT(G_NLOEN,G_NMEN,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS) ASYNC(1) +!$ACC DATA PRESENT(G,G_NLOEN,G_NMEN,D,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS) ASYNC(1) #endif OFFSET_VAR=D_NPTRLS(MYSETW) +ILOEN_MAX=MAXVAL(G_NLOEN) #ifdef OMPGPU #endif #ifdef ACCGPU !$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,RET_REAL,RET_COMPLEX) FIRSTPRIVATE(KF_CURRENT,& -!$ACC& KF_TOTAL,OFFSET_VAR,G_NLOEN_MAX) DEFAULT(NONE) & +!$ACC& KF_TOTAL,OFFSET_VAR,ILOEN_MAX) DEFAULT(NONE) & !$ACC& ASYNC(1) TILE(32,16,1) #endif DO KGL=1,D_NDGL_FS DO JF=1,KF_CURRENT - DO JM=0,G_NLOEN_MAX/2 + DO JM=0,ILOEN_MAX/2 IGLG = OFFSET_VAR+KGL-1 ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have @@ -290,6 +298,8 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN !$ACC WAIT(1) #endif +END ASSOCIATE + END SUBROUTINE TRMTOL_UNPACK END MODULE TRMTOL_PACK_UNPACK diff --git a/src/trans/gpu/internal/updspb_mod.F90 b/src/trans/gpu/internal/updspb_mod.F90 index 0f59694e..938a42fc 100755 --- a/src/trans/gpu/internal/updspb_mod.F90 +++ b/src/trans/gpu/internal/updspb_mod.F90 @@ -57,8 +57,8 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) ! ------------------------------------------------------------------ USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT - USE TPM_DIM, ONLY: R_NTMAX - USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, D_NASM0 + USE TPM_DIM, ONLY: R + USE TPM_DISTR, ONLY: D ! IMPLICIT NONE @@ -85,6 +85,7 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) ! and nn=NTMAX+2-n from NTMAX+2-m to NTMAX+2-NSMAX. ! NLTN(m)=NTMAX+2-m : n=NLTN(nn),nn=NLTN(n) ! nn is the loop index. + ASSOCIATE(D_NUMP=>D%NUMP, D_MYMS=>D%MYMS, D_NASM0=>D%NASM0, R_NTMAX=>R%NTMAX) IF(PRESENT(KFLDPTR)) THEN stop 'Error: code path not (yet) supported in GPU version' @@ -95,7 +96,7 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) !loop over wavenumber #ifdef ACCGPU - !$ACC DATA PRESENT(PSPEC,POA,R_NTMAX,D_NUMP,D_MYMS,D_NASM0) ASYNC(1) + !$ACC DATA PRESENT(PSPEC,POA,R,R_NTMAX,D,D_NUMP,D_MYMS,D_NASM0) ASYNC(1) #endif #ifdef OMPGPU !WARNING: following line should be PRESENT,ALLOC but causes issues with AMD compiler! @@ -135,6 +136,7 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) !$ACC END DATA #endif +END ASSOCIATE ! ------------------------------------------------------------------ END SUBROUTINE UPDSPB diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index 36646f8e..27f5e036 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -59,9 +59,9 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) ! ------------------------------------------------------------------ USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT -USE TPM_DIM, ONLY: R, R_NTMAX -USE TPM_DISTR, ONLY: D, D_NUMP, D_MYMS -USE TPM_FIELDS_FLAT, ONLY: ZEPSNM +USE TPM_DIM, ONLY: R +USE TPM_DISTR, ONLY: D +USE TPM_FIELDS_GPU, ONLY: FG ! IMPLICIT NONE @@ -79,14 +79,15 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) REAL(KIND=JPRBT) :: ZKM,ZJN ! ------------------------------------------------------------------ +ASSOCIATE(D_NUMP=>D%NUMP, R_NTMAX=>R%NTMAX, D_MYMS=>D%MYMS, ZEPSNM=>FG%ZEPSNM) !* 1. COMPUTE U V FROM VORTICITY AND DIVERGENCE. ! ------------------------------------------ #ifdef ACCGPU !$ACC DATA & -!$ACC& PRESENT(D_MYMS,D_NUMP,R_NTMAX) & -!$ACC& PRESENT(ZEPSNM,PU,PV,PVOR,PDIV) ASYNC(1) +!$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NTMAX) & +!$ACC& PRESENT(FG,ZEPSNM,PU,PV,PVOR,PDIV) ASYNC(1) #endif #ifdef OMPGPU !WARNING: following line should be PRESENT,ALLOC but causes issues with AMD compiler! @@ -170,6 +171,7 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) !$ACC END DATA #endif ! ------------------------------------------------------------------ +END ASSOCIATE END SUBROUTINE UVTVD END MODULE UVTVD_MOD diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index 68e8c47c..d9e90053 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -14,10 +14,9 @@ MODULE VDTUV_MOD SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT -USE TPM_DIM, ONLY: R, R_NTMAX +USE TPM_DIM, ONLY: R USE TPM_FIELDS, ONLY: F -USE TPM_FIELDS_FLAT, ONLY: F_RLAPIN -USE TPM_DISTR, ONLY: D, D_NUMP, D_MYMS +USE TPM_DISTR, ONLY: D !**** *VDTUV* - Compute U,V in spectral space @@ -86,9 +85,11 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) ! LOCAL REAL SCALARS REAL(KIND=JPRBT) :: ZKM +ASSOCIATE(D_NUMP=>D%NUMP, D_MYMS=>D%MYMS, R_NTMAX=>R%NTMAX, F_RLAPIN=>F%RLAPIN) + #ifdef ACCGPU !$ACC DATA & -!$ACC& PRESENT(R_NTMAX,D_MYMS,D_NUMP,F_RLAPIN) & +!$ACC& PRESENT(R,R_NTMAX,D,D_MYMS,D_NUMP,F,F_RLAPIN) & !$ACC& PRESENT(PEPSNM, PVOR, PDIV) & !$ACC& PRESENT(PU, PV) #endif @@ -157,6 +158,7 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) !$ACC END DATA #endif ! ------------------------------------------------------------------ +END ASSOCIATE END SUBROUTINE VDTUV END MODULE VDTUV_MOD diff --git a/src/trans/sedrenames.txt b/src/trans/sedrenames.txt index 49cb5148..e999ecfc 100644 --- a/src/trans/sedrenames.txt +++ b/src/trans/sedrenames.txt @@ -110,7 +110,7 @@ s/\ SULEG_MOD/\ SULEG_MOD_VARIANTDESIGNATOR/g s/SUTRLE_MOD/SUTRLE_MOD_VARIANTDESIGNATOR/g s/TPM_FFTW/TPM_FFTW_VARIANTDESIGNATOR/g s/TPM_FFT/TPM_FFT_VARIANTDESIGNATOR/g -s/TPM_FIELDS_FLAT/TPM_FIELDS_FLAT_VARIANTDESIGNATOR/g +s/TPM_FIELDS_GPU/TPM_FIELDS_GPU_VARIANTDESIGNATOR/g s/TPM_FLT/TPM_FLT_VARIANTDESIGNATOR/g s/TPM_TRANS/TPM_TRANS_VARIANTDESIGNATOR/g s/trans_end( *($|\(| |\*|\.h))/trans_end_VARIANTDESIGNATOR\1/g