ecmwf-ifs · lukasm91 · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 17, 2024
diff --git a/src/programs/ectrans-benchmark-ifs.F90 b/src/programs/ectrans-benchmark-ifs.F90
@@ -1094,6 +1094,17 @@ subroutine parse_grid(cgrid,ndgl,nloen)
 
 !===================================================================================================
 
+subroutine str2int(str, int, stat)
+
+  character(len=*), intent(in) :: str
+  integer, intent(out) :: int
+  integer, intent(out) :: stat
+  read(str, *, iostat=stat) int
+
+end subroutine str2int
+
+!===================================================================================================
+
 function get_int_value(cname, iarg) result(value)
 
   integer :: value
@@ -1130,6 +1141,78 @@ function get_str_value(cname, iarg) result(value)
 
 !===================================================================================================
 
+subroutine print_help(unit)
+
+  integer, optional :: unit
+  integer :: nout = 6
+  if (present(unit)) then
+    nout = unit
+  endif
+
+  write(nout, "(a)") ""
+
+  if (jprb == jprd) then
+    write(nout, "(a)") "NAME    ectrans-benchmark-dp"
+  else
+    write(nout, "(a)") "NAME    ectrans-benchmark-sp"
+  end if
+  write(nout, "(a)") ""
+
+  write(nout, "(a)") "DESCRIPTION"
+  write(nout, "(a)") "        This program tests ecTrans by transforming fields back and forth&
+    & between spectral "
+  if (jprb == jprd) then
+    write(nout, "(a)") "        space and grid-point space (double-precision version)"
+  else
+    write(nout, "(a)") "        space and grid-point space (single-precision version)"
+  end if
+  write(nout, "(a)") ""
+
+  write(nout, "(a)") "USAGE"
+  if (jprb == jprd) then
+    write(nout, "(a)") "        ectrans-benchmark-dp [options]"
+  else
+    write(nout, "(a)") "        ectrans-benchmark-sp [options]"
+  end if
+  write(nout, "(a)") ""
+
+  write(nout, "(a)") "OPTIONS"
+  write(nout, "(a)") "    -h, --help          Print this message"
+  write(nout, "(a)") "    -v                  Run with verbose output"
+  write(nout, "(a)") "    -t, --truncation T  Run with this triangular spectral truncation&
+    & (default = 79)"
+  write(nout, "(a)") "    -g, --grid GRID     Run with this grid. Possible values: O<N>, F<N>"
+  write(nout, "(a)") "                        If not specified, O<N> is used with N=truncation+1&
+    & (cubic relation)"
+  write(nout, "(a)") "    -n, --niter NITER   Run for this many inverse/direct transform&
+    & iterations (default = 10)"
+  write(nout, "(a)") "    -f, --nfld NFLD     Number of scalar fields (default = 1)"
+  write(nout, "(a)") "    -l, --nlev NLEV     Number of vertical levels (default = 1)"
+  write(nout, "(a)") "    --vordiv            Also transform vorticity-divergence to wind"
+  write(nout, "(a)") "    --scders            Compute scalar derivatives (default off)"
+  write(nout, "(a)") "    --uvders            Compute uv East-West derivatives (default off). Only&
+    & when also --vordiv is given"
+  write(nout, "(a)") "    --flt               Run with fast Legendre transforms (default off)"
+  write(nout, "(a)") "    --nproma NPROMA     Run with NPROMA (default no blocking: NPROMA=ngptot)"
+  write(nout, "(a)") "    --norms             Calculate and print spectral norms of transformed&
+    & fields"
+  write(nout, "(a)") "                        The computation of spectral norms will skew overall&
+    & timings"
+  write(nout, "(a)") "    --meminfo           Show diagnostic information from FIAT's ec_meminfo&
+    & subroutine on memory usage, thread-binding etc."
+  write(nout, "(a)") "    --nprtrv            Size of V set in spectral decomposition"
+  write(nout, "(a)") "    --nprtrw            Size of W set in spectral decomposition"
+  write(nout, "(a)") "    -c, --check VALUE   The multiplier of the machine epsilon used as a&
+   & tolerance for correctness checking"
+  write(nout, "(a)") ""
+  write(nout, "(a)") "DEBUGGING"
+  write(nout, "(a)") "    --dump-values       Output gridpoint fields in unformatted binary file"
+  write(nout, "(a)") ""
+
+end subroutine print_help
+
+!===================================================================================================
+
 subroutine parsing_failed(message)
 
   character(len=*), intent(in) :: message
@@ -1240,17 +1323,6 @@ function cubic_octahedral_gaussian_grid(nsmax) result(cgrid)
 
 !===================================================================================================
 
-subroutine str2int(str, int, stat)
-
-  character(len=*), intent(in) :: str
-  integer, intent(out) :: int
-  integer, intent(out) :: stat
-  read(str, *, iostat=stat) int
-
-end subroutine str2int
-
-!===================================================================================================
-
 subroutine sort(a, n)
 
   real(kind=jprd), intent(inout) :: a(n)
@@ -1275,78 +1347,6 @@ end subroutine sort
 
 !===================================================================================================
 
-subroutine print_help(unit)
-
-  integer, optional :: unit
-  integer :: nout = 6
-  if (present(unit)) then
-    nout = unit
-  endif
-
-  write(nout, "(a)") ""
-
-  if (jprb == jprd) then
-    write(nout, "(a)") "NAME    ectrans-benchmark-dp"
-  else
-    write(nout, "(a)") "NAME    ectrans-benchmark-sp"
-  end if
-  write(nout, "(a)") ""
-
-  write(nout, "(a)") "DESCRIPTION"
-  write(nout, "(a)") "        This program tests ecTrans by transforming fields back and forth&
-    & between spectral "
-  if (jprb == jprd) then
-    write(nout, "(a)") "        space and grid-point space (double-precision version)"
-  else
-    write(nout, "(a)") "        space and grid-point space (single-precision version)"
-  end if
-  write(nout, "(a)") ""
-
-  write(nout, "(a)") "USAGE"
-  if (jprb == jprd) then
-    write(nout, "(a)") "        ectrans-benchmark-dp [options]"
-  else
-    write(nout, "(a)") "        ectrans-benchmark-sp [options]"
-  end if
-  write(nout, "(a)") ""
-
-  write(nout, "(a)") "OPTIONS"
-  write(nout, "(a)") "    -h, --help          Print this message"
-  write(nout, "(a)") "    -v                  Run with verbose output"
-  write(nout, "(a)") "    -t, --truncation T  Run with this triangular spectral truncation&
-    & (default = 79)"
-  write(nout, "(a)") "    -g, --grid GRID     Run with this grid. Possible values: O<N>, F<N>"
-  write(nout, "(a)") "                        If not specified, O<N> is used with N=truncation+1&
-    & (cubic relation)"
-  write(nout, "(a)") "    -n, --niter NITER   Run for this many inverse/direct transform&
-    & iterations (default = 10)"
-  write(nout, "(a)") "    -f, --nfld NFLD     Number of scalar fields (default = 1)"
-  write(nout, "(a)") "    -l, --nlev NLEV     Number of vertical levels (default = 1)"
-  write(nout, "(a)") "    --vordiv            Also transform vorticity-divergence to wind"
-  write(nout, "(a)") "    --scders            Compute scalar derivatives (default off)"
-  write(nout, "(a)") "    --uvders            Compute uv East-West derivatives (default off). Only&
-    & when also --vordiv is given"
-  write(nout, "(a)") "    --flt               Run with fast Legendre transforms (default off)"
-  write(nout, "(a)") "    --nproma NPROMA     Run with NPROMA (default no blocking: NPROMA=ngptot)"
-  write(nout, "(a)") "    --norms             Calculate and print spectral norms of transformed&
-    & fields"
-  write(nout, "(a)") "                        The computation of spectral norms will skew overall&
-    & timings"
-  write(nout, "(a)") "    --meminfo           Show diagnostic information from FIAT's ec_meminfo&
-    & subroutine on memory usage, thread-binding etc."
-  write(nout, "(a)") "    --nprtrv            Size of V set in spectral decomposition"
-  write(nout, "(a)") "    --nprtrw            Size of W set in spectral decomposition"
-  write(nout, "(a)") "    -c, --check VALUE   The multiplier of the machine epsilon used as a&
-   & tolerance for correctness checking"
-  write(nout, "(a)") ""
-  write(nout, "(a)") "DEBUGGING"
-  write(nout, "(a)") "    --dump-values       Output gridpoint fields in unformatted binary file"
-  write(nout, "(a)") ""
-
-end subroutine print_help
-
-!===================================================================================================
-
 subroutine initialize_spectral_arrays(nsmax, zsp, sp3d)
 
   integer,         intent(in)    :: nsmax       ! Spectral truncation

diff --git a/src/trans/common/internal/tpm_dim.F90 b/src/trans/common/internal/tpm_dim.F90
@@ -49,10 +49,4 @@ MODULE TPM_DIM
 TYPE(DIM_TYPE),ALLOCATABLE,TARGET :: DIM_RESOL(:)
 TYPE(DIM_TYPE),POINTER     :: R
 
-! flat copies of above
-INTEGER(KIND=JPIM) :: R_NSMAX      ! Truncation order
-INTEGER(KIND=JPIM) :: R_NTMAX      ! Truncation order for tendencies
-INTEGER(KIND=JPIM) :: R_NDGNH      ! Number of rows in northern hemisphere
-INTEGER(KIND=JPIM) :: R_NDGL       ! Number of rows of latitudes
-
 END MODULE TPM_DIM
diff --git a/src/trans/common/internal/tpm_distr.F90 b/src/trans/common/internal/tpm_distr.F90
@@ -178,29 +178,5 @@ MODULE TPM_DISTR
 TYPE(DISTR_TYPE),ALLOCATABLE,TARGET :: DISTR_RESOL(:)
 TYPE(DISTR_TYPE),POINTER     :: D
 
-!flat versions of the above
-INTEGER(KIND=JPIM) :: D_NUMP      ! No. of spectral waves handled by this processor
-INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MYMS(:)    ! Wave numbers handled by this PE
-INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT0B(:) ! Start adresses for segments within buffer
-                                  ! (according to processors to whom data 
-                                  ! is going to be sent) 
-INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT1B(:) 
-INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCL(:) ! Process responsible for each lat. (F.S)
-INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB1(:,:)
-INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NASM0(:)  ! Address in a spectral array of (m, n=m)
-INTEGER(KIND=JPIM) ,ALLOCATABLE  :: D_NSTAGTF(:) ! Offset for specific latitude in 
-INTEGER(KIND=JPIM) :: D_NDGL_FS     ! Number of rows of latitudes for which this process is
-                                  ! performing Fourier Space calculations
-INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MSTABF(:)
-INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB0(:,:)
-INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCM(:)  ! Process that does the calc. for certain 
-INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPTRLS(:) ! Pointer to first lat. (F.S)
-
-
-! The offsets in the input and output arrays to the gemms.
-! (1) are the offsets in the "inputs" of dirtrans ("outputs" invtrans)
-! (2) are the offsets in the "outputs" of invtrans ("inputs" dirtrans)
-INTEGER(KIND=JPIM), POINTER :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:)
-
 END MODULE TPM_DISTR
 
diff --git a/src/trans/common/internal/tpm_geometry.F90 b/src/trans/common/internal/tpm_geometry.F90
@@ -34,11 +34,4 @@ MODULE TPM_GEOMETRY
 TYPE(GEOM_TYPE),ALLOCATABLE,TARGET :: GEOM_RESOL(:)
 TYPE(GEOM_TYPE),POINTER     :: G
 
-!flat copies of the above
-INTEGER(KIND=JPIM),ALLOCATABLE :: G_NDGLU(:) ! NUMBER OF HEMISPERIC LATITUDES
-INTEGER(KIND=JPIM),ALLOCATABLE :: G_NMEN(:)  ! ASSOCIATED CUT-OFF WAVE NUMBER
-INTEGER(KIND=JPIM) :: G_NMEN_MAX
-INTEGER(KIND=JPIM),ALLOCATABLE :: G_NLOEN(:) ! NUMBER OF POINTS ON A PARALLEL
-INTEGER(KIND=JPIM) :: G_NLOEN_MAX
-
 END MODULE TPM_GEOMETRY
diff --git a/src/trans/gpu/algor/growing_allocator_mod.F90 b/src/trans/gpu/algor/growing_allocator_mod.F90
@@ -5,6 +5,7 @@ MODULE GROWING_ALLOCATOR_MOD
   PRIVATE
   PUBLIC :: GROWING_ALLOCATION_TYPE
   PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION
+  PUBLIC :: DESTROY_GROWING_ALLOCATOR
 
   ABSTRACT INTERFACE
     SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C)
@@ -32,19 +33,12 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ)
     USE TPM_GEN,       ONLY: NOUT
     IMPLICIT NONE
     TYPE(GROWING_ALLOCATION_TYPE), INTENT(INOUT) :: ALLOC
-    INTEGER(C_SIZE_T) :: SZ
-    INTEGER :: I
+    INTEGER(C_SIZE_T), INTENT(IN) :: SZ
 
     ! Deallocate existing pointer
     IF (ASSOCIATED(ALLOC%PTR) .AND. SZ > SIZE(ALLOC%PTR, 1, C_SIZE_T)) THEN
       WRITE(NOUT,*) "WARNING: REALLOCATING GROWING POINTER CAUSING GRAPH REINSTANTIATION"
-      DO I = 1, ALLOC%FREE_FUNCS_SZ
-        CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, &
-                                      SIZE(ALLOC%PTR, 1, C_SIZE_T))
-      ENDDO
-      !$ACC EXIT DATA DELETE(ALLOC%PTR)
-      DEALLOCATE(ALLOC%PTR)
-      NULLIFY(ALLOC%PTR)
+      CALL DESTROY_GROWING_ALLOCATOR(ALLOC)
     ENDIF
 
     IF (.NOT. ASSOCIATED(ALLOC%PTR)) THEN
@@ -89,4 +83,20 @@ SUBROUTINE REGISTER_FREE_C(ALLOC_C, FREE_FUNC_C) BIND(C, NAME="growing_allocator
 
   END SUBROUTINE
 
+  SUBROUTINE DESTROY_GROWING_ALLOCATOR(ALLOC)
+    USE ISO_C_BINDING, ONLY: C_SIZE_T
+    IMPLICIT NONE
+    TYPE(GROWING_ALLOCATION_TYPE) :: ALLOC
+    INTEGER :: I
+    IF (ASSOCIATED(ALLOC%PTR)) THEN
+      DO I = 1, ALLOC%FREE_FUNCS_SZ
+        CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, &
+                                      SIZE(ALLOC%PTR, 1, C_SIZE_T))
+      ENDDO
+      !$ACC EXIT DATA DELETE(ALLOC%PTR)
+      DEALLOCATE(ALLOC%PTR)
+      NULLIFY(ALLOC%PTR)
+    ENDIF
+  END SUBROUTINE
+
 END MODULE
diff --git a/src/trans/gpu/algor/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/hicblas_cutlass.cuda.h
@@ -72,6 +72,7 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_3xtf32, TransA, TransB> {
   static constexpr int sz_align = 8;
 
 public:
+  using real_type = float;
   void operator()(cudaStream_t stream, int m, int n, int k, float alpha,
                   const float *A, int lda, const float *B, int ldb, float beta,
                   float *C, int ldc) const {
@@ -129,6 +130,7 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {
   static constexpr int sz_align = 1;
 
 public:
+  using real_type = float;
   void operator()(cudaStream_t stream, int m, int n, int k, float alpha,
                   const float *A, int lda, const float *B, int ldb, float beta,
                   float *C, int ldc) const {
@@ -149,7 +151,7 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {
 
 } // namespace detail
 template <cublasOperation_t TransA, cublasOperation_t TransB>
-void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k,
+void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, int *k,
                                       float alpha, const float *A, int lda,
                                       int *offsetsA, const float *B, int ldb,
                                       int *offsetsB, float beta, float *C,
@@ -165,18 +167,18 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k,
   if (capability_major >= 8 && use_3xtf32)
     run_group_graph(cutlass_sgemm_grouped<detail::CutlassType::cutlass_3xtf32,
                                           TransA, TransB>(),
-                    m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
+                    resol_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
                     ldc, offsetsC, batchCount, stream, blas_id,
                     growing_allocator);
   else
     run_group_graph(cutlass_sgemm_grouped<detail::CutlassType::cutlass_fp32,
                                           TransA, TransB>(),
-                    m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
+                    resol_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
                     ldc, offsetsC, batchCount, stream, blas_id,
                     growing_allocator);
 }
 
-void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
+void cutlass_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, char transb,
                                    int m, int *n, int *k, float alpha,
                                    const float *A, int lda, int *offsetsA,
                                    const float *B, int ldb, int *offsetsB, float beta,
@@ -186,19 +188,19 @@ void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
 
   if (transa == 'N' && transb == 'N')
     cutlass_sgemm_wrapper_grouped_op<CUBLAS_OP_N, CUBLAS_OP_N>(
-        blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
+        resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
         ldc, offsetsC, batchCount, stream, growing_allocator);
   else if (transa == 'N' && transb == 'T')
     cutlass_sgemm_wrapper_grouped_op<CUBLAS_OP_N, CUBLAS_OP_T>(
-        blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
+        resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
         ldc, offsetsC, batchCount, stream, growing_allocator);
   else if (transa == 'T' && transb == 'N')
     cutlass_sgemm_wrapper_grouped_op<CUBLAS_OP_T, CUBLAS_OP_N>(
-        blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
+        resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
         ldc, offsetsC, batchCount, stream, growing_allocator);
   else if (transa == 'T' && transb == 'T')
     cutlass_sgemm_wrapper_grouped_op<CUBLAS_OP_T, CUBLAS_OP_T>(
-        blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
+        resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
         ldc, offsetsC, batchCount, stream, growing_allocator);
   else
     assert(false);