Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable multiple resolutions - first step #151

Open
wants to merge 12 commits into
base: develop
Choose a base branch
from
166 changes: 83 additions & 83 deletions src/programs/ectrans-benchmark-ifs.F90
Original file line number Diff line number Diff line change
Expand Up @@ -1094,6 +1094,17 @@ subroutine parse_grid(cgrid,ndgl,nloen)

!===================================================================================================

subroutine str2int(str, int, stat)

character(len=*), intent(in) :: str
integer, intent(out) :: int
integer, intent(out) :: stat
read(str, *, iostat=stat) int

end subroutine str2int

!===================================================================================================

function get_int_value(cname, iarg) result(value)

integer :: value
Expand Down Expand Up @@ -1130,6 +1141,78 @@ function get_str_value(cname, iarg) result(value)

!===================================================================================================

subroutine print_help(unit)

integer, optional :: unit
integer :: nout = 6
if (present(unit)) then
nout = unit
endif

write(nout, "(a)") ""

if (jprb == jprd) then
write(nout, "(a)") "NAME ectrans-benchmark-dp"
else
write(nout, "(a)") "NAME ectrans-benchmark-sp"
end if
write(nout, "(a)") ""

write(nout, "(a)") "DESCRIPTION"
write(nout, "(a)") " This program tests ecTrans by transforming fields back and forth&
& between spectral "
if (jprb == jprd) then
write(nout, "(a)") " space and grid-point space (double-precision version)"
else
write(nout, "(a)") " space and grid-point space (single-precision version)"
end if
write(nout, "(a)") ""

write(nout, "(a)") "USAGE"
if (jprb == jprd) then
write(nout, "(a)") " ectrans-benchmark-dp [options]"
else
write(nout, "(a)") " ectrans-benchmark-sp [options]"
end if
write(nout, "(a)") ""

write(nout, "(a)") "OPTIONS"
write(nout, "(a)") " -h, --help Print this message"
write(nout, "(a)") " -v Run with verbose output"
write(nout, "(a)") " -t, --truncation T Run with this triangular spectral truncation&
& (default = 79)"
write(nout, "(a)") " -g, --grid GRID Run with this grid. Possible values: O<N>, F<N>"
write(nout, "(a)") " If not specified, O<N> is used with N=truncation+1&
& (cubic relation)"
write(nout, "(a)") " -n, --niter NITER Run for this many inverse/direct transform&
& iterations (default = 10)"
write(nout, "(a)") " -f, --nfld NFLD Number of scalar fields (default = 1)"
write(nout, "(a)") " -l, --nlev NLEV Number of vertical levels (default = 1)"
write(nout, "(a)") " --vordiv Also transform vorticity-divergence to wind"
write(nout, "(a)") " --scders Compute scalar derivatives (default off)"
write(nout, "(a)") " --uvders Compute uv East-West derivatives (default off). Only&
& when also --vordiv is given"
write(nout, "(a)") " --flt Run with fast Legendre transforms (default off)"
write(nout, "(a)") " --nproma NPROMA Run with NPROMA (default no blocking: NPROMA=ngptot)"
write(nout, "(a)") " --norms Calculate and print spectral norms of transformed&
& fields"
write(nout, "(a)") " The computation of spectral norms will skew overall&
& timings"
write(nout, "(a)") " --meminfo Show diagnostic information from FIAT's ec_meminfo&
& subroutine on memory usage, thread-binding etc."
write(nout, "(a)") " --nprtrv Size of V set in spectral decomposition"
write(nout, "(a)") " --nprtrw Size of W set in spectral decomposition"
write(nout, "(a)") " -c, --check VALUE The multiplier of the machine epsilon used as a&
& tolerance for correctness checking"
write(nout, "(a)") ""
write(nout, "(a)") "DEBUGGING"
write(nout, "(a)") " --dump-values Output gridpoint fields in unformatted binary file"
write(nout, "(a)") ""

end subroutine print_help

!===================================================================================================

subroutine parsing_failed(message)

character(len=*), intent(in) :: message
Expand Down Expand Up @@ -1240,17 +1323,6 @@ function cubic_octahedral_gaussian_grid(nsmax) result(cgrid)

!===================================================================================================

subroutine str2int(str, int, stat)

character(len=*), intent(in) :: str
integer, intent(out) :: int
integer, intent(out) :: stat
read(str, *, iostat=stat) int

end subroutine str2int

!===================================================================================================

subroutine sort(a, n)

real(kind=jprd), intent(inout) :: a(n)
Expand All @@ -1275,78 +1347,6 @@ end subroutine sort

!===================================================================================================

subroutine print_help(unit)

integer, optional :: unit
integer :: nout = 6
if (present(unit)) then
nout = unit
endif

write(nout, "(a)") ""

if (jprb == jprd) then
write(nout, "(a)") "NAME ectrans-benchmark-dp"
else
write(nout, "(a)") "NAME ectrans-benchmark-sp"
end if
write(nout, "(a)") ""

write(nout, "(a)") "DESCRIPTION"
write(nout, "(a)") " This program tests ecTrans by transforming fields back and forth&
& between spectral "
if (jprb == jprd) then
write(nout, "(a)") " space and grid-point space (double-precision version)"
else
write(nout, "(a)") " space and grid-point space (single-precision version)"
end if
write(nout, "(a)") ""

write(nout, "(a)") "USAGE"
if (jprb == jprd) then
write(nout, "(a)") " ectrans-benchmark-dp [options]"
else
write(nout, "(a)") " ectrans-benchmark-sp [options]"
end if
write(nout, "(a)") ""

write(nout, "(a)") "OPTIONS"
write(nout, "(a)") " -h, --help Print this message"
write(nout, "(a)") " -v Run with verbose output"
write(nout, "(a)") " -t, --truncation T Run with this triangular spectral truncation&
& (default = 79)"
write(nout, "(a)") " -g, --grid GRID Run with this grid. Possible values: O<N>, F<N>"
write(nout, "(a)") " If not specified, O<N> is used with N=truncation+1&
& (cubic relation)"
write(nout, "(a)") " -n, --niter NITER Run for this many inverse/direct transform&
& iterations (default = 10)"
write(nout, "(a)") " -f, --nfld NFLD Number of scalar fields (default = 1)"
write(nout, "(a)") " -l, --nlev NLEV Number of vertical levels (default = 1)"
write(nout, "(a)") " --vordiv Also transform vorticity-divergence to wind"
write(nout, "(a)") " --scders Compute scalar derivatives (default off)"
write(nout, "(a)") " --uvders Compute uv East-West derivatives (default off). Only&
& when also --vordiv is given"
write(nout, "(a)") " --flt Run with fast Legendre transforms (default off)"
write(nout, "(a)") " --nproma NPROMA Run with NPROMA (default no blocking: NPROMA=ngptot)"
write(nout, "(a)") " --norms Calculate and print spectral norms of transformed&
& fields"
write(nout, "(a)") " The computation of spectral norms will skew overall&
& timings"
write(nout, "(a)") " --meminfo Show diagnostic information from FIAT's ec_meminfo&
& subroutine on memory usage, thread-binding etc."
write(nout, "(a)") " --nprtrv Size of V set in spectral decomposition"
write(nout, "(a)") " --nprtrw Size of W set in spectral decomposition"
write(nout, "(a)") " -c, --check VALUE The multiplier of the machine epsilon used as a&
& tolerance for correctness checking"
write(nout, "(a)") ""
write(nout, "(a)") "DEBUGGING"
write(nout, "(a)") " --dump-values Output gridpoint fields in unformatted binary file"
write(nout, "(a)") ""

end subroutine print_help

!===================================================================================================

subroutine initialize_spectral_arrays(nsmax, zsp, sp3d)

integer, intent(in) :: nsmax ! Spectral truncation
Expand Down
6 changes: 0 additions & 6 deletions src/trans/common/internal/tpm_dim.F90
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,4 @@ MODULE TPM_DIM
TYPE(DIM_TYPE),ALLOCATABLE,TARGET :: DIM_RESOL(:)
TYPE(DIM_TYPE),POINTER :: R

! flat copies of above
INTEGER(KIND=JPIM) :: R_NSMAX ! Truncation order
INTEGER(KIND=JPIM) :: R_NTMAX ! Truncation order for tendencies
INTEGER(KIND=JPIM) :: R_NDGNH ! Number of rows in northern hemisphere
INTEGER(KIND=JPIM) :: R_NDGL ! Number of rows of latitudes

END MODULE TPM_DIM
24 changes: 0 additions & 24 deletions src/trans/common/internal/tpm_distr.F90
Original file line number Diff line number Diff line change
Expand Up @@ -178,29 +178,5 @@ MODULE TPM_DISTR
TYPE(DISTR_TYPE),ALLOCATABLE,TARGET :: DISTR_RESOL(:)
TYPE(DISTR_TYPE),POINTER :: D

!flat versions of the above
INTEGER(KIND=JPIM) :: D_NUMP ! No. of spectral waves handled by this processor
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MYMS(:) ! Wave numbers handled by this PE
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT0B(:) ! Start adresses for segments within buffer
! (according to processors to whom data
! is going to be sent)
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT1B(:)
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCL(:) ! Process responsible for each lat. (F.S)
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB1(:,:)
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NASM0(:) ! Address in a spectral array of (m, n=m)
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGTF(:) ! Offset for specific latitude in
INTEGER(KIND=JPIM) :: D_NDGL_FS ! Number of rows of latitudes for which this process is
! performing Fourier Space calculations
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MSTABF(:)
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB0(:,:)
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCM(:) ! Process that does the calc. for certain
INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPTRLS(:) ! Pointer to first lat. (F.S)


! The offsets in the input and output arrays to the gemms.
! (1) are the offsets in the "inputs" of dirtrans ("outputs" invtrans)
! (2) are the offsets in the "outputs" of invtrans ("inputs" dirtrans)
INTEGER(KIND=JPIM), POINTER :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:)

END MODULE TPM_DISTR

7 changes: 0 additions & 7 deletions src/trans/common/internal/tpm_geometry.F90
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,4 @@ MODULE TPM_GEOMETRY
TYPE(GEOM_TYPE),ALLOCATABLE,TARGET :: GEOM_RESOL(:)
TYPE(GEOM_TYPE),POINTER :: G

!flat copies of the above
INTEGER(KIND=JPIM),ALLOCATABLE :: G_NDGLU(:) ! NUMBER OF HEMISPERIC LATITUDES
INTEGER(KIND=JPIM),ALLOCATABLE :: G_NMEN(:) ! ASSOCIATED CUT-OFF WAVE NUMBER
INTEGER(KIND=JPIM) :: G_NMEN_MAX
INTEGER(KIND=JPIM),ALLOCATABLE :: G_NLOEN(:) ! NUMBER OF POINTS ON A PARALLEL
INTEGER(KIND=JPIM) :: G_NLOEN_MAX

END MODULE TPM_GEOMETRY
28 changes: 19 additions & 9 deletions src/trans/gpu/algor/growing_allocator_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ MODULE GROWING_ALLOCATOR_MOD
PRIVATE
PUBLIC :: GROWING_ALLOCATION_TYPE
PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION
PUBLIC :: DESTROY_GROWING_ALLOCATOR

ABSTRACT INTERFACE
SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C)
Expand Down Expand Up @@ -32,19 +33,12 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ)
USE TPM_GEN, ONLY: NOUT
IMPLICIT NONE
TYPE(GROWING_ALLOCATION_TYPE), INTENT(INOUT) :: ALLOC
INTEGER(C_SIZE_T) :: SZ
INTEGER :: I
INTEGER(C_SIZE_T), INTENT(IN) :: SZ

! Deallocate existing pointer
IF (ASSOCIATED(ALLOC%PTR) .AND. SZ > SIZE(ALLOC%PTR, 1, C_SIZE_T)) THEN
WRITE(NOUT,*) "WARNING: REALLOCATING GROWING POINTER CAUSING GRAPH REINSTANTIATION"
DO I = 1, ALLOC%FREE_FUNCS_SZ
CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, &
SIZE(ALLOC%PTR, 1, C_SIZE_T))
ENDDO
!$ACC EXIT DATA DELETE(ALLOC%PTR)
DEALLOCATE(ALLOC%PTR)
NULLIFY(ALLOC%PTR)
CALL DESTROY_GROWING_ALLOCATOR(ALLOC)
ENDIF

IF (.NOT. ASSOCIATED(ALLOC%PTR)) THEN
Expand Down Expand Up @@ -89,4 +83,20 @@ SUBROUTINE REGISTER_FREE_C(ALLOC_C, FREE_FUNC_C) BIND(C, NAME="growing_allocator

END SUBROUTINE

SUBROUTINE DESTROY_GROWING_ALLOCATOR(ALLOC)
USE ISO_C_BINDING, ONLY: C_SIZE_T
IMPLICIT NONE
TYPE(GROWING_ALLOCATION_TYPE) :: ALLOC
INTEGER :: I
IF (ASSOCIATED(ALLOC%PTR)) THEN
DO I = 1, ALLOC%FREE_FUNCS_SZ
CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, &
SIZE(ALLOC%PTR, 1, C_SIZE_T))
ENDDO
!$ACC EXIT DATA DELETE(ALLOC%PTR)
DEALLOCATE(ALLOC%PTR)
NULLIFY(ALLOC%PTR)
ENDIF
END SUBROUTINE

END MODULE
18 changes: 10 additions & 8 deletions src/trans/gpu/algor/hicblas_cutlass.cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_3xtf32, TransA, TransB> {
static constexpr int sz_align = 8;

public:
using real_type = float;
void operator()(cudaStream_t stream, int m, int n, int k, float alpha,
const float *A, int lda, const float *B, int ldb, float beta,
float *C, int ldc) const {
Expand Down Expand Up @@ -129,6 +130,7 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {
static constexpr int sz_align = 1;

public:
using real_type = float;
void operator()(cudaStream_t stream, int m, int n, int k, float alpha,
const float *A, int lda, const float *B, int ldb, float beta,
float *C, int ldc) const {
Expand All @@ -149,7 +151,7 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {

} // namespace detail
template <cublasOperation_t TransA, cublasOperation_t TransB>
void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k,
void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, int *k,
float alpha, const float *A, int lda,
int *offsetsA, const float *B, int ldb,
int *offsetsB, float beta, float *C,
Expand All @@ -165,18 +167,18 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k,
if (capability_major >= 8 && use_3xtf32)
run_group_graph(cutlass_sgemm_grouped<detail::CutlassType::cutlass_3xtf32,
TransA, TransB>(),
m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
resol_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
ldc, offsetsC, batchCount, stream, blas_id,
growing_allocator);
else
run_group_graph(cutlass_sgemm_grouped<detail::CutlassType::cutlass_fp32,
TransA, TransB>(),
m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
resol_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
ldc, offsetsC, batchCount, stream, blas_id,
growing_allocator);
}

void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
void cutlass_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, char transb,
int m, int *n, int *k, float alpha,
const float *A, int lda, int *offsetsA,
const float *B, int ldb, int *offsetsB, float beta,
Expand All @@ -186,19 +188,19 @@ void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb,

if (transa == 'N' && transb == 'N')
cutlass_sgemm_wrapper_grouped_op<CUBLAS_OP_N, CUBLAS_OP_N>(
blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
ldc, offsetsC, batchCount, stream, growing_allocator);
else if (transa == 'N' && transb == 'T')
cutlass_sgemm_wrapper_grouped_op<CUBLAS_OP_N, CUBLAS_OP_T>(
blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
ldc, offsetsC, batchCount, stream, growing_allocator);
else if (transa == 'T' && transb == 'N')
cutlass_sgemm_wrapper_grouped_op<CUBLAS_OP_T, CUBLAS_OP_N>(
blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
ldc, offsetsC, batchCount, stream, growing_allocator);
else if (transa == 'T' && transb == 'T')
cutlass_sgemm_wrapper_grouped_op<CUBLAS_OP_T, CUBLAS_OP_T>(
blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C,
ldc, offsetsC, batchCount, stream, growing_allocator);
else
assert(false);
Expand Down
Loading
Loading