From 67daec335773387f8eaf4de28e9c0e9112f6fda5 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 3 Oct 2024 05:01:46 -0700 Subject: [PATCH 01/10] Remove flattened arrays --- src/trans/common/internal/tpm_dim.F90 | 6 - src/trans/common/internal/tpm_distr.F90 | 24 -- src/trans/common/internal/tpm_geometry.F90 | 7 - src/trans/gpu/external/gpnorm_trans.F90 | 9 +- src/trans/gpu/external/gpnorm_trans_gpu.F90 | 19 +- src/trans/gpu/external/setup_trans.F90 | 266 +++--------------- src/trans/gpu/external/trans_end.F90 | 41 +-- src/trans/gpu/internal/cdmap_mod.F90 | 17 +- src/trans/gpu/internal/dealloc_resol_mod.F90 | 105 +++---- src/trans/gpu/internal/fsc_mod.F90 | 21 +- src/trans/gpu/internal/ftdir_mod.F90 | 12 +- src/trans/gpu/internal/ftinv_mod.F90 | 18 +- src/trans/gpu/internal/ledir_mod.F90 | 43 +-- src/trans/gpu/internal/leinv_mod.F90 | 41 +-- src/trans/gpu/internal/ltinv_mod.F90 | 6 +- src/trans/gpu/internal/prepsnm_mod.F90 | 6 +- src/trans/gpu/internal/prfi1b_mod.F90 | 6 +- src/trans/gpu/internal/set_resol_mod.F90 | 2 + src/trans/gpu/internal/spnsde_mod.F90 | 11 +- ...tpm_fields_flat.F90 => tpm_fields_gpu.F90} | 21 +- src/trans/gpu/internal/tpm_trans.F90 | 3 - src/trans/gpu/internal/trltom_pack_unpack.F90 | 22 +- src/trans/gpu/internal/trmtol_pack_unpack.F90 | 26 +- src/trans/gpu/internal/updspb_mod.F90 | 6 +- src/trans/gpu/internal/uvtvd_mod.F90 | 8 +- src/trans/gpu/internal/vdtuv_mod.F90 | 8 +- src/trans/sedrenames.txt | 2 +- 27 files changed, 276 insertions(+), 480 deletions(-) rename src/trans/gpu/internal/{tpm_fields_flat.F90 => tpm_fields_gpu.F90} (62%) mode change 100755 => 100644 diff --git a/src/trans/common/internal/tpm_dim.F90 b/src/trans/common/internal/tpm_dim.F90 index 236d08d0d..59287add4 100755 --- a/src/trans/common/internal/tpm_dim.F90 +++ b/src/trans/common/internal/tpm_dim.F90 @@ -49,10 +49,4 @@ MODULE TPM_DIM TYPE(DIM_TYPE),ALLOCATABLE,TARGET :: DIM_RESOL(:) TYPE(DIM_TYPE),POINTER :: R -! flat copies of above -INTEGER(KIND=JPIM) :: R_NSMAX ! Truncation order -INTEGER(KIND=JPIM) :: R_NTMAX ! Truncation order for tendencies -INTEGER(KIND=JPIM) :: R_NDGNH ! Number of rows in northern hemisphere -INTEGER(KIND=JPIM) :: R_NDGL ! Number of rows of latitudes - END MODULE TPM_DIM diff --git a/src/trans/common/internal/tpm_distr.F90 b/src/trans/common/internal/tpm_distr.F90 index 6a151192f..dc62c94c6 100755 --- a/src/trans/common/internal/tpm_distr.F90 +++ b/src/trans/common/internal/tpm_distr.F90 @@ -178,29 +178,5 @@ MODULE TPM_DISTR TYPE(DISTR_TYPE),ALLOCATABLE,TARGET :: DISTR_RESOL(:) TYPE(DISTR_TYPE),POINTER :: D -!flat versions of the above -INTEGER(KIND=JPIM) :: D_NUMP ! No. of spectral waves handled by this processor -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MYMS(:) ! Wave numbers handled by this PE -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT0B(:) ! Start adresses for segments within buffer - ! (according to processors to whom data - ! is going to be sent) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGT1B(:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCL(:) ! Process responsible for each lat. (F.S) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB1(:,:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NASM0(:) ! Address in a spectral array of (m, n=m) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NSTAGTF(:) ! Offset for specific latitude in -INTEGER(KIND=JPIM) :: D_NDGL_FS ! Number of rows of latitudes for which this process is - ! performing Fourier Space calculations -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_MSTABF(:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPNTGTB0(:,:) -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPROCM(:) ! Process that does the calc. for certain -INTEGER(KIND=JPIM) ,ALLOCATABLE :: D_NPTRLS(:) ! Pointer to first lat. (F.S) - - -! The offsets in the input and output arrays to the gemms. -! (1) are the offsets in the "inputs" of dirtrans ("outputs" invtrans) -! (2) are the offsets in the "outputs" of invtrans ("inputs" dirtrans) -INTEGER(KIND=JPIM), POINTER :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:) - END MODULE TPM_DISTR diff --git a/src/trans/common/internal/tpm_geometry.F90 b/src/trans/common/internal/tpm_geometry.F90 index 1ff1a1be9..48454a371 100644 --- a/src/trans/common/internal/tpm_geometry.F90 +++ b/src/trans/common/internal/tpm_geometry.F90 @@ -34,11 +34,4 @@ MODULE TPM_GEOMETRY TYPE(GEOM_TYPE),ALLOCATABLE,TARGET :: GEOM_RESOL(:) TYPE(GEOM_TYPE),POINTER :: G -!flat copies of the above -INTEGER(KIND=JPIM),ALLOCATABLE :: G_NDGLU(:) ! NUMBER OF HEMISPERIC LATITUDES -INTEGER(KIND=JPIM),ALLOCATABLE :: G_NMEN(:) ! ASSOCIATED CUT-OFF WAVE NUMBER -INTEGER(KIND=JPIM) :: G_NMEN_MAX -INTEGER(KIND=JPIM),ALLOCATABLE :: G_NLOEN(:) ! NUMBER OF POINTS ON A PARALLEL -INTEGER(KIND=JPIM) :: G_NLOEN_MAX - END MODULE TPM_GEOMETRY diff --git a/src/trans/gpu/external/gpnorm_trans.F90 b/src/trans/gpu/external/gpnorm_trans.F90 index 7a691a538..711cd7f25 100755 --- a/src/trans/gpu/external/gpnorm_trans.F90 +++ b/src/trans/gpu/external/gpnorm_trans.F90 @@ -59,11 +59,9 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) USE TPM_GEN, ONLY: NOUT USE TPM_DIM, ONLY: R USE TPM_TRANS, ONLY: LGPNORM, NGPBLKS, NPROMA -USE TPM_DISTR, ONLY: D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, D_NSTAGTF, & - & D_NPTRLS, MYPROC -USE TPM_GEOMETRY, ONLY: G, G_NLOEN +USE TPM_DISTR, ONLY: D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, MYPROC +USE TPM_GEOMETRY, ONLY: G USE TPM_FIELDS, ONLY: F -USE TPM_FIELDS_FLAT, ONLY: F_RW USE SET_RESOL_MOD, ONLY: SET_RESOL USE SET2PE_MOD, ONLY: SET2PE USE MPL_MODULE, ONLY: MPL_RECV, MPL_SEND, JP_BLOCKING_STANDARD @@ -142,6 +140,7 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) CALL ABORT_TRANS('GPNORM_TRANS:THIRD DIMENSION OF PGP TOO SMALL ') ENDIF +ASSOCIATE(F_RW=>F%RW, D_NSTAGTF=>D%NSTAGTF, D_NPTRLS=>D%NPTRLS, G_NLOEN=>G%NLOEN) IF_GP=KFIELDS IF_SCALARS_G=KFIELDS @@ -253,6 +252,8 @@ SUBROUTINE GPNORM_TRANS(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) !$ACC end data CALL GSTATS(1429,1) +END ASSOCIATE + ! from here rest on CPU ! IT IS IMPORTANT THAT SUMS ARE NOW DONE IN LATITUDE ORDER diff --git a/src/trans/gpu/external/gpnorm_trans_gpu.F90 b/src/trans/gpu/external/gpnorm_trans_gpu.F90 index d72b7a1bf..36f87875d 100755 --- a/src/trans/gpu/external/gpnorm_trans_gpu.F90 +++ b/src/trans/gpu/external/gpnorm_trans_gpu.F90 @@ -58,9 +58,9 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) USE TPM_GEN, ONLY: NOUT USE TPM_DIM, ONLY: R USE TPM_TRANS, ONLY: LGPNORM, NGPBLKS, NPROMA -USE TPM_DISTR, ONLY: D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, D_NSTAGTF, D_NPTRLS -USE TPM_GEOMETRY, ONLY: G, G_NLOEN, G_NLOEN_MAX -USE TPM_FIELDS_FLAT, ONLY: F_RW +USE TPM_DISTR, ONLY: D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC +USE TPM_GEOMETRY, ONLY: G +USE TPM_FIELDS, ONLY: F USE SET_RESOL_MOD, ONLY: SET_RESOL USE TRGTOL_MOD, ONLY: TRGTOL USE SET2PE_MOD, ONLY: SET2PE @@ -197,17 +197,18 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) IBEG=1 IEND=D%NDGL_FS +ASSOCIATE(D_NSTAGTF=>D%NSTAGTF,D_NPTRLS=>D%NPTRLS,G_NLOEN=>G%NLOEN) + CALL GSTATS(1429,0) IF( IF_FS > 0 )THEN #ifdef ACCGPU !$ACC DATA & - !$ACC& COPY(F_RW) & - !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN,G_NLOEN_MAX) & + !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN) & !$ACC& PRESENT(ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) #endif #ifdef OMPGPU - !$OMP TARGET DATA MAP(TO:F_RW,D,D_NSTAGTF,D_NPTRLS,G_NLOEN,G_NLOEN_MAX) & + !$OMP TARGET DATA MAP(TO:F,D,D_NSTAGTF,D_NPTRLS,G_NLOEN) & !$OMP& MAP(PRESENT,ALLOC:ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO #endif @@ -273,8 +274,8 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) DO JGL=IBEG,IEND IGL = D_NPTRLS(MYSETW) + JGL - 1 DO JF=1,IF_FS - ZAVE(JF,JGL)=ZAVE(JF,JGL)*F_RW(IGL)/G_NLOEN(IGL) - !write(iunit,*) 'aver inside ',JF,IF_FS,IGL,ZAVE(JF,JGL), F_RW(IGL), G_NLOEN(IGL),ZMINGPN(JF),ZMAXGPN(JF) + ZAVE(JF,JGL)=ZAVE(JF,JGL)*F%RW(IGL)/G_NLOEN(IGL) + !write(iunit,*) 'aver inside ',JF,IF_FS,IGL,ZAVE(JF,JGL), F%RW(IGL), G_NLOEN(IGL),ZMINGPN(JF),ZMAXGPN(JF) ENDDO ENDDO #ifdef ACCGPU @@ -316,6 +317,8 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ENDIF CALL GSTATS(1429,1) +END ASSOCIATE + ! from here rest on CPU ! IT IS IMPORTANT THAT SUMS ARE NOW DONE IN LATITUDE ORDER diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index bf1cec9db..591ef25ab 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -107,16 +107,11 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE EC_ENV_MOD, ONLY: EC_GETENV USE TPM_GEN, ONLY: NOUT, MSETUP0, NCUR_RESOL, NDEF_RESOL, & & NMAX_RESOL, NPRINTLEV, LENABLED, NERR -USE TPM_DIM, ONLY: R, DIM_RESOL, R_NSMAX,R_NTMAX, R_NDGNH, R_NDGL -USE TPM_DISTR, ONLY: D, DISTR_RESOL, NPROC, NPRTRV, D_NUMP, D_NDGL_FS, D_MYMS, & - & D_NSTAGT0B, D_NSTAGT1B, D_NPROCL, D_NPNTGTB1, D_NASM0, & - & D_NSTAGTF, D_MSTABF, D_NPNTGTB0, D_NPROCM, D_NPTRLS, & - & MYPROC, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 -USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX, G_NLOEN, & - & G_NLOEN_MAX +USE TPM_DIM, ONLY: R, DIM_RESOL +USE TPM_DISTR, ONLY: D, DISTR_RESOL, NPROC, NPRTRV, MYPROC +USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL USE TPM_FIELDS, ONLY: FIELDS_RESOL, F -USE TPM_FIELDS_FLAT, ONLY: F_RW, F_RLAPIN, F_RACTHE, ZEPSNM, & - & ZAA, ZAS, ZAA0, ZAS0, KMLOC0 +USE TPM_FIELDS_GPU, ONLY: FIELDS_GPU_RESOL, FG USE TPM_FLT, ONLY: FLT_RESOL, S USE TPM_CTL, ONLY: CTL_RESOL, C USE SET_RESOL_MOD, ONLY: SET_RESOL @@ -173,7 +168,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& INTEGER(KIND=JPIM) :: JGL,JRES,IDEF_RESOL INTEGER(KIND=JPIM) :: JMLOC, KM, ILA, ILS, KMLOC, KDGLU, JK, I, J -INTEGER(KIND=JPIM) :: IPROC, IPROCS, ISTAN, ISTAS, ISL, IGLS, JFLD +INTEGER(KIND=JPIM) :: IPROC, IPROCS, ISTAN, ISTAS, ISL, IGLS, JFLD, IMLOC0(1) LOGICAL :: LLP1,LLP2, LLSPSETUPONLY REAL(KIND=JPRD) :: ZTIME0,ZTIME1,ZTIME2 @@ -232,6 +227,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IDEF_RESOL = 1 ALLOCATE(DIM_RESOL(NMAX_RESOL)) ALLOCATE(FIELDS_RESOL(NMAX_RESOL)) + ALLOCATE(FIELDS_GPU_RESOL(NMAX_RESOL)) ALLOCATE(GEOM_RESOL(NMAX_RESOL)) ALLOCATE(DISTR_RESOL(NMAX_RESOL)) ALLOCATE(FLT_RESOL(NMAX_RESOL)) @@ -457,15 +453,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& IF( .NOT.D%LGRIDONLY ) THEN -!allocating arrays for the GPU: -!! CALL EC_GETENV("ECTRANS_GPU_NFLEV",CENV) -!! IF(LEN_TRIM(CENV)>0) THEN -!! WRITE(NOUT,'(2A)') "Using temporary solution for buffer allocation using ${ECTRANS_GPU_NFLEV}=",CENV -!! READ(CENV,*) NFLEV0 -!! ELSE -!! NFLEV0 = ceiling(REAL(IMAXFLD)/NPRTRV) -!! ENDIF - IUNIT=300+MYPROC #ifdef ACCGPU @@ -481,33 +468,13 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& WRITE(NOUT,*) 'R%NTMAX=',R%NTMAX WRITE(NOUT,*) 'R%NSMAX=',R%NSMAX -#ifdef ACCGPU -!$ACC ENTER DATA COPYIN(F,S,D,R,G) -!$ACC ENTER DATA & -!$ACC& COPYIN(F%RN,F%RLAPIN) & -!$ACC& COPYIN(S%FA,S%ITHRESHOLD) & -!$ACC& COPYIN(D%NUMP,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM,D%NPTRLS,D%MSTABF) & -!$ACC& COPYIN(R%NDGNH,R%NSMAX) & -!$ACC& COPYIN(G%NDGLU,G%NMEN,G%NLOEN) - -#endif -#ifdef OMPGPU -!$OMP TARGET ENTER DATA MAP(ALLOC:ZAA,ZAS) -!$OMP TARGET ENTER DATA MAP(TO:F,S,D,D_NUMP,D_MYMS,R,R_NDGNH,R_NSMAX,G,G_NDGLU) -!$OMP TARGET ENTER DATA MAP(TO:D_NPNTGTB0,D_NPNTGTB1,D_NSTAGT0B,D_NSTAGT1B,D_NSTAGTF,G_NMEN,D_NPROCM,D_NPTRLS,G,G_NLOEN,D_MSTABF) -#endif - ! Initialize A arrays -ALLOCATE(ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) -ALLOCATE(ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) +ALLOCATE(FG%ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) +ALLOCATE(FG%ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) -WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP -WRITE(NOUT,*)'ZAS:',size(ZAS) -WRITE(NOUT,*)'ZAA:',size(ZAA) - -ZAA(:,:,:) = 0._JPRBT -ZAS(:,:,:) = 0._JPRBT +FG%ZAA(:,:,:) = 0._JPRBT +FG%ZAS(:,:,:) = 0._JPRBT DO JMLOC=1,D%NUMP KM = D%MYMS(JMLOC) @@ -515,203 +482,62 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ILA = (R%NSMAX-KM+2)/2 ILS = (R%NSMAX-KM+3)/2 - ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) - ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + FG%ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) + FG%ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) ENDDO -! permanent copy of Legendre polynomials into device - -#ifdef ACCGPU -!$ACC ENTER DATA COPYIN(ZAA,ZAS) -#endif -#ifdef OMPGPU -#endif +! arrays for m=0 in ledir_mod: +IMLOC0 = FINDLOC(D%MYMS,0) +IF(IMLOC0(1) > 0) THEN + ALLOCATE(FG%ZAA0(SIZE(FG%ZAA,1),SIZE(FG%ZAA,2))) + ALLOCATE(FG%ZAS0(SIZE(FG%ZAS,1),SIZE(FG%ZAS,2))) + FG%ZAA0 = FG%ZAA(:,:,IMLOC0(1)) + FG%ZAS0 = FG%ZAS(:,:,IMLOC0(1)) +ENDIF -ALLOCATE(ZEPSNM(D%NUMP,0:R%NTMAX+2)) -WRITE(NOUT,*)'ZEPSNM :',SIZE(ZEPSNM) +ALLOCATE(FG%ZEPSNM(D%NUMP,0:R%NTMAX+2)) +FG%ZEPSNM = 0._JPRBT +CALL PREPSNM !Initialize on the host -ZEPSNM = 0._JPRBT -! on the host -CALL PREPSNM +WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP #ifdef ACCGPU -!$ACC ENTER DATA COPYIN(ZEPSNM) + WRITE(NOUT,*) 'Using OpenACC' #endif - -! TODO: I guess tose might be needed again -! add arrays for GPNORM1 -!ALLOCATE(ZAVE(IF_FS,R%NDGL)) -!ALLOCATE(ZMINGL(IF_FS,R%NDGL)) -!ALLOCATE(ZMAXGL(IF_FS,R%NDGL)) -!ALLOCATE(ZMINGPN(IF_FS)) -!ALLOCATE(ZMAXGPN(IF_FS)) -! -!ZAVE = 0._JPRBT -!ZMINGL = 0._JPRBT -!ZMAXGL = 0._JPRBT -!ZMINGPN = 0._JPRBT -!ZMAXGPN = 0._JPRBT -!#ifdef ACCGPU -!!$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) -!#endif - -!set up flat copies of constant data -R_NSMAX=R%NSMAX -R_NTMAX=R%NTMAX -R_NDGNH=R%NDGNH -R_NDGL=R%NDGL - - -ALLOCATE(D_NSTAGT0B(SIZE(D%NSTAGT0B))) -ALLOCATE(D_NSTAGT1B(SIZE(D%NSTAGT1B))) -ALLOCATE(D_NPNTGTB0(0:SIZE(D%NPNTGTB0,1)-1,SIZE(D%NPNTGTB0,2))) -ALLOCATE(D_NPNTGTB1(SIZE(D%NPNTGTB1,1),SIZE(D%NPNTGTB1,2))) -ALLOCATE(D_MYMS(SIZE(D%MYMS))) -ALLOCATE(D_NPROCL(SIZE(D%NPROCL))) -ALLOCATE(D_NASM0(0:SIZE(D%NASM0)-1)) -ALLOCATE(D_NSTAGTF(SIZE(D%NSTAGTF))) -ALLOCATE(D_MSTABF(SIZE(D%MSTABF))) -ALLOCATE(D_NPROCM(0:SIZE(D%NPROCM)-1)) -ALLOCATE(D_NPTRLS(SIZE(D%NPTRLS))) - -ALLOCATE(G_NDGLU(0:SIZE(G%NDGLU)-1)) -ALLOCATE(G_NMEN(SIZE(G%NMEN))) -ALLOCATE(G_NLOEN(SIZE(G%NLOEN))) - -ALLOCATE(F_RW(SIZE(F%RW))) -ALLOCATE(F_RLAPIN(-1:SIZE(F%RLAPIN)-2)) -ALLOCATE(F_RACTHE(SIZE(F%RACTHE))) - - -DO I=0,SIZE(G%NDGLU)-1 - G_NDGLU(I)=G%NDGLU(I) -END DO - -G_NMEN_MAX=0 -DO I=1,SIZE(G%NMEN) - G_NMEN(I)=G%NMEN(I) - IF (G_NMEN(I) .GT. G_NMEN_MAX) G_NMEN_MAX=G_NMEN(I) -END DO - -G_NLOEN_MAX=0 -DO I=1,SIZE(G%NLOEN) - G_NLOEN(I)=G%NLOEN(I) - IF (G_NLOEN(I) .GT. G_NLOEN_MAX) G_NLOEN_MAX=G_NLOEN(I) -END DO - -DO I=1,SIZE(D%NSTAGT0B) - D_NSTAGT0B(I)=D%NSTAGT0B(I) -END DO - -DO I=1,SIZE(D%NSTAGT1B) - D_NSTAGT1B(I)=D%NSTAGT1B(I) -END DO - -DO I=1,SIZE(D%NPROCL) - D_NPROCL(I)=D%NPROCL(I) -END DO - -DO I=0,SIZE(D%NASM0)-1 - D_NASM0(I)=D%NASM0(I) -END DO - -DO I=1,SIZE(D%NSTAGTF) - D_NSTAGTF(I)=D%NSTAGTF(I) -END DO - -DO I=1,SIZE(D%MSTABF) - D_MSTABF(I)=D%MSTABF(I) -END DO - -DO I=0,SIZE(D%NPROCM)-1 - D_NPROCM(I)=D%NPROCM(I) -END DO - -DO I=1,SIZE(D%NPTRLS) - D_NPTRLS(I)=D%NPTRLS(I) -END DO - -DO I=1,SIZE(D%NPNTGTB0,2) - DO J=0,SIZE(D%NPNTGTB0,1)-1 - D_NPNTGTB0(J,I)=D%NPNTGTB0(J,I) - END DO -END DO - -DO I=1,SIZE(D%NPNTGTB1,2) - DO J=1,SIZE(D%NPNTGTB1,1) - D_NPNTGTB1(J,I)=D%NPNTGTB1(J,I) - END DO -END DO - -D_OFFSETS_GEMM1 => D%OFFSETS_GEMM1 -D_OFFSETS_GEMM2 => D%OFFSETS_GEMM2 #ifdef OMPGPU + WRITE(NOUT,*) 'Using OpenMP offloading' #endif -#ifdef ACCGPU -!$ACC ENTER DATA COPYIN(D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) -#endif - -D_NUMP=D%NUMP -D_NDGL_FS=D%NDGL_FS +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', SIZEOF(FG%ZAS) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', SIZEOF(FG%ZAA) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', SIZEOF(FG%ZAS0) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', SIZEOF(FG%ZAA0) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', SIZEOF(FG%ZEPSNM) -KMLOC0 = -1 -DO I=1,SIZE(D%MYMS) - D_MYMS(I)=D%MYMS(I) - IF(D_MYMS(I) == 0) KMLOC0 = I -end DO - -! arrays for m=0 in ledir_mod: -IF(KMLOC0 >= 0) THEN - ALLOCATE(ZAA0(SIZE(ZAA,1),SIZE(ZAA,2))) - ALLOCATE(ZAS0(SIZE(ZAS,1),SIZE(ZAS,2))) - ZAA0 = ZAA(:,:,KMLOC0) - ZAS0 = ZAS(:,:,KMLOC0) +IF (IMLOC0(1) > 0) THEN #ifdef ACCGPU - !$ACC ENTER DATA COPYIN(ZAA0,ZAS0) -#endif -#ifdef OMPGPU - !$OMP TARGET ENTER DATA MAP(TO:ZAA0,ZAS0) -#endif - WRITE(NOUT,*) 'GPU arrays for m=0 successfully allocated' -#ifdef ACCGPU - WRITE(NOUT,*) 'Using OpenACC' + !$ACC ENTER DATA COPYIN(FG%ZAA0,FG%ZAS0) ASYNC(1) #endif #ifdef OMPGPU - WRITE(NOUT,*) 'Using OpenMP offloading' + !$OMP TARGET ENTER DATA MAP(TO:FG%ZAA0,FG%ZAS0) #endif ENDIF - -DO I=1,SIZE(F%RW) - F_RW(I)=F%RW(I) -END DO -DO I=-1,SIZE(F%RLAPIN)-2 - F_RLAPIN(I)=REAL(F%RLAPIN(I),JPRBT) -END DO -DO I=1,SIZE(F%RACTHE) - F_RACTHE(I)=F%RACTHE(I) -END DO - #ifdef ACCGPU -!$ACC ENTER DATA COPYIN(R_NSMAX,R_NTMAX,R_NDGL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B,& -!$ACC& D_NPNTGTB1,D_NPROCL,D_NUMP,D_NDGL_FS,D_MYMS,D_NASM0,D_NSTAGTF,D_MSTABF,& -!$ACC& D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN,& -!$ACC& G_NLOEN_MAX,F_RW,F_RLAPIN,F_RACTHE) +!$ACC ENTER DATA COPYIN(R) ASYNC(1) +!$ACC ENTER DATA COPYIN(F,F%RLAPIN,F%RACTHE,F%RW) ASYNC(1) +!$ACC ENTER DATA COPYIN(FG,FG%ZAA,FG%ZAS,FG%ZEPSNM) ASYNC(1) +!$ACC ENTER DATA COPYIN(D,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM)& +!$ACC& COPYIN(D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,D%OFFSETS_GEMM2) ASYNC(1) +!$ACC ENTER DATA COPYIN(G,G%NDGLU,G%NMEN,G%NLOEN) ASYNC(1) +!$ACC WAIT(1) #endif #ifdef OMPGPU -!$OMP TARGET ENTER DATA MAP(TO:R_NSMAX,R_NTMAX,R_NDGL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B) -!$OMP TARGET ENTER DATA MAP(TO:D_NPNTGTB1,D_NPROCL,D_NUMP,D_NDGL_FS,D_MYMS,D_NASM0,D_NSTAGTF,D_MSTABF) -!$OMP TARGET ENTER DATA MAP(TO:D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN) -!$OMP TARGET ENTER DATA MAP(TO:G_NLOEN_MAX,F_RW,F_RLAPIN,F_RACTHE) +!$OMP TARGET ENTER DATA MAP(ALLOC:FG%ZAA,FG%ZAS) +!$OMP TARGET ENTER DATA MAP(TO:FG,F,S,D,R,G) +!$OMP BARRIER #endif WRITE(NOUT,*) '===GPU arrays successfully allocated' -#ifdef ACCGPU -!$ACC wait -#endif -#ifdef OMPGPU -!$OMP BARRIER -#endif -! free memory +! TODO: This might be good idea - those polynomials are not needed !DO JMLOC=1,D%NUMP ! DEALLOCATE(S%FA(JMLOC)%RPNMA) ! DEALLOCATE(S%FA(JMLOC)%RPNMS) @@ -719,6 +545,6 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !endif INTERFACE -ENDIF +ENDIF ! D%LGRIDONLY END SUBROUTINE SETUP_TRANS diff --git a/src/trans/gpu/external/trans_end.F90 b/src/trans/gpu/external/trans_end.F90 index 5329c8598..79e204dea 100755 --- a/src/trans/gpu/external/trans_end.F90 +++ b/src/trans/gpu/external/trans_end.F90 @@ -47,16 +47,14 @@ SUBROUTINE TRANS_END(CDMODE) !ifndef INTERFACE USE TPM_GEN, ONLY: MSETUP0, NCUR_RESOL, NMAX_RESOL, LENABLED, NDEF_RESOL -USE TPM_DIM, ONLY: R, DIM_RESOL, R_NSMAX, R_NTMAX, R_NDGNH, R_NDGL -USE TPM_DISTR, ONLY: D, DISTR_RESOL, NPRCIDS, D_NUMP, D_MYMS, D_NSTAGT0B, D_NSTAGT1B, & - & D_NPROCL, D_NPNTGTB1, D_NASM0, D_NSTAGTF, D_MSTABF, D_NPNTGTB0, & - & D_NPROCM, D_NPTRLS -USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL, G_NDGLU, G_NMEN, G_NMEN_MAX,G_NLOEN, G_NLOEN_MAX +USE TPM_DIM, ONLY: R, DIM_RESOL +USE TPM_DISTR, ONLY: D, DISTR_RESOL, NPRCIDS +USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL USE TPM_FIELDS, ONLY: F, FIELDS_RESOL -USE TPM_FIELDS_FLAT, ONLY: F_RW, ZEPSNM, ZAA, ZAS, ZAA0, ZAS0 +USE TPM_FIELDS_GPU, ONLY: FG, FIELDS_GPU_RESOL USE TPM_CTL, ONLY: C, CTL_RESOL USE TPM_FLT, ONLY: S, FLT_RESOL -USE TPM_TRANS, ONLY: FOUBUF, FOUBUF_IN +USE TPM_TRANS, ONLY: GROWING_ALLOCATION USE EQ_REGIONS_MOD, ONLY: N_REGIONS USE SET_RESOL_MOD, ONLY: SET_RESOL USE DEALLOC_RESOL_MOD, ONLY: DEALLOC_RESOL @@ -67,32 +65,12 @@ SUBROUTINE TRANS_END(CDMODE) ! Local variables INTEGER(KIND=JPIM) :: JRES CHARACTER*5 :: CLMODE + ! ------------------------------------------------------------------ CLMODE='FINAL' IF (PRESENT(CDMODE)) CLMODE=CDMODE IF (CLMODE == 'FINAL') THEN -#ifdef ACCGPU - !$ACC EXIT DATA DELETE(ZAA0,ZAS0,ZEPSNM,ZAA,ZAS) -#endif -#ifdef OMPGPU -#endif - DEALLOCATE(ZAA0) - DEALLOCATE(ZAS0) - DEALLOCATE(ZEPSNM) - DEALLOCATE(ZAA) - DEALLOCATE(ZAS) - - - DEALLOCATE(D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_MYMS,D_NPROCL,D_NASM0,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,G_NDGLU,G_NMEN,G_NLOEN,F_RW) -#ifdef ACCGPU - !$ACC EXIT DATA DELETE(R_NSMAX,R_NTMAX,R_NDGL,R_NDGNH,D_NSTAGT0B,D_NSTAGT1B,D_NPNTGTB1,D_NPROCL, D_NUMP,D_MYMS, & - !$ACC& G_NDGLU,G_NMEN,G_NMEN_MAX,G_NLOEN,G_NLOEN_MAX,D_NSTAGTF,D_MSTABF,D_NPNTGTB0,D_NPROCM,D_NPTRLS,D_NASM0,F_RW) - -#endif -#ifdef OMPGPU - !$OMP TARGET EXIT DATA MAP(DELETE: ) -#endif !CALL HIP_DGEMM_BATCHED_FINALIZE() IF( ALLOCATED( LENABLED ) ) THEN @@ -122,15 +100,14 @@ SUBROUTINE TRANS_END(CDMODE) NULLIFY(F) IF( ALLOCATED(FIELDS_RESOL) ) DEALLOCATE(FIELDS_RESOL) + !TPM_FIELDS_GPU + NULLIFY(FG) + IF( ALLOCATED(FIELDS_GPU_RESOL) ) DEALLOCATE(FIELDS_GPU_RESOL) !TPM_GEOMETRY NULLIFY(G) IF( ALLOCATED(GEOM_RESOL) ) DEALLOCATE(GEOM_RESOL) - !TPM_TRANS - IF(ALLOCATED(FOUBUF_IN)) DEALLOCATE(FOUBUF_IN) - IF(ALLOCATED(FOUBUF)) DEALLOCATE(FOUBUF) - MSETUP0 = 0 NMAX_RESOL = 0 NCUR_RESOL = 0 diff --git a/src/trans/gpu/internal/cdmap_mod.F90 b/src/trans/gpu/internal/cdmap_mod.F90 index e0e7d0b55..2c1210632 100755 --- a/src/trans/gpu/internal/cdmap_mod.F90 +++ b/src/trans/gpu/internal/cdmap_mod.F90 @@ -17,8 +17,9 @@ SUBROUTINE CDMAP(KM,KMLOC,KSL,KSLO,PEPSNM, KDIR, KDGNH, KDGNHD,& USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK USE TPM_FLT, ONLY: S USE TPM_DISTR, ONLY: D -USE TPM_TRANS, ONLY: FOUBUF_IN, FOUBUF +!USE TPM_TRANS, ONLY: FOUBUF_IN, FOUBUF USE SEEFMM_MIX, ONLY: SEEFMM_MULM +USE MPL_MODULE, ONLY: MPL_ABORT !**** *CDMAP* - REMAP ROOTS ! @@ -80,6 +81,8 @@ SUBROUTINE CDMAP(KM,KMLOC,KSL,KSLO,PEPSNM, KDIR, KDGNH, KDGNHD,& ! ------------------------------------------------------------------ +CALL MPL_ABORT("this routine is not supported right now") + !* 1. PERFORM LEGENDRE TRANFORM. ! -------------------------- @@ -121,10 +124,10 @@ SUBROUTINE CDMAP(KM,KMLOC,KSL,KSLO,PEPSNM, KDIR, KDGNH, KDGNHD,& DO IGL=KSLO,KDGNHD IGLS = 2*KDGNHD+1-IGL DO JF=1,KFIELDS - FOUBUF_IN(ISTN(IGL)+JF) = S%FA(KMLOC)%RPNMWO(IGL-KSLO+1,1)*ZALL1(JF,IGL) & - & - S%FA(KMLOC)%RPNMWO(IGL-KSLO+1,2)*ZALL(JF,IGL) - FOUBUF_IN(ISTS(IGL)+JF) = S%FA(KMLOC)%RPNMWO(IGLS-KSLO+1,1)*ZALL1(JF,IGLS) & - & - S%FA(KMLOC)%RPNMWO(IGLS-KSLO+1,2)*ZALL(JF,IGLS) + !FOUBUF_IN(ISTN(IGL)+JF) = S%FA(KMLOC)%RPNMWO(IGL-KSLO+1,1)*ZALL1(JF,IGL) & + ! & - S%FA(KMLOC)%RPNMWO(IGL-KSLO+1,2)*ZALL(JF,IGL) + !FOUBUF_IN(ISTS(IGL)+JF) = S%FA(KMLOC)%RPNMWO(IGLS-KSLO+1,1)*ZALL1(JF,IGLS) & + ! & - S%FA(KMLOC)%RPNMWO(IGLS-KSLO+1,2)*ZALL(JF,IGLS) ENDDO ENDDO DEALLOCATE(ZALL1) @@ -154,8 +157,8 @@ SUBROUTINE CDMAP(KM,KMLOC,KSL,KSLO,PEPSNM, KDIR, KDGNH, KDGNHD,& DO JGL=KSLO, KDGNHD IGLS = 2*KDGNHD+1-JGL DO JF=1,KFIELDS - ZQX(JF,JGL)=FOUBUF(ISTN(JGL)+JF) - ZQX(JF,IGLS)=FOUBUF(ISTS(JGL)+JF) + !ZQX(JF,JGL)=FOUBUF(ISTN(JGL)+JF) + !ZQX(JF,IGLS)=FOUBUF(ISTS(JGL)+JF) ENDDO ENDDO diff --git a/src/trans/gpu/internal/dealloc_resol_mod.F90 b/src/trans/gpu/internal/dealloc_resol_mod.F90 index 8c8726506..3b84e60e0 100755 --- a/src/trans/gpu/internal/dealloc_resol_mod.F90 +++ b/src/trans/gpu/internal/dealloc_resol_mod.F90 @@ -1,5 +1,6 @@ ! (C) Copyright 2013- ECMWF. ! (C) Copyright 2013- Meteo-France. +! (C) Copyright 2024- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -42,12 +43,13 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) ! ------------------------------------------------------------------ USE PARKIND_ECTRANS, ONLY: JPIM -USE TPM_DIM, ONLY: R +USE TPM_DIM, ONLY: R, DIM_TYPE USE TPM_GEN, ONLY: LENABLED, NOUT, NDEF_RESOL -USE TPM_DISTR, ONLY: D, NPRTRV -USE TPM_GEOMETRY, ONLY: G -USE TPM_FIELDS, ONLY: F -USE TPM_FLT, ONLY: S +USE TPM_DISTR, ONLY: D, DISTR_TYPE, NPRTRV +USE TPM_GEOMETRY, ONLY: G, GEOM_TYPE +USE TPM_FIELDS, ONLY: F, FIELDS_TYPE +USE TPM_FIELDS_GPU, ONLY: FG, FIELDS_GPU_TYPE +USE TPM_FLT, ONLY: S, FLT_TYPE_WRAP USE TPM_CTL, ONLY: C USE SEEFMM_MIX, ONLY: FREE_SEEFMM USE SET_RESOL_MOD, ONLY: SET_RESOL @@ -57,6 +59,12 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) INTEGER(KIND=JPIM), INTENT(IN) :: KRESOL INTEGER(KIND=JPIM) :: JMLOC,IPRTRV,JSETV,IMLOC,IM,ILA,ILS, JRESOL +TYPE(DIM_TYPE) :: R_ +TYPE(DISTR_TYPE) :: D_ +TYPE(GEOM_TYPE) :: G_ +TYPE(FIELDS_TYPE) :: F_ +TYPE(FIELDS_GPU_TYPE) :: FG_ +TYPE(FLT_TYPE_WRAP) :: S_ ! ------------------------------------------------------------------ @@ -68,7 +76,27 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) CALL SET_RESOL(KRESOL) - !TPM_FLT +#ifdef ACCGPU +!$ACC EXIT DATA DELETE(R) ASYNC(1) +!$ACC EXIT DATA DELETE(FG,FG%ZAA0,FG%ZAS0) IF(ALLOCATED(FG%ZAA0)) ASYNC(1) +!$ACC EXIT DATA DELETE(FG,FG%ZAA,FG%ZAS,FG%ZEPSNM) ASYNC(1) +!$ACC EXIT DATA DELETE(F,F%RLAPIN,F%RACTHE,F%RW) ASYNC(1) +!$ACC EXIT DATA DELETE(D,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,D%NPROCM)& +!$ACC& DELETE(D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,D%OFFSETS_GEMM2) ASYNC(1) +!$ACC EXIT DATA DELETE(G,G%NDGLU,G%NMEN,G%NLOEN) ASYNC(1) +!$ACC WAIT(1) +#endif +#ifdef OMPGPU +#endif + + ! Empty all fields (none of them has pointers; allocatable arrays implicitly deallocate) + D = D_ + F = F_ + FG = FG_ + R = R_ + G = G_ + + ! TPM_FLD is more complex because it has pointers IF( ALLOCATED(S%FA) ) THEN DO JMLOC=1,D%NUMP,NPRTRV ! +++++++++++++++++++++ JMLOC LOOP ++++++++++ IPRTRV=MIN(NPRTRV,D%NUMP-JMLOC+1) @@ -91,70 +119,7 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) CALL FREE_SEEFMM(S%FMM_INTI) IF(ASSOCIATED(S%FMM_INTI)) DEALLOCATE(S%FMM_INTI) ENDIF - - !TPM_DISTR - IF(ALLOCATED(D%NFRSTLAT)) DEALLOCATE(D%NFRSTLAT) - IF(ALLOCATED(D%NLSTLAT)) DEALLOCATE(D%NLSTLAT) - IF(ALLOCATED(D%NPTRLAT)) DEALLOCATE(D%NPTRLAT) - IF(ALLOCATED(D%NPTRFRSTLAT)) DEALLOCATE(D%NPTRFRSTLAT) - IF(ALLOCATED(D%NPTRLSTLAT)) DEALLOCATE(D%NPTRLSTLAT) - IF(ALLOCATED(D%LSPLITLAT)) DEALLOCATE(D%LSPLITLAT) - IF(ALLOCATED(D%NSTA)) DEALLOCATE(D%NSTA) - IF(ALLOCATED(D%NONL)) DEALLOCATE(D%NONL) - IF(ALLOCATED(D%NGPTOTL)) DEALLOCATE(D%NGPTOTL) - IF(ALLOCATED(D%NPROCA_GP)) DEALLOCATE(D%NPROCA_GP) - - IF(D%LWEIGHTED_DISTR) THEN - IF(ALLOCATED(D%RWEIGHT)) DEALLOCATE(D%RWEIGHT) - ENDIF - - IF(ALLOCATED(D%MYMS)) DEALLOCATE(D%MYMS) - IF(ALLOCATED(D%NUMPP)) DEALLOCATE(D%NUMPP) - IF(ALLOCATED(D%NPOSSP)) DEALLOCATE(D%NPOSSP) - IF(ALLOCATED(D%NPROCM)) DEALLOCATE(D%NPROCM) - IF(ALLOCATED(D%NDIM0G)) DEALLOCATE(D%NDIM0G) - IF(ALLOCATED(D%NASM0)) DEALLOCATE(D%NASM0) - IF(ALLOCATED(D%NATM0)) DEALLOCATE(D%NATM0) - IF(ALLOCATED(D%NLATLS)) DEALLOCATE(D%NLATLS) - IF(ALLOCATED(D%NLATLE)) DEALLOCATE(D%NLATLE) - IF(ALLOCATED(D%NPMT)) DEALLOCATE(D%NPMT) - IF(ALLOCATED(D%NPMS)) DEALLOCATE(D%NPMS) - IF(ALLOCATED(D%NPMG)) DEALLOCATE(D%NPMG) - IF(ALLOCATED(D%NULTPP)) DEALLOCATE(D%NULTPP) - IF(ALLOCATED(D%NPROCL)) DEALLOCATE(D%NPROCL) - IF(ALLOCATED(D%NPTRLS)) DEALLOCATE(D%NPTRLS) - IF(ALLOCATED(D%NALLMS)) DEALLOCATE(D%NALLMS) - IF(ALLOCATED(D%NPTRMS)) DEALLOCATE(D%NPTRMS) - IF(ALLOCATED(D%NSTAGT0B)) DEALLOCATE(D%NSTAGT0B) - IF(ALLOCATED(D%NSTAGT1B)) DEALLOCATE(D%NSTAGT1B) - IF(ALLOCATED(D%NPNTGTB0)) DEALLOCATE(D%NPNTGTB0) - IF(ALLOCATED(D%NPNTGTB1)) DEALLOCATE(D%NPNTGTB1) - IF(ALLOCATED(D%NLTSFTB)) DEALLOCATE(D%NLTSFTB) - IF(ALLOCATED(D%NLTSGTB)) DEALLOCATE(D%NLTSGTB) - IF(ALLOCATED(D%MSTABF)) DEALLOCATE(D%MSTABF) - IF(ALLOCATED(D%NSTAGTF)) DEALLOCATE(D%NSTAGTF) - - !TPM_FIELDS - IF(ALLOCATED(F%RMU)) DEALLOCATE(F%RMU) - IF(ALLOCATED(F%RW)) DEALLOCATE(F%RW) - IF(ALLOCATED(F%R1MU2)) DEALLOCATE(F%R1MU2) - IF(ALLOCATED(F%RACTHE)) DEALLOCATE(F%RACTHE) - IF(ALLOCATED(F%REPSNM)) DEALLOCATE(F%REPSNM) - IF(ALLOCATED(F%RN)) DEALLOCATE(F%RN) - IF(ALLOCATED(F%RLAPIN)) DEALLOCATE(F%RLAPIN) - IF(ALLOCATED(F%NLTN)) DEALLOCATE(F%NLTN) - IF( S%LKEEPRPNM ) THEN - IF(ALLOCATED(F%RPNM)) DEALLOCATE(F%RPNM) - ENDIF - IF( S%LDLL ) THEN - IF(ALLOCATED(F%RMU2)) DEALLOCATE(F%RMU2) - IF(ALLOCATED(F%RACTHE2)) DEALLOCATE(F%RACTHE2) - ENDIF - - !TPM_GEOMETRY - IF(ALLOCATED(G%NMEN)) DEALLOCATE(G%NMEN) - IF(ALLOCATED(G%NDGLU)) DEALLOCATE(G%NDGLU) - IF(ALLOCATED(G%NLOEN)) DEALLOCATE(G%NLOEN) + S = S_ LENABLED(KRESOL)=.FALSE. NDEF_RESOL = COUNT(LENABLED) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 2254af6fb..949cb303f 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -65,11 +65,11 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE ! ------------------------------------------------------------------ USE TPM_TRANS, ONLY: LATLON -USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D_NUMP, D_NPTRLS, D_NSTAGTF -USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN, G_NLOEN_MAX -USE TPM_FIELDS_FLAT, ONLY: F_RACTHE +USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D +USE TPM_GEOMETRY, ONLY: G +USE TPM_FIELDS, ONLY: F USE TPM_GEN, ONLY: NOUT -USE TPM_DIM, ONLY: R_NSMAX +USE TPM_DIM, ONLY: R ! IMPLICIT NONE @@ -83,13 +83,14 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE REAL(KIND=JPRBT) :: ZACHTE2 REAL(KIND=JPRBT) :: ZAMP, ZPHASE -INTEGER(KIND=JPIM) :: IOFF_LAT,OFFSET_VAR +INTEGER(KIND=JPIM) :: IOFF_LAT,OFFSET_VAR,ILOEN_MAX INTEGER(KIND=JPIM) :: IOFF_SCALARS,IOFF_SCALARS_EWDER,IOFF_UV,IOFF_UV_EWDER,IOFF_KSCALARS_NSDER INTEGER(KIND=JPIM) :: JF,IGLG,II,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX - +ASSOCIATE(D_NUMP=>D%NUMP, D_NPTRLS=>D%NPTRLS, D_NSTAGTF=>D%NSTAGTF, G_NMEN=>G%NMEN, & + & G_NLOEN=>G%NLOEN, F_RACTHE=>F%RACTHE, R_NSMAX=>R%NSMAX) ! ------------------------------------------------------------------ IF(MYPROC > NPROC/2)THEN @@ -104,7 +105,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE #ifdef ACCGPU !$ACC DATA & -!$ACC& PRESENT(D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F_RACTHE,G_NMEN,G_NLOEN, G_NLOEN_MAX, R_NSMAX) +!$ACC& PRESENT(D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F_RACTHE,G_NMEN,G_NLOEN, R_NSMAX) #endif #ifdef OMPGPU !$OMP TARGET DATA MAP(PRESENT,ALLOC:ZGTF) & @@ -182,6 +183,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !* 2.1 U AND V. +ILOEN_MAX = MAXVAL(G_NLOEN) IF (KUV_EWDER_OFFSET >= 0) THEN #ifdef OMPGPU !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(KF_UV,PUVDERS,ZACHTE2,PUV) @@ -192,7 +194,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE #endif DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV - DO JM=0,G_NLOEN_MAX/2 + DO JM=0,ILOEN_MAX/2 IGLG = OFFSET_VAR+KGL-1 ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have ! to fill those floor(NLON/2)+1 values. @@ -233,7 +235,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE #endif DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS - DO JM=0,G_NLOEN_MAX/2 + DO JM=0,ILOEN_MAX/2 IGLG = OFFSET_VAR+KGL-1 ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have ! to fill those floor(NLON/2)+1 values. @@ -272,6 +274,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !$OMP END TARGET DATA #endif ! ------------------------------------------------------------------ +END ASSOCIATE END SUBROUTINE FSC END MODULE FSC_MOD diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index b182a7e3a..0b72d6c4b 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -73,11 +73,10 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW ! ------------------------------------------------------------------ - USE TPM_GEN, ONLY: LSYNC_TRANS + USE TPM_GEN, ONLY: LSYNC_TRANS, NCUR_RESOL USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT - USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D_NSTAGT0B, D_NSTAGTF,D_NPTRLS, & - & D_NPNTGTB0, D_NPROCM, D_NDGL_FS, D - USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN + USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D + USE TPM_GEOMETRY, ONLY: G USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE TPM_HICFFT, ONLY: EXECUTE_DIR_FFT USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM @@ -101,6 +100,10 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) & 1_C_SIZE_T, INT(KFIELD*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_COMPLEX(1))) #endif + ASSOCIATE(D_NDGL_FS=>D%NDGL_FS, D_NSTAGT0B=>D%NSTAGT0B, D_NSTAGTF=>D%NSTAGTF, & + & D_NPTRLS=>D%NPTRLS, D_NPNTGTB0=>D%NPNTGTB0, D_NPROCM=>D%NPROCM, & + & G_NMEN=>G%NMEN, G_NLOEN=>G%NLOEN) + #ifdef ACCGPU !$ACC DATA PRESENT(PREEL_REAL, PREEL_COMPLEX, & !$ACC& D_NSTAGTF,D_NSTAGT0B,D_NPTRLS,D_NPROCM,D_NPNTGTB0,G_NMEN,G_NLOEN) @@ -134,6 +137,7 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) #endif NULLIFY(PREEL_REAL) + END ASSOCIATE ! ------------------------------------------------------------------ END SUBROUTINE FTDIR diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index b6bb0a112..f513cceea 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -72,13 +72,13 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) ! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW ! ------------------------------------------------------------------ - USE TPM_GEN, ONLY: LSYNC_TRANS - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT - USE TPM_DISTR, ONLY: MYSETW, D_NPTRLS, D_NDGL_FS, D_NSTAGTF, D - USE TPM_GEOMETRY, ONLY: G_NLOEN - USE TPM_HICFFT, ONLY: EXECUTE_INV_FFT - USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM - USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX + USE TPM_GEN, ONLY: LSYNC_TRANS, NCUR_RESOL + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE TPM_DISTR, ONLY: MYSETW, D + USE TPM_GEOMETRY, ONLY: G + USE TPM_HICFFT, ONLY: EXECUTE_INV_FFT + USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM + USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE BUFFERED_ALLOCATOR_MOD, ONLY: ASSIGN_PTR, GET_ALLOCATION USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -92,6 +92,8 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) INTEGER(KIND=JPIM) :: KGL + ASSOCIATE(D_NDGL_FS=>D%NDGL_FS, D_NPTRLS=>D%NPTRLS, D_NSTAGTF=>D%NSTAGTF, G_NLOEN=>G%NLOEN) + #ifdef IN_PLACE_FFT PREEL_REAL => PREEL_COMPLEX #else @@ -130,6 +132,8 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) NULLIFY(PREEL_COMPLEX) + END ASSOCIATE + ! ------------------------------------------------------------------ END SUBROUTINE FTINV END MODULE FTINV_MOD diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index e12f89afb..19482dae6 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -23,7 +23,7 @@ MODULE LEDIR_MOD SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) USE TPM_DIM, ONLY: R - USE TPM_DISTR, ONLY: D, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 + USE TPM_DISTR, ONLY: D IMPLICIT NONE @@ -34,6 +34,8 @@ SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE + ASSOCIATE(D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, D_OFFSETS_GEMM2=>D%OFFSETS_GEMM2) + IF (PRESENT(IOUT_STRIDES0)) & IOUT_STRIDES0 = ALIGN(2*KF_FS,A) IF (PRESENT(IOUT_SIZE)) & @@ -50,6 +52,8 @@ SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IIN0_STRIDES0 = ALIGN(KF_FS,A) IF (PRESENT(IIN0_SIZE)) & IIN0_SIZE = IIN0_STRIDES0 * ALIGN(R%NDGNH,A) + + END ASSOCIATE END SUBROUTINE SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) @@ -94,12 +98,12 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ - USE TPM_GEN, ONLY: LSYNC_TRANS, NOUT + USE TPM_GEN, ONLY: LSYNC_TRANS, NOUT, NCUR_RESOL USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE TPM_DIM, ONLY: R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL - USE TPM_GEOMETRY, ONLY: G_NDGLU - USE TPM_FIELDS_FLAT, ONLY: ZAA,ZAS,ZAA0,ZAS0,KMLOC0 - USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 + USE TPM_DIM, ONLY: R + USE TPM_GEOMETRY, ONLY: G + USE TPM_FIELDS_GPU, ONLY: FG + USE TPM_DISTR, ONLY: D USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED_OVERLOAD, & & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM @@ -126,8 +130,8 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) ! LOCAL VARIABLES INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC - INTEGER(KIND=JPIM) :: IA, IS, ISL, J - INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + INTEGER(KIND=JPIM) :: IA, IS, ISL, J, IMLOC0(1) + INTEGER(KIND=JPIM) :: KS(D%NUMP), NS(D%NUMP), AOFFSETS(D%NUMP), BOFFSETS(D%NUMP), COFFSETS(D%NUMP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2 @@ -140,6 +144,10 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS + ASSOCIATE(D_NUMP=>D%NUMP, R_NSMAX=>R%NSMAX, R_NTMAX=>R%NTMAX, G_NDGLU=>G%NDGLU, & + & D_MYMS=>D%MYMS, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, & + & D_OFFSETS_GEMM2=>D%OFFSETS_GEMM2, & + & ZAA=>FG%ZAA, ZAS=>FG%ZAS, ZAA0=>FG%ZAA0, ZAS0=>FG%ZAS0) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& @@ -164,7 +172,9 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) ENDIF CALL GSTATS(414,0) - IF(KMLOC0 > 0) THEN + ! anti-symmetric + IMLOC0 = FINDLOC(D_MYMS,0) + IF(IMLOC0(1) > 0) THEN ! compute m=0 in double precision: #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAA0,ZINPA0,ZOUT0) @@ -199,9 +209,9 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO - IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 + IF(IMLOC0(1) > 0) THEN + NS(IMLOC0(1)) = 0 + KS(IMLOC0(1)) = 0 ENDIF #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAA,ZINPA,ZOUT) @@ -278,7 +288,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) ENDIF CALL GSTATS(414,0) - IF(KMLOC0 > 0) THEN + IF(IMLOC0(1) > 0) THEN #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAS0,ZINPS0,ZOUT0) #endif @@ -314,9 +324,9 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO - IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 + IF(IMLOC0(1) > 0) THEN + NS(IMLOC0(1)) = 0 + KS(IMLOC0(1)) = 0 ENDIF #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAS,ZINPS,ZOUT) @@ -390,5 +400,6 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ + END ASSOCIATE END SUBROUTINE LEDIR END MODULE LEDIR_MOD diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 70a729ac0..b678f9dfd 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -24,7 +24,7 @@ MODULE LEINV_MOD SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) USE TPM_DIM, ONLY: R - USE TPM_DISTR, ONLY: D, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 + USE TPM_DISTR, ONLY: D IMPLICIT NONE @@ -35,6 +35,8 @@ SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE + ASSOCIATE(D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, D_OFFSETS_GEMM2=>D%OFFSETS_GEMM2) + IF (PRESENT(IOUT_STRIDES0)) & IOUT0_STRIDES0 = ALIGN(KF_LEG,A) @@ -52,6 +54,8 @@ SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IIN0_STRIDES0 = ALIGN(KF_LEG,A) IF (PRESENT(IIN0_SIZE)) & IIN0_SIZE = IIN0_STRIDES0 * ALIGN(MAX((R%NTMAX+2)/2,(R%NTMAX+3)/2),A) + + END ASSOCIATE END SUBROUTINE LEINV_STRIDES SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) @@ -93,12 +97,12 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ - USE TPM_GEN, ONLY: LSYNC_TRANS, NOUT + USE TPM_GEN, ONLY: LSYNC_TRANS, NOUT, NCUR_RESOL USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE TPM_DIM, ONLY: R_NDGNH, R_NSMAX, R_NDGL - USE TPM_GEOMETRY, ONLY: G_NDGLU - USE TPM_FIELDS_FLAT, ONLY: ZAA, ZAS, ZAA0, ZAS0, KMLOC0 - USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, MYPROC, D_OFFSETS_GEMM1, D_OFFSETS_GEMM2 + USE TPM_DIM, ONLY: R + USE TPM_GEOMETRY, ONLY: G + USE TPM_FIELDS_GPU, ONLY: FG + USE TPM_DISTR, ONLY: D USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED_OVERLOAD, & & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT @@ -119,8 +123,8 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR ! LOCAL - INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) - INTEGER(KIND=JPIM) :: KM, KMLOC, IA, IS, ISL, J1, JGL, JK, J + INTEGER(KIND=JPIM) :: KS(D%NUMP), NS(D%NUMP), AOFFSETS(D%NUMP), BOFFSETS(D%NUMP), COFFSETS(D%NUMP) + INTEGER(KIND=JPIM) :: KM, KMLOC, IA, IS, ISL, J1, JGL, JK, J, IMLOC0(1) INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE @@ -128,6 +132,9 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + ASSOCIATE(D_NUMP=>D%NUMP, R_NSMAX=>R%NSMAX, G_NDGLU=>G%NDGLU, D_MYMS=>D%MYMS, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1,& + D_OFFSETS_GEMM2=>D%OFFSETS_GEMM2, & + ZAA=>FG%ZAA, ZAS=>FG%ZAS, ZAA0=>FG%ZAA0, ZAS0=>FG%ZAS0) !* 1.1 PREPARATIONS. IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) @@ -220,7 +227,8 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ENDIF CALL GSTATS(424,0) - IF (KMLOC0 > 0) THEN + IMLOC0 = FINDLOC(D_MYMS,0) + IF (IMLOC0(1) > 0) THEN ! compute m=0 in double precision #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAA0,ZINP0,ZOUTA0) @@ -255,9 +263,9 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO - IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 + IF(IMLOC0(1) > 0) THEN + NS(IMLOC0(1)) = 0 + KS(IMLOC0(1)) = 0 ENDIF #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAA,ZINP,ZOUTA) @@ -357,7 +365,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ENDIF CALL GSTATS(424,0) - IF (KMLOC0 > 0) THEN + IF (IMLOC0(1) > 0) THEN #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAS0,ZINP0,ZOUTS0) #endif @@ -389,9 +397,9 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO - IF(KMLOC0 > 0) THEN - NS(KMLOC0) = 0 - KS(KMLOC0) = 0 + IF(IMLOC0(1) > 0) THEN + NS(IMLOC0(1)) = 0 + KS(IMLOC0(1)) = 0 ENDIF #ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(ZAS,ZINP,ZOUTS) @@ -435,5 +443,6 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ + END ASSOCIATE END SUBROUTINE LEINV END MODULE LEINV_MOD diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 11ed079a5..b95eeeabc 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -113,8 +113,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& USE SPNSDE_MOD, ONLY: SPNSDE USE LEINV_MOD, ONLY: LEINV_STRIDES, LEINV USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS - USE TPM_FIELDS, ONLY: F - USE TPM_FIELDS_FLAT, ONLY: ZEPSNM + USE TPM_FIELDS_GPU, ONLY: FG USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM USE TPM_GEN, ONLY: LSYNC_TRANS USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX @@ -204,6 +203,8 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& REAL(KIND=JPRBT), POINTER :: ZINP(:) REAL(KIND=JPRD), POINTER :: ZINP0(:) + ASSOCIATE(ZEPSNM=>FG%ZEPSNM) + ! ------------------------------------------------------------------ !* 1. PERFORM LEGENDRE TRANFORM. @@ -402,6 +403,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& CALL LEINV(ALLOCATOR,PIA(2*(IF_READIN-IF_LEG)+1:IF_READIN,:,:),ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,IF_LEG) IF (LHOOK) CALL DR_HOOK('LTINV_MOD',1,ZHOOK_HANDLE) + END ASSOCIATE ! ------------------------------------------------------------------ END SUBROUTINE LTINV END MODULE LTINV_MOD diff --git a/src/trans/gpu/internal/prepsnm_mod.F90 b/src/trans/gpu/internal/prepsnm_mod.F90 index 71841e4e1..351e693e2 100755 --- a/src/trans/gpu/internal/prepsnm_mod.F90 +++ b/src/trans/gpu/internal/prepsnm_mod.F90 @@ -53,7 +53,7 @@ SUBROUTINE PREPSNM USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DIM, ONLY: R USE TPM_FIELDS, ONLY: F - USE TPM_FIELDS_FLAT, ONLY: ZEPSNM + USE TPM_FIELDS_GPU, ONLY: FG USE TPM_DISTR, ONLY: D ! @@ -85,12 +85,12 @@ SUBROUTINE PREPSNM !$ACC loop #endif DO JN=0,KM-1 - ZEPSNM(KMLOC,JN) = 0.0_JPRBT + FG%ZEPSNM(KMLOC,JN) = 0.0_JPRBT ENDDO ENDIF DO JN=KM,R%NTMAX+2 - ZEPSNM(KMLOC,JN) =REAL(F%REPSNM(D%NPMT(KM)+KMLOC-KM+JN),JPRBT) + FG%ZEPSNM(KMLOC,JN) = REAL(F%REPSNM(D%NPMT(KM)+KMLOC-KM+JN),JPRBT) ENDDO ! end loop over wavenumber ENDDO diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index 00a2985b3..23ddd21b8 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -14,8 +14,8 @@ MODULE PRFI1B_MOD SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) USE PARKIND1, ONLY: JPIM, JPRB - USE TPM_DIM, ONLY: R, R_NSMAX - USE TPM_DISTR, ONLY: D, D_NUMP, D_MYMS, D_NASM0 + USE TPM_DIM, ONLY: R + USE TPM_DISTR, ONLY: D USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS !**** *PRFI1* - Prepare spectral fields for inverse Legendre transform @@ -77,6 +77,7 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) !* 1. EXTRACT FIELDS FROM SPECTRAL ARRAYS. ! -------------------------------------------------- + ASSOCIATE(D_NUMP=>D%NUMP, D_MYMS=>D%MYMS, D_NASM0=>D%NASM0, R_NSMAX=>R%NSMAX) #ifdef ACCGPU !$ACC DATA & @@ -188,6 +189,7 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) #endif ! ------------------------------------------------------------------ + END ASSOCIATE END SUBROUTINE PRFI1B END MODULE PRFI1B_MOD diff --git a/src/trans/gpu/internal/set_resol_mod.F90 b/src/trans/gpu/internal/set_resol_mod.F90 index c7367bdd6..afd7cbcad 100755 --- a/src/trans/gpu/internal/set_resol_mod.F90 +++ b/src/trans/gpu/internal/set_resol_mod.F90 @@ -18,6 +18,7 @@ SUBROUTINE SET_RESOL(KRESOL,LDSETUP) USE TPM_DISTR, ONLY: D, DISTR_RESOL USE TPM_GEOMETRY, ONLY: G, GEOM_RESOL USE TPM_FIELDS, ONLY: F, FIELDS_RESOL +USE TPM_FIELDS_GPU, ONLY: FG, FIELDS_GPU_RESOL USE TPM_FLT, ONLY: S, FLT_RESOL USE TPM_CTL, ONLY: C, CTL_RESOL USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS @@ -57,6 +58,7 @@ SUBROUTINE SET_RESOL(KRESOL,LDSETUP) NCUR_RESOL = IRESOL R => DIM_RESOL(NCUR_RESOL) F => FIELDS_RESOL(NCUR_RESOL) + FG => FIELDS_GPU_RESOL(NCUR_RESOL) G => GEOM_RESOL(NCUR_RESOL) D => DISTR_RESOL(NCUR_RESOL) S => FLT_RESOL(NCUR_RESOL) diff --git a/src/trans/gpu/internal/spnsde_mod.F90 b/src/trans/gpu/internal/spnsde_mod.F90 index 10acc5dc1..eb0ae2470 100755 --- a/src/trans/gpu/internal/spnsde_mod.F90 +++ b/src/trans/gpu/internal/spnsde_mod.F90 @@ -14,9 +14,9 @@ MODULE SPNSDE_MOD SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT -USE TPM_DIM, ONLY: R, R_NTMAX -USE TPM_DISTR, ONLY: D, D_MYMS, D_NUMP -USE TPM_FIELDS_FLAT, ONLY: ZEPSNM +USE TPM_DIM, ONLY: R +USE TPM_DISTR, ONLY: D +USE TPM_FIELDS_GPU, ONLY: FG !**** *SPNSDE* - Compute North-South derivative in spectral space @@ -82,6 +82,8 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: IJ, ISKIP, J, JN, JI, IR, II +ASSOCIATE(D_NUMP=>D%NUMP, R_NTMAX=>R%NTMAX, D_MYMS=>D%MYMS, ZEPSNM=>FG%ZEPSNM) + #ifdef ACCGPU !$ACC DATA & !$ACC& PRESENT (R_NTMAX, D_MYMS) & @@ -103,7 +105,7 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) #ifdef OMPGPU !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO !! DEFAULT(NONE) PRIVATE(IJ) & - !!$OMP& SHARED(KM,F,ZN,ZEPSNM,KMLOC) + !!$OMP& SHARED(KM,ZN,ZEPSNM,KMLOC) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IR,II,JI) & @@ -142,6 +144,7 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) #endif ! ------------------------------------------------------------------ +END ASSOCIATE END SUBROUTINE SPNSDE END MODULE SPNSDE_MOD diff --git a/src/trans/gpu/internal/tpm_fields_flat.F90 b/src/trans/gpu/internal/tpm_fields_gpu.F90 old mode 100755 new mode 100644 similarity index 62% rename from src/trans/gpu/internal/tpm_fields_flat.F90 rename to src/trans/gpu/internal/tpm_fields_gpu.F90 index 780d0629c..09baef270 --- a/src/trans/gpu/internal/tpm_fields_flat.F90 +++ b/src/trans/gpu/internal/tpm_fields_gpu.F90 @@ -1,6 +1,6 @@ ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. -! (C) Copyright 2022- NVIDIA. +! (C) Copyright 2024- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 ! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. @@ -9,29 +9,26 @@ ! nor does it submit to any jurisdiction. ! -MODULE TPM_FIELDS_FLAT +MODULE TPM_FIELDS_GPU -USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD +USE EC_PARKIND, ONLY: JPRD, JPRBT IMPLICIT NONE SAVE -! flat copies of the fields defined in TPM_FIELDS -REAL(KIND=JPRD) ,ALLOCATABLE :: F_RW(:) ! Weights of the Gaussian quadrature -REAL(KIND=JPRBT) ,ALLOCATABLE :: F_RLAPIN(:) ! eigen-values of the inverse Laplace operator -REAL(KIND=JPRD) ,ALLOCATABLE :: F_RACTHE(:) ! eigen-values of the inverse Laplace operator - +TYPE FIELDS_GPU_TYPE ! scratch arrays for ltinv and ltdir and associated dimension variables - REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:,:,:) !! JPRL for 1/2 REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:,:,:) !! JPRL for 1/2 ! for m=0 in ledir_mod: REAL(KIND=JPRD),ALLOCATABLE :: ZAA0(:,:) REAL(KIND=JPRD),ALLOCATABLE :: ZAS0(:,:) -INTEGER(KIND=JPIM) :: KMLOC0 - REAL(KIND=JPRBT),ALLOCATABLE :: ZEPSNM(:,:) +END TYPE FIELDS_GPU_TYPE + +TYPE(FIELDS_GPU_TYPE),ALLOCATABLE,TARGET :: FIELDS_GPU_RESOL(:) +TYPE(FIELDS_GPU_TYPE),POINTER :: FG -END MODULE TPM_FIELDS_FLAT +END MODULE TPM_FIELDS_GPU diff --git a/src/trans/gpu/internal/tpm_trans.F90 b/src/trans/gpu/internal/tpm_trans.F90 index 9d1262d10..c1d933b77 100755 --- a/src/trans/gpu/internal/tpm_trans.F90 +++ b/src/trans/gpu/internal/tpm_trans.F90 @@ -49,9 +49,6 @@ MODULE TPM_TRANS !INTEGER_M :: NF_UV_G ! Global version of NF_UV (grid-point space) !INTEGER_M :: NF_SCALARS_G ! Global version of NF_SCALARS (grid-point space) -REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF_IN(:) ! Fourier buffer -REAL(KIND=JPRBT), ALLOCATABLE :: FOUBUF(:) ! Fourier buffer - INTEGER(KIND=JPIM) :: NPROMA ! Blocking factor for gridpoint input/output INTEGER(KIND=JPIM) :: NGPBLKS ! Number of NPROMA blocks diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index d77da0f80..0a157a991 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -71,9 +71,9 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT - USE TPM_DISTR, ONLY: D, MYSETW, D_NSTAGTF, D_NPNTGTB0, D_NPTRLS, D_NDGL_FS - USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN - USE TPM_DIM, ONLY: R_NSMAX + USE TPM_DISTR, ONLY: D, MYSETW + USE TPM_GEOMETRY, ONLY: G + USE TPM_DIM, ONLY: R USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF ! @@ -89,6 +89,9 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) REAL(KIND=JPRBT) :: SCAL + ASSOCIATE(D_NSTAGTF=>D%NSTAGTF, D_NPNTGTB0=>D%NPNTGTB0, D_NPTRLS=>D%NPTRLS, & + & D_NDGL_FS=>D%NDGL_FS, G_NMEN=>G%NMEN, G_NLOEN=>G%NLOEN, R_NSMAX=>R%NSMAX) + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),& & 1_C_SIZE_T, INT(D%NLENGT0B*KF_FS*2,KIND=C_SIZE_T)*C_SIZEOF(FOUBUF_IN(1))) @@ -131,6 +134,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) !$ACC WAIT(1) #endif + END ASSOCIATE END SUBROUTINE TRLTOM_PACK FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) @@ -166,11 +170,11 @@ END FUNCTION PREPARE_TRLTOM_UNPACK SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD - USE TPM_DIM, ONLY: R_NDGNH, R_NDGL - USE TPM_GEOMETRY, ONLY: G_NDGLU + USE TPM_DIM, ONLY: R + USE TPM_GEOMETRY, ONLY: G USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION - USE TPM_FIELDS_FLAT, ONLY: F_RW, F_RACTHE - USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, D_NPNTGTB1, D_OFFSETS_GEMM1 + USE TPM_FIELDS, ONLY: F + USE TPM_DISTR, ONLY: D USE LEDIR_MOD, ONLY: LEDIR_STRIDES USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -195,6 +199,9 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP REAL(KIND=JPRBT) :: PAIA, PAIS + ASSOCIATE(D_NUMP=>D%NUMP, R_NDGNH=>R%NDGNH, R_NDGL=>R%NDGL, F_RW=>F%RW, F_RACTHE=>F%RACTHE, & + & D_MYMS=>D%MYMS, D_NPNTGTB1=>D%NPNTGTB1, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, & + & G_NDGLU=>G%NDGLU) CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,& IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) @@ -290,6 +297,7 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP !$ACC END DATA #endif + END ASSOCIATE END SUBROUTINE TRLTOM_UNPACK END MODULE TRLTOM_PACK_UNPACK diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index 4468af603..00d4b6408 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -86,9 +86,9 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPRD USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE TPM_DIM, ONLY: R_NDGNH, R_NDGL - USE TPM_GEOMETRY, ONLY: G_NDGLU - USE TPM_DISTR, ONLY: D, D_NUMP, D_MYMS, D_NPNTGTB1, D_OFFSETS_GEMM1 + USE TPM_DIM, ONLY: R + USE TPM_GEOMETRY, ONLY: G + USE TPM_DISTR, ONLY: D USE LEINV_MOD, ONLY: LEINV_STRIDES USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF @@ -113,6 +113,9 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + ASSOCIATE(D_NUMP=>D%NUMP, R_NDGNH=>R%NDGNH, R_NDGL=>R%NDGL, G_NDGLU=>G%NDGLU, & + & D_MYMS=>D%MYMS, D_NPNTGTB1=>D%NPNTGTB1, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1) + IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',0,ZHOOK_HANDLE) CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& @@ -174,6 +177,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',1,ZHOOK_HANDLE) + END ASSOCIATE END SUBROUTINE TRMTOL_PACK FUNCTION PREPARE_TRMTOL_UNPACK(ALLOCATOR,KF_FS) RESULT(HTRMTOL_UNPACK) @@ -224,8 +228,8 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN ! ------------------------------------------------------------------ USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT -USE TPM_DISTR, ONLY: D, MYSETW, D_NSTAGTF, D_NPNTGTB0, D_NPTRLS, D_NDGL_FS -USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN, G_NLOEN_MAX +USE TPM_DISTR, ONLY: D, MYSETW +USE TPM_GEOMETRY, ONLY: G USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF ! @@ -238,9 +242,12 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(TRMTOL_UNPACK_HANDLE), INTENT(IN) :: HTRMTOL_UNPACK -INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL +INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL,ILOEN_MAX REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX +ASSOCIATE(D_NDGL_FS=>D%NDGL_FS, D_NSTAGTF=>D%NSTAGTF, D_NPNTGTB0=>D%NPNTGTB0, D_NPTRLS=>D%NPTRLS, & + & G_NLOEN=>G%NLOEN, G_NMEN=>G%NMEN) + CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HREEL),& & 1_C_SIZE_T, INT(KF_TOTAL*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_COMPLEX(1))) @@ -251,16 +258,17 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN #endif OFFSET_VAR=D_NPTRLS(MYSETW) +ILOEN_MAX=MAXVAL(G_NLOEN) #ifdef OMPGPU #endif #ifdef ACCGPU !$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,RET_REAL,RET_COMPLEX) FIRSTPRIVATE(KF_CURRENT,& -!$ACC& KF_TOTAL,OFFSET_VAR,G_NLOEN_MAX) DEFAULT(NONE) & +!$ACC& KF_TOTAL,OFFSET_VAR) DEFAULT(NONE) & !$ACC& ASYNC(1) TILE(32,16,1) #endif DO KGL=1,D_NDGL_FS DO JF=1,KF_CURRENT - DO JM=0,G_NLOEN_MAX/2 + DO JM=0,ILOEN_MAX/2 IGLG = OFFSET_VAR+KGL-1 ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have @@ -290,6 +298,8 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN !$ACC WAIT(1) #endif +END ASSOCIATE + END SUBROUTINE TRMTOL_UNPACK END MODULE TRMTOL_PACK_UNPACK diff --git a/src/trans/gpu/internal/updspb_mod.F90 b/src/trans/gpu/internal/updspb_mod.F90 index 0f59694ed..9daf794e4 100755 --- a/src/trans/gpu/internal/updspb_mod.F90 +++ b/src/trans/gpu/internal/updspb_mod.F90 @@ -57,8 +57,8 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) ! ------------------------------------------------------------------ USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT - USE TPM_DIM, ONLY: R_NTMAX - USE TPM_DISTR, ONLY: D_NUMP, D_MYMS, D_NASM0 + USE TPM_DIM, ONLY: R + USE TPM_DISTR, ONLY: D ! IMPLICIT NONE @@ -85,6 +85,7 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) ! and nn=NTMAX+2-n from NTMAX+2-m to NTMAX+2-NSMAX. ! NLTN(m)=NTMAX+2-m : n=NLTN(nn),nn=NLTN(n) ! nn is the loop index. + ASSOCIATE(D_NUMP=>D%NUMP, D_MYMS=>D%MYMS, D_NASM0=>D%NASM0, R_NTMAX=>R%NTMAX) IF(PRESENT(KFLDPTR)) THEN stop 'Error: code path not (yet) supported in GPU version' @@ -135,6 +136,7 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) !$ACC END DATA #endif +END ASSOCIATE ! ------------------------------------------------------------------ END SUBROUTINE UPDSPB diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index 36646f8e9..a920bb533 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -59,9 +59,9 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) ! ------------------------------------------------------------------ USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT -USE TPM_DIM, ONLY: R, R_NTMAX -USE TPM_DISTR, ONLY: D, D_NUMP, D_MYMS -USE TPM_FIELDS_FLAT, ONLY: ZEPSNM +USE TPM_DIM, ONLY: R +USE TPM_DISTR, ONLY: D +USE TPM_FIELDS_GPU, ONLY: FG ! IMPLICIT NONE @@ -79,6 +79,7 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) REAL(KIND=JPRBT) :: ZKM,ZJN ! ------------------------------------------------------------------ +ASSOCIATE(D_NUMP=>D%NUMP, R_NTMAX=>R%NTMAX, D_MYMS=>D%MYMS, ZEPSNM=>FG%ZEPSNM) !* 1. COMPUTE U V FROM VORTICITY AND DIVERGENCE. ! ------------------------------------------ @@ -170,6 +171,7 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) !$ACC END DATA #endif ! ------------------------------------------------------------------ +END ASSOCIATE END SUBROUTINE UVTVD END MODULE UVTVD_MOD diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index 68e8c47c8..eeb687f11 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -14,10 +14,9 @@ MODULE VDTUV_MOD SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT -USE TPM_DIM, ONLY: R, R_NTMAX +USE TPM_DIM, ONLY: R USE TPM_FIELDS, ONLY: F -USE TPM_FIELDS_FLAT, ONLY: F_RLAPIN -USE TPM_DISTR, ONLY: D, D_NUMP, D_MYMS +USE TPM_DISTR, ONLY: D !**** *VDTUV* - Compute U,V in spectral space @@ -86,6 +85,8 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) ! LOCAL REAL SCALARS REAL(KIND=JPRBT) :: ZKM +ASSOCIATE(D_NUMP=>D%NUMP, D_MYMS=>D%MYMS, R_NTMAX=>R%NTMAX, F_RLAPIN=>F%RLAPIN) + #ifdef ACCGPU !$ACC DATA & !$ACC& PRESENT(R_NTMAX,D_MYMS,D_NUMP,F_RLAPIN) & @@ -157,6 +158,7 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) !$ACC END DATA #endif ! ------------------------------------------------------------------ +END ASSOCIATE END SUBROUTINE VDTUV END MODULE VDTUV_MOD diff --git a/src/trans/sedrenames.txt b/src/trans/sedrenames.txt index 49cb51484..e999ecfca 100644 --- a/src/trans/sedrenames.txt +++ b/src/trans/sedrenames.txt @@ -110,7 +110,7 @@ s/\ SULEG_MOD/\ SULEG_MOD_VARIANTDESIGNATOR/g s/SUTRLE_MOD/SUTRLE_MOD_VARIANTDESIGNATOR/g s/TPM_FFTW/TPM_FFTW_VARIANTDESIGNATOR/g s/TPM_FFT/TPM_FFT_VARIANTDESIGNATOR/g -s/TPM_FIELDS_FLAT/TPM_FIELDS_FLAT_VARIANTDESIGNATOR/g +s/TPM_FIELDS_GPU/TPM_FIELDS_GPU_VARIANTDESIGNATOR/g s/TPM_FLT/TPM_FLT_VARIANTDESIGNATOR/g s/TPM_TRANS/TPM_TRANS_VARIANTDESIGNATOR/g s/trans_end( *($|\(| |\*|\.h))/trans_end_VARIANTDESIGNATOR\1/g From a7681ae824ecd85fa3cd95ec81a625cb95c3101c Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 3 Oct 2024 05:06:28 -0700 Subject: [PATCH 02/10] add cleanup functionality --- src/trans/gpu/algor/growing_allocator_mod.F90 | 28 +- src/trans/gpu/algor/hicblas_cutlass.cuda.h | 18 +- src/trans/gpu/algor/hicblas_gemm.hip.cpp | 348 ++++++++++-------- src/trans/gpu/algor/hicblas_mod.F90 | 25 +- src/trans/gpu/algor/hicfft.hip.cpp | 143 ++++--- src/trans/gpu/external/trans_end.F90 | 3 + src/trans/gpu/internal/dealloc_resol_mod.F90 | 5 + src/trans/gpu/internal/ftdir_mod.F90 | 2 +- src/trans/gpu/internal/ftinv_mod.F90 | 2 +- src/trans/gpu/internal/ledir_mod.F90 | 4 +- src/trans/gpu/internal/leinv_mod.F90 | 4 +- src/trans/gpu/internal/tpm_hicfft.F90 | 40 +- 12 files changed, 383 insertions(+), 239 deletions(-) diff --git a/src/trans/gpu/algor/growing_allocator_mod.F90 b/src/trans/gpu/algor/growing_allocator_mod.F90 index f8de0fc90..7e7919b47 100644 --- a/src/trans/gpu/algor/growing_allocator_mod.F90 +++ b/src/trans/gpu/algor/growing_allocator_mod.F90 @@ -5,6 +5,7 @@ MODULE GROWING_ALLOCATOR_MOD PRIVATE PUBLIC :: GROWING_ALLOCATION_TYPE PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION + PUBLIC :: DESTROY_GROWING_ALLOCATOR ABSTRACT INTERFACE SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C) @@ -32,19 +33,12 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ) USE TPM_GEN, ONLY: NOUT IMPLICIT NONE TYPE(GROWING_ALLOCATION_TYPE), INTENT(INOUT) :: ALLOC - INTEGER(C_SIZE_T) :: SZ - INTEGER :: I + INTEGER(C_SIZE_T), INTENT(IN) :: SZ ! Deallocate existing pointer IF (ASSOCIATED(ALLOC%PTR) .AND. SZ > SIZE(ALLOC%PTR, 1, C_SIZE_T)) THEN WRITE(NOUT,*) "WARNING: REALLOCATING GROWING POINTER CAUSING GRAPH REINSTANTIATION" - DO I = 1, ALLOC%FREE_FUNCS_SZ - CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, & - SIZE(ALLOC%PTR, 1, C_SIZE_T)) - ENDDO - !$ACC EXIT DATA DELETE(ALLOC%PTR) - DEALLOCATE(ALLOC%PTR) - NULLIFY(ALLOC%PTR) + CALL DESTROY_GROWING_ALLOCATOR(ALLOC) ENDIF IF (.NOT. ASSOCIATED(ALLOC%PTR)) THEN @@ -89,4 +83,20 @@ SUBROUTINE REGISTER_FREE_C(ALLOC_C, FREE_FUNC_C) BIND(C, NAME="growing_allocator END SUBROUTINE + SUBROUTINE DESTROY_GROWING_ALLOCATOR(ALLOC) + USE ISO_C_BINDING, ONLY: C_SIZE_T + IMPLICIT NONE + TYPE(GROWING_ALLOCATION_TYPE) :: ALLOC + INTEGER :: I + IF (ALLOCATED(ALLOC%PTR)) THEN + DO I = 1, ALLOC%FREE_FUNCS_SZ + CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, & + SIZE(ALLOC%PTR, 1, C_SIZE_T)) + ENDDO + !$ACC EXIT DATA DELETE(ALLOC%PTR) + DEALLOCATE(ALLOC%PTR) + NULLIFY(ALLOC%PTR) + ENDIF + END SUBROUTINE + END MODULE diff --git a/src/trans/gpu/algor/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/hicblas_cutlass.cuda.h index 7a842a808..a9148e93e 100644 --- a/src/trans/gpu/algor/hicblas_cutlass.cuda.h +++ b/src/trans/gpu/algor/hicblas_cutlass.cuda.h @@ -72,6 +72,7 @@ class cutlass_sgemm_grouped { static constexpr int sz_align = 8; public: + using real_type = float; void operator()(cudaStream_t stream, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) const { @@ -129,6 +130,7 @@ class cutlass_sgemm_grouped { static constexpr int sz_align = 1; public: + using real_type = float; void operator()(cudaStream_t stream, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) const { @@ -149,7 +151,7 @@ class cutlass_sgemm_grouped { } // namespace detail template -void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, +void cutlass_sgemm_wrapper_grouped_op(int resol_id, int blas_id, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, float *C, @@ -165,18 +167,18 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, if (capability_major >= 8 && use_3xtf32) run_group_graph(cutlass_sgemm_grouped(), - m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, blas_id, growing_allocator); else run_group_graph(cutlass_sgemm_grouped(), - m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, blas_id, growing_allocator); } -void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb, +void cutlass_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, const float *A, int lda, int *offsetsA, const float *B, int ldb, int *offsetsB, float beta, @@ -186,19 +188,19 @@ void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb, if (transa == 'N' && transb == 'N') cutlass_sgemm_wrapper_grouped_op( - blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, growing_allocator); else if (transa == 'N' && transb == 'T') cutlass_sgemm_wrapper_grouped_op( - blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, growing_allocator); else if (transa == 'T' && transb == 'N') cutlass_sgemm_wrapper_grouped_op( - blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, growing_allocator); else if (transa == 'T' && transb == 'T') cutlass_sgemm_wrapper_grouped_op( - blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, + resol_id, blas_id, m, n, k, alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream, growing_allocator); else assert(false); diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp index 9d6178bed..73d3100fb 100644 --- a/src/trans/gpu/algor/hicblas_gemm.hip.cpp +++ b/src/trans/gpu/algor/hicblas_gemm.hip.cpp @@ -21,63 +21,88 @@ #include "growing_allocator.h" +bool hip_alreadyAllocated_sgemm = false; +bool hip_alreadyAllocated_sgemm_handle = false; -bool hip_alreadyAllocated_sgemm=false; -bool hip_alreadyAllocated_sgemm_handle=false; - -bool hip_alreadyAllocated_dsgemm=false; -bool hip_alreadyAllocated_dgemm_handle=false; +bool hip_alreadyAllocated_dsgemm = false; +bool hip_alreadyAllocated_dgemm_handle = false; hipblasHandle_t handle_hip_sgemm; hipblasHandle_t handle_hip_dgemm; - namespace { -namespace detail { -struct pair_hash { - std::size_t operator()(const std::pair &p) const { - return p.first * 10000 + p.second; +struct cache_key { + int resol_id; + int m; + int blas_id; + + bool operator==(const cache_key &other) const { + return resol_id == other.resol_id && m == other.m && + blas_id == other.blas_id; + } + cache_key(int resol_id_, int m_, int blas_id_) + : resol_id(resol_id_), m(m_), blas_id(blas_id_) {} +}; +} // namespace +template <> struct std::hash { + std::size_t operator()(const cache_key &k) const { + return k.blas_id * 1000000 + k.resol_id * 10000 + k.m; } }; -} // namespace detail -template auto &get_graph_cache() { - // we store at most one graph per "m" (# fields) and "blas id" - static std::unordered_map, hipGraphExec_t, - detail::pair_hash> +namespace { +template auto &get_graph_cache() { + // we store at most one graph per "m" (# fields) and "blas id" and resolution + static std::unordered_map> graphCache; return graphCache; } -template auto &get_ptr_cache() { +template auto &get_ptr_cache() { + using real_t = typename Gemm::real_type; static std::unordered_map< - std::pair, std::tuple, - detail::pair_hash> + cache_key, std::tuple> ptrCache; return ptrCache; } -template void free_gemm_cache(float *, size_t) { - get_graph_cache().clear(); - get_ptr_cache().clear(); +template void free_gemm_graph_cache(float *, size_t) { + get_graph_cache().clear(); + get_ptr_cache().clear(); +} +template +void erase_resol_from_cache(Cache &cache, int resol_id) { + // Note that in C++20 this could also be std::erase_if + int erased = 0; + for (auto it = cache.begin(); it != cache.end();) { + if (it->first.resol_id == resol_id) { + it = cache.erase(it); + ++erased; + } else + ++it; + } +} +template void erase_from_caches(int resol_id) { + erase_resol_from_cache(get_graph_cache(), resol_id); + erase_resol_from_cache(get_ptr_cache(), resol_id); } // this version is using graphs and caches the graphs template -void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, - const Real *A, int lda, int *offsetsA, const Real *B, - int ldb, int *offsetsB, Real beta, Real *C, int ldc, - int *offsetsC, int batchCount, hipStream_t stream, +void run_group_graph(Gemm &&gemm, int resol_id, int m, int *n, int *k, + Real alpha, const Real *A, int lda, int *offsetsA, + const Real *B, int ldb, int *offsetsB, Real beta, Real *C, + int ldc, int *offsetsC, int batchCount, hipStream_t stream, int blas_id, void *growing_allocator) { growing_allocator_register_free_c(growing_allocator, - free_gemm_cache); + free_gemm_graph_cache); // we store at most one graph per "m" (# fields) and "blas id" - auto &graphCache = get_graph_cache(); + auto &graphCache = get_graph_cache(); // we also store A, B, and C and recreate the graph if they change - auto &ptrCache = get_ptr_cache(); + auto &ptrCache = get_ptr_cache(); - auto key = std::make_pair(m, blas_id); + auto key = cache_key{resol_id, m, blas_id}; auto ptrs = ptrCache.find(key); if (ptrs != ptrCache.end() && @@ -86,13 +111,14 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, // the plan is cached, but the pointers are not correct. we remove and // delete the graph, but we keep the hipblas handles, if this happens more // often, we should cache this... - std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow." << std::endl; + std::cout + << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow." + << std::endl; std::cout << "We have an entry with key {m=" << m << ", blas_id=" << blas_id - << "}\n"; + << ", resol_id=" << resol_id << "}\n"; std::cout << "Pointers: " << std::get<0>(ptrs->second) << ", " << std::get<1>(ptrs->second) << ", " << std::get<2>(ptrs->second) << " vs. " << A << ", " << B << ", " << C << std::endl; - HIC_CHECK(hipGraphExecDestroy(graphCache[key])); graphCache.erase(key); ptrCache.erase(key); } @@ -115,32 +141,36 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, hipGraph_t my_graph; HIC_CHECK(hipStreamEndCapture(stream, &my_graph)); hipGraphNode_t my_node; - HIC_CHECK(hipGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, - my_graph)); + HIC_CHECK( + hipGraphAddChildGraphNode(&my_node, new_graph, nullptr, 0, my_graph)); } hipGraphExec_t instance; HIC_CHECK(hipGraphInstantiate(&instance, new_graph, NULL, NULL, 0)); HIC_CHECK(hipStreamDestroy(stream)); HIC_CHECK(hipGraphDestroy(new_graph)); - graphCache.insert({key, instance}); + graphCache.insert({key, std::shared_ptr( + new hipGraphExec_t{instance}, [](auto ptr) { + HIC_CHECK(hipGraphExecDestroy(*ptr)); + delete ptr; + })}); ptrCache.insert({key, std::make_tuple(A, B, C)}); } - HIC_CHECK(hipGraphLaunch(graphCache.at(key), stream)); + HIC_CHECK(hipGraphLaunch(*graphCache.at(key), stream)); } // stupid simple gemm calls template -void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, - int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB, - Real beta, Real *C, int ldc, int *offsetsC, int batchCount, - hipStream_t stream, int = -1) { +void run_group(Gemm &&gemm, int resol_id, int m, int *n, int *k, Real alpha, + const Real *A, int lda, int *offsetsA, const Real *B, int ldb, + int *offsetsB, Real beta, Real *C, int ldc, int *offsetsC, + int batchCount, hipStream_t stream, int = -1) { for (int i = 0; i < batchCount; ++i) { if (m == 0 || n[i] == 0 || k[i] == 0) continue; - gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb, - beta, C + offsetsC[i], ldc); + gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], + ldb, beta, C + offsetsC[i], ldc); } } @@ -148,7 +178,6 @@ void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, #include "hicblas_cutlass.cuda.h" #endif -namespace detail { hipblasHandle_t get_hipblas_handle() { static hipblasHandle_t handle; if (!handle) @@ -157,6 +186,7 @@ hipblasHandle_t get_hipblas_handle() { } template struct hipblas_gemm_grouped { public: + using real_type = Real; hipblas_gemm_grouped(hipblasOperation_t transa, hipblasOperation_t transb) : transa_(transa), transb_(transb) { // we need to get the hipblas handle here, otherwise this could be created @@ -171,156 +201,166 @@ template struct hipblas_gemm_grouped { if constexpr (std::is_same::value) HICBLAS_CHECK(hipblasSgemm(handle, transa_, transb_, m, n, k, &alpha, A, - lda, B, ldb, &beta, C, ldc)); + lda, B, ldb, &beta, C, ldc)); if constexpr (std::is_same::value) HICBLAS_CHECK(hipblasDgemm(handle, transa_, transb_, m, n, k, &alpha, A, - lda, B, ldb, &beta, C, ldc)); + lda, B, ldb, &beta, C, ldc)); } private: hipblasOperation_t transa_, transb_; }; -} // namespace detail #ifndef USE_CUTLASS -void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, - int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, - float *C, int ldc, int *offsetsC, - int batchCount, hipStream_t stream, - void *growing_allocator) { - - hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; - if (transa=='T' || transa=='t') - op_t1=HIPBLAS_OP_T; - if (transb=='T' || transb=='t') - op_t2=HIPBLAS_OP_T; - - using namespace detail; +void hipblas_sgemm_wrapper_grouped( + int resol_id, int blas_id, char transa, char transb, int m, int *n, int *k, + float alpha, const float *A, int lda, int *offsetsA, const float *B, + int ldb, int *offsetsB, float beta, float *C, int ldc, int *offsetsC, + int batchCount, hipStream_t stream, void *growing_allocator) { + + hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; + if (transa == 'T' || transa == 't') + op_t1 = HIPBLAS_OP_T; + if (transb == 'T' || transb == 't') + op_t2 = HIPBLAS_OP_T; + #ifdef USE_GRAPHS_GEMM - run_group_graph(hipblas_gemm_grouped(op_t1, op_t2), m, n, k, alpha, A, - lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, - batchCount, stream, blas_id, growing_allocator); + run_group_graph(hipblas_gemm_grouped(op_t1, op_t2), resol_id, m, n, k, + alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, + offsetsC, batchCount, stream, blas_id, growing_allocator); #else - run_group(hipblas_gemm_grouped(op_t1, op_t2), m, n, k, alpha, A, - lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, + run_group(hipblas_gemm_grouped(op_t1, op_t2), resol_id, m, n, k, alpha, + A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, stream); #endif } #endif -void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, - int m, int *n, int *k, - double alpha, - const double *A, int lda, int *offsetsA, - const double *B, int ldb, int *offsetsB, - double beta, - double *C, int ldc, int *offsetsC, - int batchCount, hipStream_t stream, void *) { - - hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; - if (transa=='T' || transa=='t') - op_t1=HIPBLAS_OP_T; - if (transb=='T' || transb=='t') - op_t2=HIPBLAS_OP_T; - - using namespace detail; - run_group(hipblas_gemm_grouped(op_t1, op_t2), m, n, k, alpha, - A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, - batchCount, stream, blas_id); +void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa, + char transb, int m, int *n, int *k, + double alpha, const double *A, int lda, + int *offsetsA, const double *B, int ldb, + int *offsetsB, double beta, double *C, + int ldc, int *offsetsC, int batchCount, + hipStream_t stream, void *) { + + hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; + if (transa == 'T' || transa == 't') + op_t1 = HIPBLAS_OP_T; + if (transb == 'T' || transb == 't') + op_t2 = HIPBLAS_OP_T; + + run_group(hipblas_gemm_grouped(op_t1, op_t2), resol_id, m, n, k, + alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, C, ldc, offsetsC, + batchCount, stream, blas_id); } } // namespace extern "C" { -void hipblas_dgemm_wrapper (char transa, char transb, - int m, int n,int k, double alpha, - const double *A, int lda, int tda, - const double *B, int ldb, int tdb, double beta, - double *C, int ldc, int tdc, int batchCount, - size_t stream, - void *growing_allocator) { - - hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; - - if (transa=='T' || transa=='t') - op_t1=HIPBLAS_OP_T; - if (transb=='T' || transb=='t') - op_t2=HIPBLAS_OP_T; - - if (!hip_alreadyAllocated_dgemm_handle){ +void hipblas_dgemm_wrapper(char transa, char transb, int m, int n, int k, + double alpha, const double *A, int lda, int tda, + const double *B, int ldb, int tdb, double beta, + double *C, int ldc, int tdc, int batchCount, + size_t stream, void *growing_allocator) { + + hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; + + if (transa == 'T' || transa == 't') + op_t1 = HIPBLAS_OP_T; + if (transb == 'T' || transb == 't') + op_t2 = HIPBLAS_OP_T; + + if (!hip_alreadyAllocated_dgemm_handle) { hipblasCreate(&handle_hip_dgemm); - hip_alreadyAllocated_dgemm_handle=true; + hip_alreadyAllocated_dgemm_handle = true; } - hipblasHandle_t handle = detail::get_hipblas_handle(); - HICBLAS_CHECK( - hipblasSetStream(handle, *(hipStream_t*)stream)); - - HICBLAS_CHECK(hipblasDgemmStridedBatched(handle,op_t1,op_t2,m,n,k, - &alpha,(const double *) A,lda,tda, (const double *) B,ldb,tdb, - &beta,(double *) C,ldc,tdc,batchCount)); + hipblasHandle_t handle = get_hipblas_handle(); + HICBLAS_CHECK(hipblasSetStream(handle, *(hipStream_t *)stream)); + HICBLAS_CHECK(hipblasDgemmStridedBatched( + handle, op_t1, op_t2, m, n, k, &alpha, (const double *)A, lda, tda, + (const double *)B, ldb, tdb, &beta, (double *)C, ldc, tdc, batchCount)); } -void hipblas_sgemm_wrapper (char transa, char transb, - int m, int n,int k, float alpha, - const float *A, int lda, int tda, - const float *B, int ldb, int tdb, float beta, - float *C, int ldc, int tdc, - int batchCount, - void *growing_allocator) { +void hipblas_sgemm_wrapper(char transa, char transb, int m, int n, int k, + float alpha, const float *A, int lda, int tda, + const float *B, int ldb, int tdb, float beta, + float *C, int ldc, int tdc, int batchCount, + void *growing_allocator) { - hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; + hipblasOperation_t op_t1 = HIPBLAS_OP_N, op_t2 = HIPBLAS_OP_N; - if (transa=='T' || transa=='t') - op_t1=HIPBLAS_OP_T; - if (transb=='T' || transb=='t') - op_t2=HIPBLAS_OP_T; + if (transa == 'T' || transa == 't') + op_t1 = HIPBLAS_OP_T; + if (transb == 'T' || transb == 't') + op_t2 = HIPBLAS_OP_T; - if (!hip_alreadyAllocated_sgemm_handle){ + if (!hip_alreadyAllocated_sgemm_handle) { hipblasCreate(&handle_hip_sgemm); - hip_alreadyAllocated_sgemm_handle=true; + hip_alreadyAllocated_sgemm_handle = true; } - HICBLAS_CHECK(hipblasSgemmStridedBatched(handle_hip_sgemm,op_t1,op_t2,m,n,k, - &alpha,(const float *) A,lda,tda, (const float *) B,ldb,tdb, - &beta,(float*) C,ldc,tdc,batchCount)); - + HICBLAS_CHECK(hipblasSgemmStridedBatched( + handle_hip_sgemm, op_t1, op_t2, m, n, k, &alpha, (const float *)A, lda, + tda, (const float *)B, ldb, tdb, &beta, (float *)C, ldc, tdc, + batchCount)); } -void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, - int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, - float *C, int ldc, int *offsetsC, - int batchCount, size_t stream, - void *growing_allocator) { +void hipblas_sgemm_wrapper_grouped(int resol_id, int blas_id, char transa, + char transb, int m, int *n, int *k, + float alpha, const float *A, int lda, + int *offsetsA, const float *B, int ldb, + int *offsetsB, float beta, float *C, int ldc, + int *offsetsC, int batchCount, size_t stream, + void *growing_allocator) { #ifdef USE_CUTLASS - cutlass_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, - B, ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, - *(hipStream_t*)stream, - growing_allocator); + cutlass_sgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k, + alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, + C, ldc, offsetsC, batchCount, + *(hipStream_t *)stream, growing_allocator); #else - hipblas_sgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, - offsetsA, B, ldb, offsetsB, beta, C, ldc, - offsetsC, batchCount, - *(hipStream_t*)stream, - growing_allocator); + hipblas_sgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k, + alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, + C, ldc, offsetsC, batchCount, + *(hipStream_t *)stream, growing_allocator); #endif } -void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, - int m, int *n, int *k, double alpha, - const double *A, int lda, int *offsetsA, - const double *B, int ldb, int *offsetsB, double beta, - double *C, int ldc, int *offsetsC, - int batchCount, size_t stream, - void *growing_allocator) { - hipblas_dgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, - ldb, offsetsB, beta, C, ldc, offsetsC, batchCount, - *(hipStream_t*)stream, - growing_allocator); +void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa, + char transb, int m, int *n, int *k, + double alpha, const double *A, int lda, + int *offsetsA, const double *B, int ldb, + int *offsetsB, double beta, double *C, + int ldc, int *offsetsC, int batchCount, + size_t stream, void *growing_allocator) { + hipblas_dgemm_wrapper_grouped(resol_id, blas_id, transa, transb, m, n, k, + alpha, A, lda, offsetsA, B, ldb, offsetsB, beta, + C, ldc, offsetsC, batchCount, + *(hipStream_t *)stream, growing_allocator); +} + +void clean_gemm(int resol_id) { + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); +#ifdef USE_CUTLASS + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); + erase_from_caches>(resol_id); +#endif } } diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index 988e1b3ef..fecf93243 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -63,10 +63,17 @@ SUBROUTINE HIP_SGEMM_BATCHED( & TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC END SUBROUTINE HIP_SGEMM_BATCHED END INTERFACE +INTERFACE + PUBLIC CLEAN_GEMM + SUBROUTINE CLEAN_GEMM(RESOL_ID) BIND(C, NAME="clean_gemm") + USE ISO_C_BINDING + INTEGER(KIND=C_INT), INTENT(IN), VALUE :: RESOL_ID + END SUBROUTINE +END INTERFACE INTERFACE SUBROUTINE HIP_DGEMM_GROUPED( & - & BLAS_ID, CTA, CTB, & + & RESOL_ID, BLAS_ID, CTA, CTB, & & M, N, K, & & ALPHA, & & A, LDA, OFFSETA, & @@ -77,7 +84,7 @@ SUBROUTINE HIP_DGEMM_GROUPED( & &) BIND(C, NAME='hipblas_dgemm_wrapper_grouped') USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_DOUBLE, C_SIZE_T, C_PTR CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT + INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_DOUBLE), VALUE :: ALPHA,BETA REAL(C_DOUBLE) :: A(*), B(*), C(*) @@ -86,7 +93,7 @@ SUBROUTINE HIP_DGEMM_GROUPED( & END SUBROUTINE HIP_DGEMM_GROUPED SUBROUTINE HIP_SGEMM_GROUPED( & - & BLAS_ID, CTA, CTB, & + & RESOL_ID, BLAS_ID, CTA, CTB, & & M, N, K, & & ALPHA, & & A, LDA, OFFSETA, & @@ -97,7 +104,7 @@ SUBROUTINE HIP_SGEMM_GROUPED( & &) BIND(C, NAME='hipblas_sgemm_wrapper_grouped') USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_FLOAT, C_SIZE_T, C_PTR CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT + INTEGER(C_INT), VALUE :: RESOL_ID, BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_FLOAT), VALUE :: ALPHA,BETA REAL(C_FLOAT) :: A(*), B(*), C(*) @@ -203,7 +210,7 @@ SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( & END SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & - & BLAS_ID, TRANSA, TRANSB, & + & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & @@ -212,6 +219,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & & CARRAY, LDC, OFFSETC, & & BATCHCOUNT, STREAM, ALLOC) USE ISO_C_BINDING, ONLY: C_INT, C_CHAR, C_LONG, C_LOC + INTEGER(KIND=C_INT), INTENT(IN) :: RESOL_ID INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB INTEGER(KIND=JPIM) :: M @@ -237,7 +245,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) CALL HIP_DGEMM_GROUPED( & - & BLAS_ID, TRANSA, TRANSB, & + & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & @@ -249,7 +257,7 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & END SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& - & BLAS_ID, TRANSA, TRANSB, & + & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & @@ -258,6 +266,7 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& & CARRAY, LDC, OFFSETC, & & BATCHCOUNT, STREAM, ALLOC) USE ISO_C_BINDING, ONLY: C_INT, C_CHAR, C_LONG, C_LOC + INTEGER(KIND=C_INT), INTENT(IN) :: RESOL_ID INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB INTEGER(KIND=JPIM) :: M @@ -286,7 +295,7 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) #endif CALL HIP_SGEMM_GROUPED( & - & BLAS_ID, TRANSA, TRANSB, & + & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & & M, N, K, & & ALPHA, & & AARRAY, LDA, OFFSETA, & diff --git a/src/trans/gpu/algor/hicfft.hip.cpp b/src/trans/gpu/algor/hicfft.hip.cpp index 8278d9a4b..a1329372c 100644 --- a/src/trans/gpu/algor/hicfft.hip.cpp +++ b/src/trans/gpu/algor/hicfft.hip.cpp @@ -1,5 +1,8 @@ #include "hicfft.h" +#include +#include + #include "growing_allocator.h" #define fftSafeCall(err) __fftSafeCall(err, __FILE__, __LINE__) @@ -43,57 +46,100 @@ template class hicfft_plan { real *data_real_l = &data_real[offset]; cmplx *data_complex_l = &data_complex[offset / 2]; if constexpr (Direction == HIPFFT_R2C) - fftSafeCall(hipfftExecR2C(handle, data_real_l, data_complex_l)); + fftSafeCall(hipfftExecR2C(*handle_ptr, data_real_l, data_complex_l)); else if constexpr (Direction == HIPFFT_C2R) - fftSafeCall(hipfftExecC2R(handle, data_complex_l, data_real_l)); + fftSafeCall(hipfftExecC2R(*handle_ptr, data_complex_l, data_real_l)); else if constexpr (Direction == HIPFFT_D2Z) - fftSafeCall(hipfftExecD2Z(handle, data_real_l, data_complex_l)); + fftSafeCall(hipfftExecD2Z(*handle_ptr, data_real_l, data_complex_l)); else if constexpr (Direction == HIPFFT_Z2D) - fftSafeCall(hipfftExecZ2D(handle, data_complex_l, data_real_l)); + fftSafeCall(hipfftExecZ2D(*handle_ptr, data_complex_l, data_real_l)); } void set_stream(hipStream_t stream) { - fftSafeCall(hipfftSetStream(handle, stream)); + fftSafeCall(hipfftSetStream(*handle_ptr, stream)); } hicfft_plan(hipfftHandle handle_, int offset_) - : handle(handle_), offset(offset_) {} + : handle_ptr(new hipfftHandle{handle_}, + [](auto ptr) { + fftSafeCall(cufftDestroy(*ptr)); + delete ptr; + }), + offset(offset_) {} private: - hipfftHandle handle; + std::shared_ptr handle_ptr; int offset; }; +struct cache_key { + int resol_id; + int kfield; + bool operator==(const cache_key &other) const { + return resol_id == other.resol_id && kfield == other.kfield; + } + cache_key(int resol_id_, int kfield_) + : resol_id(resol_id_), kfield(kfield_) {} +}; +} // namespace + +template <> struct std::hash { + std::size_t operator()(const cache_key &k) const { + return k.resol_id * 10000 + k.kfield; + } +}; + +namespace { // kfield -> handles template auto &get_fft_plan_cache() { - static std::unordered_map>> + static std::unordered_map>> fftPlansCache; return fftPlansCache; } // kfield -> graphs template auto &get_graph_cache() { - static std::unordered_map graphCache; + static std::unordered_map> + graphCache; return graphCache; } // kfield -> ptrs template auto &get_ptr_cache() { using real = typename Type::real; using cmplx = typename Type::cmplx; - static std::unordered_map> ptrCache; + static std::unordered_map> ptrCache; return ptrCache; } template -void free_fft_cache(float *, size_t) { +void free_fft_graph_cache(float *, size_t) { get_graph_cache().clear(); get_ptr_cache().clear(); } +template +void erase_resol_from_cache(Cache &cache, int resol_id) { + // Note that in C++20 this could also be std::erase_if + int erased = 0; + for (auto it = cache.begin(); it != cache.end();) { + if (it->first.resol_id == resol_id) { + it = cache.erase(it); + ++erased; + } else + ++it; + } +} +template +void erase_from_caches(int resol_id) { + erase_resol_from_cache(get_fft_plan_cache(), resol_id); + erase_resol_from_cache(get_graph_cache(), resol_id); + erase_resol_from_cache(get_ptr_cache(), resol_id); +} template -std::vector> plan_all(int kfield, int *loens, - int nfft, int *offsets) { +std::vector> +plan_all(int resol_id, int kfield, int *loens, int nfft, int *offsets) { static constexpr bool is_forward = Direction == HIPFFT_R2C || Direction == HIPFFT_D2Z; - auto key = kfield; + auto key = cache_key{resol_id, kfield}; auto &fftPlansCache = get_fft_plan_cache(); auto fftPlans = fftPlansCache.find(key); if (fftPlans == fftPlansCache.end()) { @@ -104,7 +150,6 @@ std::vector> plan_all(int kfield, int *loens, int nloen = loens[i]; hipfftHandle plan; - fftSafeCall(hipfftCreate(&plan)); int dist = offsets[i + 1] - offsets[i]; int embed[] = {1}; fftSafeCall(hipfftPlanMany( @@ -119,17 +164,18 @@ std::vector> plan_all(int kfield, int *loens, template void run_group_graph(typename Type::real *data_real, - typename Type::cmplx *data_complex, int kfield, int *loens, - int *offsets, int nfft, void *growing_allocator) { + typename Type::cmplx *data_complex, int resol_id, + int kfield, int *loens, int *offsets, int nfft, + void *growing_allocator) { growing_allocator_register_free_c(growing_allocator, - free_fft_cache); + free_fft_graph_cache); // if the pointers are changed, we need to update the graph auto &ptrCache = get_ptr_cache(); // kfield -> ptrs auto &graphCache = get_graph_cache(); // kfield -> graphs - auto key = kfield; + auto key = cache_key{resol_id, kfield}; auto ptrs = ptrCache.find(key); if (ptrs != ptrCache.end() && (ptrs->second.first != data_real || ptrs->second.second != data_complex)) { @@ -138,7 +184,6 @@ void run_group_graph(typename Type::real *data_real, // we should cache this... std::cout << "WARNING FFT: POINTER CHANGE --> THIS MIGHT BE SLOW" << std::endl; - HIC_CHECK(hipGraphExecDestroy(graphCache[key])); graphCache.erase(key); ptrCache.erase(key); } @@ -146,7 +191,8 @@ void run_group_graph(typename Type::real *data_real, auto graph = graphCache.find(key); if (graph == graphCache.end()) { // this graph does not exist yet - auto plans = plan_all(kfield, loens, nfft, offsets); + auto plans = + plan_all(resol_id, kfield, loens, nfft, offsets); // create a temporary stream hipStream_t stream; @@ -172,19 +218,24 @@ void run_group_graph(typename Type::real *data_real, HIC_CHECK(hipStreamDestroy(stream)); HIC_CHECK(hipGraphDestroy(new_graph)); - graphCache.insert({key, instance}); + graphCache.insert({key, std::shared_ptr( + new hipGraphExec_t{instance}, [](auto ptr) { + HIC_CHECK(hipGraphExecDestroy(*ptr)); + delete ptr; + })}); ptrCache.insert({key, std::make_pair(data_real, data_complex)}); } - HIC_CHECK(hipGraphLaunch(graphCache.at(key), 0)); + HIC_CHECK(hipGraphLaunch(*graphCache.at(key), 0)); HIC_CHECK(hipDeviceSynchronize()); } template void run_group(typename Type::real *data_real, - typename Type::cmplx *data_complex, int kfield, int *loens, - int *offsets, int nfft, void *growing_allocator) { - auto plans = plan_all(kfield, loens, nfft, offsets); + typename Type::cmplx *data_complex, int resol_id, int kfield, + int *loens, int *offsets, int nfft, void *growing_allocator) { + auto plans = + plan_all(resol_id, kfield, loens, nfft, offsets); for (auto &plan : plans) plan.exec(data_real, data_complex); @@ -199,29 +250,37 @@ extern "C" { #define RUN run_group #endif void execute_dir_fft_float(float *data_real, hipfftComplex *data_complex, - int kfield, int *loens, int *offsets, int nfft, - void *growing_allocator) { - RUN(data_real, data_complex, kfield, loens, offsets, nfft, - growing_allocator); + int resol_id, int kfield, int *loens, int *offsets, + int nfft, void *growing_allocator) { + RUN(data_real, data_complex, resol_id, kfield, loens, + offsets, nfft, growing_allocator); } void execute_inv_fft_float(hipfftComplex *data_complex, float *data_real, - int kfield, int *loens, int *offsets, int nfft, - void *growing_allocator) { - RUN(data_real, data_complex, kfield, loens, offsets, nfft, - growing_allocator); + int resol_id, int kfield, int *loens, int *offsets, + int nfft, void *growing_allocator) { + RUN(data_real, data_complex, resol_id, kfield, loens, + offsets, nfft, growing_allocator); } void execute_dir_fft_double(double *data_real, - hipfftDoubleComplex *data_complex, int kfield, - int *loens, int *offsets, int nfft, + hipfftDoubleComplex *data_complex, int resol_id, + int kfield, int *loens, int *offsets, int nfft, void *growing_allocator) { - RUN(data_real, data_complex, kfield, loens, offsets, nfft, - growing_allocator); + RUN(data_real, data_complex, resol_id, kfield, loens, + offsets, nfft, growing_allocator); } void execute_inv_fft_double(hipfftDoubleComplex *data_complex, - double *data_real, int kfield, int *loens, - int *offsets, int nfft, void *growing_allocator) { - RUN(data_real, data_complex, kfield, loens, offsets, nfft, - growing_allocator); + double *data_real, int resol_id, int kfield, + int *loens, int *offsets, int nfft, + void *growing_allocator) { + RUN(data_real, data_complex, resol_id, kfield, loens, + offsets, nfft, growing_allocator); } #undef RUN + +void clean_fft(int resol_id) { + erase_from_caches(resol_id); + erase_from_caches(resol_id); + erase_from_caches(resol_id); + erase_from_caches(resol_id); +} } diff --git a/src/trans/gpu/external/trans_end.F90 b/src/trans/gpu/external/trans_end.F90 index 79e204dea..366c759a3 100755 --- a/src/trans/gpu/external/trans_end.F90 +++ b/src/trans/gpu/external/trans_end.F90 @@ -55,6 +55,7 @@ SUBROUTINE TRANS_END(CDMODE) USE TPM_CTL, ONLY: C, CTL_RESOL USE TPM_FLT, ONLY: S, FLT_RESOL USE TPM_TRANS, ONLY: GROWING_ALLOCATION +USE GROWING_ALLOCATOR_MOD,ONLY: DESTROY_GROWING_ALLOCATOR USE EQ_REGIONS_MOD, ONLY: N_REGIONS USE SET_RESOL_MOD, ONLY: SET_RESOL USE DEALLOC_RESOL_MOD, ONLY: DEALLOC_RESOL @@ -82,6 +83,8 @@ SUBROUTINE TRANS_END(CDMODE) DEALLOCATE(LENABLED) ENDIF + CALL DESTROY_GROWING_ALLOCATOR(GROWING_ALLOCATION) + NULLIFY(R) IF( ALLOCATED(DIM_RESOL) ) DEALLOCATE(DIM_RESOL) diff --git a/src/trans/gpu/internal/dealloc_resol_mod.F90 b/src/trans/gpu/internal/dealloc_resol_mod.F90 index 3b84e60e0..c59c5c3f5 100755 --- a/src/trans/gpu/internal/dealloc_resol_mod.F90 +++ b/src/trans/gpu/internal/dealloc_resol_mod.F90 @@ -49,6 +49,8 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) USE TPM_GEOMETRY, ONLY: G, GEOM_TYPE USE TPM_FIELDS, ONLY: F, FIELDS_TYPE USE TPM_FIELDS_GPU, ONLY: FG, FIELDS_GPU_TYPE +USE TPM_HICFFT, ONLY: CLEAN_FFT +USE HICBLAS_MOD, ONLY: CLEAN_GEMM USE TPM_FLT, ONLY: S, FLT_TYPE_WRAP USE TPM_CTL, ONLY: C USE SEEFMM_MIX, ONLY: FREE_SEEFMM @@ -121,6 +123,9 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) ENDIF S = S_ + CALL CLEAN_FFT(KRESOL) + CALL CLEAN_GEMM(KRESOL) + LENABLED(KRESOL)=.FALSE. NDEF_RESOL = COUNT(LENABLED) ! Do not stay on a disabled resolution diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index 0b72d6c4b..03ebc5ff1 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -118,7 +118,7 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) CALL GSTATS(430,1) ENDIF CALL GSTATS(413,0) - CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),KFIELD, & + CALL EXECUTE_DIR_FFT(PREEL_REAL(:),PREEL_COMPLEX(:),NCUR_RESOL,KFIELD, & & LOENS=G_NLOEN(D_NPTRLS(MYSETW):D_NPTRLS(MYSETW)+D_NDGL_FS-1), & & OFFSETS=D_NSTAGTF(1:D_NDGL_FS+1),ALLOC=ALLOCATOR%PTR) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index f513cceea..2ef8d868a 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -113,7 +113,7 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) CALL GSTATS(440,1) ENDIF CALL GSTATS(423,0) - CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),KFIELD, & + CALL EXECUTE_INV_FFT(PREEL_COMPLEX(:),PREEL_REAL(:),NCUR_RESOL,KFIELD, & & LOENS=G_NLOEN(D_NPTRLS(MYSETW):D_NPTRLS(MYSETW)+D_NDGL_FS-1), & & OFFSETS=D_NSTAGTF(1:D_NDGL_FS),ALLOC=ALLOCATOR%PTR) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 19482dae6..110f00dc3 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -220,7 +220,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC HOST_DATA USE_DEVICE(ZAA,ZINPA,ZOUT) #endif CALL HIP_GEMM( & - & 21, & ! unique identifier + & NCUR_RESOL, 21, & ! unique identifier & 'N', 'N', & & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & @@ -335,7 +335,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC HOST_DATA USE_DEVICE(ZAS,ZINPS,ZOUT) #endif CALL HIP_GEMM( & - & 22, & ! unique identifier + & NCUR_RESOL, 22, & ! unique identifier & 'N', 'N', & & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index b678f9dfd..0ef6b569e 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -274,7 +274,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !$ACC HOST_DATA USE_DEVICE(ZAA,ZINP,ZOUTA) #endif CALL HIP_GEMM( & - & 11, & ! unique identifier + & NCUR_RESOL, 11, & ! unique identifier & 'N', 'T', & & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & @@ -408,7 +408,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !$ACC HOST_DATA USE_DEVICE(ZAS,ZINP,ZOUTS) #endif CALL HIP_GEMM( & - & 12, & ! unique identifier + & NCUR_RESOL, 12, & ! unique identifier & 'N', 'T', & & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & diff --git a/src/trans/gpu/internal/tpm_hicfft.F90 b/src/trans/gpu/internal/tpm_hicfft.F90 index 9ae63df00..abb06a9f1 100755 --- a/src/trans/gpu/internal/tpm_hicfft.F90 +++ b/src/trans/gpu/internal/tpm_hicfft.F90 @@ -28,6 +28,7 @@ MODULE TPM_HICFFT PRIVATE PUBLIC EXECUTE_DIR_FFT, EXECUTE_INV_FFT + PUBLIC CLEAN_FFT INTERFACE EXECUTE_DIR_FFT MODULE PROCEDURE EXECUTE_DIR_FFT_FLOAT,EXECUTE_DIR_FFT_DOUBLE @@ -37,6 +38,13 @@ MODULE TPM_HICFFT MODULE PROCEDURE EXECUTE_INV_FFT_FLOAT,EXECUTE_INV_FFT_DOUBLE END INTERFACE +INTERFACE + SUBROUTINE CLEAN_FFT(RESOL_ID) BIND(C, NAME="clean_fft") + USE ISO_C_BINDING + INTEGER(KIND=C_INT), INTENT(IN), VALUE :: RESOL_ID + END SUBROUTINE +END INTERFACE + ! ------------------------------------------------------------------ @@ -44,22 +52,24 @@ MODULE TPM_HICFFT ! ------------------------------------------------------------------ -SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,ALLOC) +SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,ALLOC) USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(:) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(:) + INTEGER(KIND=JPIM),INTENT(IN) :: RESOL_ID INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE - SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_dir_fft_float") USE ISO_C_BINDING REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(*) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: RESOL_ID INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -70,28 +80,30 @@ SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) #endif - CALL EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) + CALL EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) #ifdef ACCGPU !$ACC END HOST_DATA #endif END SUBROUTINE EXECUTE_DIR_FFT_FLOAT -SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,ALLOC) +SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,ALLOC) USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(:) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(:) + INTEGER(KIND=JPIM),INTENT(IN) :: RESOL_ID INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE - SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_dir_fft_double") USE ISO_C_BINDING, ONLY: C_DOUBLE, C_INT, C_PTR REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(*) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: RESOL_ID INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -102,29 +114,31 @@ SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSET #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) #endif - CALL EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) + CALL EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) #ifdef ACCGPU !$ACC END HOST_DATA #endif END SUBROUTINE EXECUTE_DIR_FFT_DOUBLE -SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,ALLOC) +SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,ALLOC) USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(:) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(:) + INTEGER(KIND=JPIM),INTENT(IN) :: RESOL_ID INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE - SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_inv_fft_float") USE ISO_C_BINDING, ONLY: C_FLOAT, C_INT, C_PTR REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(*) REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: RESOL_ID INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -135,28 +149,30 @@ SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) #endif - CALL EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) + CALL EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) #ifdef ACCGPU !$ACC END HOST_DATA #endif END SUBROUTINE -SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,ALLOC) +SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,ALLOC) USE EC_PARKIND ,ONLY : JPIM IMPLICIT NONE REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(:) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(:) + INTEGER(KIND=JPIM),INTENT(IN) :: RESOL_ID INTEGER(KIND=JPIM),INTENT(IN) :: KFIELD INTEGER(KIND=JPIM),INTENT(IN) :: LOENS(:), OFFSETS(:) TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC INTERFACE - SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & + SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_inv_fft_double") USE ISO_C_BINDING, ONLY: C_DOUBLE, C_INT, C_PTR REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(*) REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(*) + INTEGER(KIND=C_INT),INTENT(IN),VALUE :: RESOL_ID INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -167,7 +183,7 @@ SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSET #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) #endif - CALL EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) + CALL EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,SIZE(LOENS),C_LOC(ALLOC)) #ifdef ACCGPU !$ACC END HOST_DATA #endif From 3cde5bc19a797a92a22874d726fe35056da7c533 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 3 Oct 2024 05:07:05 -0700 Subject: [PATCH 03/10] modify ectrans-benchmark temporarily --- src/programs/ectrans-benchmark.F90 | 644 +++++++++++++++++++++++++++-- 1 file changed, 621 insertions(+), 23 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index f9a12a8b3..cc79c3c0f 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -200,6 +200,7 @@ program ectrans_benchmark integer(kind=jpim) :: jend_uder_EW = 0 integer(kind=jpim) :: jbegin_vder_EW = 0 integer(kind=jpim) :: jend_vder_EW = 0 +integer(kind=jpim) :: nresol1, nresol2 logical :: ldump_values = .false. @@ -216,6 +217,8 @@ program ectrans_benchmark #include "setup_trans.h" #include "inv_trans.h" #include "dir_trans.h" +#include "trans_release.h" +#include "trans_end.h" #include "trans_inq.h" #include "specnorm.h" #include "abor1.intfb.h" @@ -397,12 +400,601 @@ program ectrans_benchmark call set_ectrans_gpu_nflev(nflevl) ! We pass nflevl via environment variable in order not to change API ! In long run, ectrans should grow its internal buffers automatically +call setup_trans(ksmax=24, kdgl=38, kloen=nloen, ldsplit=.true., & + & lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & + & lduseflt=luseflt,kresol=nresol2) +call setup_trans(ksmax=nsmax, kdgl=ndgl, kloen=nloen, ldsplit=.true., & + & lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & + & lduseflt=luseflt,kresol=nresol1) +call gstats(2, 1) + +call trans_inq(kspec2=nspec2, kspec2g=nspec2g, kgptot=ngptot, kgptotg=ngptotg,kresol=nresol1) + +if (nproma == 0) then ! no blocking (default when not specified) + nproma = ngptot +endif + +! Calculate number of NPROMA blocks +ngpblks = (ngptot - 1)/nproma+1 + +!=================================================================================================== +! Print information before starting +!=================================================================================================== + +! Print configuration details +if (verbosity >= 0 .and. myproc == 1) then + write(nout,'(" ")') + write(nout,'(a)')'======= Start of runtime parameters =======' + write(nout,'(" ")') + write(nout,'("nsmax ",i0)') nsmax + write(nout,'("grid ",a)') trim(cgrid) + write(nout,'("ndgl ",i0)') ndgl + write(nout,'("nproc ",i0)') nproc + write(nout,'("nthread ",i0)') nthread + write(nout,'("nprgpns ",i0)') nprgpns + write(nout,'("nprgpew ",i0)') nprgpew + write(nout,'("nprtrw ",i0)') nprtrw + write(nout,'("nprtrv ",i0)') nprtrv + write(nout,'("ngptot ",i0)') ngptot + write(nout,'("ngptotg ",i0)') ngptotg + write(nout,'("nfld ",i0)') nfld + write(nout,'("nlev ",i0)') nlev + write(nout,'("nproma ",i0)') nproma + write(nout,'("ngpblks ",i0)') ngpblks + write(nout,'("nspec2 ",i0)') nspec2 + write(nout,'("nspec2g ",i0)') nspec2g + write(nout,'("luseflt ",l1)') luseflt + write(nout,'("nopt_mem_tr",i0)') nopt_mem_tr + write(nout,'("lvordiv ",l1)') lvordiv + write(nout,'("lscders ",l1)') lscders + write(nout,'("luvders ",l1)') luvders + write(nout,'(" ")') + write(nout,'(a)') '======= End of runtime parameters =======' + write(nout,'(" ")') +end if + +!=================================================================================================== +! Allocate and Initialize spectral arrays +!=================================================================================================== + +! Allocate spectral arrays +! Try to mimick IFS layout as much as possible +nullify(zspvor) +nullify(zspdiv) +nullify(zspsc3a) +allocate(sp3d(nflevl,nspec2,2+nfld)) +allocate(zspsc2(1,nspec2)) + +call initialize_spectral_arrays(nsmax, zspsc2, sp3d) + +! Point convenience variables to storage variable sp3d +zspvor => sp3d(:,:,1) +zspdiv => sp3d(:,:,2) +zspsc3a => sp3d(:,:,3:3+(nfld-1)) + +!=================================================================================================== +! Allocate gridpoint arrays +!=================================================================================================== + +allocate(ivset(nflevg)) + +! Compute spectral distribution +ilev = 0 +do jb = 1, nprtrv + do jlev=1, numll(jb) + ilev = ilev + 1 + ivset(ilev) = jb + enddo +enddo + +! Allocate grid-point arrays +if (lvordiv) then + jbegin_uv = 1 + jend_uv = 2 +endif +if (luvders) then + jbegin_uder_EW = jend_uv + 1 + jend_uder_EW = jbegin_uder_EW + 1 + jbegin_vder_EW = jend_uder_EW + 1 + jend_vder_EW = jbegin_vder_EW + 1 +else + jbegin_uder_EW = jend_uv + jend_uder_EW = jend_uv + jbegin_vder_EW = jend_uv + jend_vder_EW = jend_uv +endif + +jbegin_sc = jend_vder_EW + 1 +jend_sc = jend_vder_EW + nfld + +if (lscders) then + ndimgmvs = 3 + jbegin_scder_NS = jend_sc + 1 + jend_scder_NS = jend_sc + nfld + jbegin_scder_EW = jend_scder_NS + 1 + jend_scder_EW = jend_scder_NS + nfld +else + ndimgmvs = 1 + jbegin_scder_NS = jend_sc + jend_scder_NS = jend_sc + jbegin_scder_EW = jend_sc + jend_scder_EW = jend_sc +endif + +ndimgmv = jend_scder_EW + +allocate(zgmv(nproma,nflevg,ndimgmv,ngpblks)) +allocate(zgmvs(nproma,ndimgmvs,ngpblks)) + +zgpuv => zgmv(:,:,1:jend_vder_EW,:) +zgp3a => zgmv(:,:,jbegin_sc:jend_scder_EW,:) +zgp2 => zgmvs(:,:,:) + +!=================================================================================================== +! Allocate norm arrays +!=================================================================================================== + +if (lprint_norms .or. ncheck > 0) then + allocate(znormsp(1)) + allocate(znormsp1(1)) + allocate(znormvor(nflevg)) + allocate(znormvor1(nflevg)) + allocate(znormdiv(nflevg)) + allocate(znormdiv1(nflevg)) + allocate(znormt(nflevg)) + allocate(znormt1(nflevg)) + + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor1, kvset=ivset(1:nflevg), kresol=nresol1) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv1, kvset=ivset(1:nflevg), kresol=nresol1) + if (nfld > 0) then + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt1, kvset=ivset(1:nflevg), kresol=nresol1) + endif + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp1, kvset=ivsetsc, kresol=nresol1) + + if (verbosity >= 1 .and. myproc == 1) then + do ifld = 1, nflevg + write(nout,'("norm zspvor( ",i4,",:) = ",f20.15)') ifld, znormvor1(ifld) + write(nout,'("0x",Z16.16)') znormvor1(ifld) + enddo + do ifld = 1, nflevg + write(nout,'("norm zspdiv( ",i4,",:) = ",f20.15)') ifld, znormdiv1(ifld) + write(nout,'("0x",Z16.16)') znormdiv1(ifld) + enddo + if (nfld > 0) then + do ifld = 1, nflevg + write(nout,'("norm zspsc3a(",i4,",:,1) = ",f20.15)') ifld, znormt1(ifld) + write(nout,'("0x",Z16.16)') znormt1(ifld) + enddo + endif + do ifld = 1, 1 + write(nout,'("norm zspsc2( ",i4,",:) = ",f20.15)') ifld, znormsp1(ifld) + write(nout,'("0x",Z16.16)') znormsp1(ifld) + enddo + endif +endif + +!=================================================================================================== +! Setup timers +!=================================================================================================== + +ztinit = (timef() - ztinit)/1000.0_jprd + +if (verbosity >= 0 .and. myproc == 1) then + write(nout,'(" ")') + write(nout,'(a,i6,a,f9.2,a)') "ectrans_benchmark initialisation, on",nproc,& + & " tasks, took",ztinit," sec" + write(nout,'(" ")') +endif + +if (iters <= 0) call abor1('ectrans_benchmark:iters <= 0') + +allocate(ztstep(iters+2)) +allocate(ztstep1(iters+2)) +allocate(ztstep2(iters+2)) + +ztstepavg = 0._jprd +ztstepmax = 0._jprd +ztstepmin = 9999999999999999._jprd +ztstepavg1 = 0._jprd +ztstepmax1 = 0._jprd +ztstepmin1 = 9999999999999999._jprd +ztstepavg2 = 0._jprd +ztstepmax2 = 0._jprd +ztstepmin2 = 9999999999999999._jprd + +if (verbosity >= 1 .and. myproc == 1) then + write(nout,'(a)') '======= Start of spectral transforms =======' + write(nout,'(" ")') +endif + +ztloop = timef() + +!=================================================================================================== +! Do spectral transform loop +!=================================================================================================== + +gstats_lstats = .false. + +write(nout,'(a,i5,a)') 'Running for ', iters, ' iterations with 2 extra warm-up iterations' +write(nout,'(" ")') + +do jstep = 1, iters+2 + if (jstep == 3) gstats_lstats = .true. + + call gstats(3,0) + ztstep(jstep) = timef() + + !================================================================================================= + ! Do inverse transform + !================================================================================================= + + ztstep1(jstep) = timef() + call gstats(4,0) + if (lvordiv) then + call inv_trans(kresol=nresol1, kproma=nproma, & + & pspsc2=zspsc2, & ! spectral surface pressure + & pspvor=zspvor, & ! spectral vorticity + & pspdiv=zspdiv, & ! spectral divergence + & pspsc3a=zspsc3a, & ! spectral scalars + & ldscders=lscders, & + & ldvorgp=.false., & ! no gridpoint vorticity + & lddivgp=.false., & ! no gridpoint divergence + & lduvder=luvders, & + & kvsetuv=ivset, & + & kvsetsc2=ivsetsc, & + & kvsetsc3a=ivset, & + & pgp2=zgp2, & + & pgpuv=zgpuv, & + & pgp3a=zgp3a) + else + call inv_trans(kresol=nresol1, kproma=nproma, & + & pspsc2=zspsc2, & ! spectral surface pressure + & pspsc3a=zspsc3a, & ! spectral scalars + & ldscders=lscders, & ! scalar derivatives + & kvsetsc2=ivsetsc, & + & kvsetsc3a=ivset, & + & pgp2=zgp2, & + & pgp3a=zgp3a) + endif + call gstats(4,1) + + ztstep1(jstep) = (timef() - ztstep1(jstep))/1000.0_jprd + + !================================================================================================= + ! While in grid point space, dump the values to disk, for debugging only + !================================================================================================= + + if (ldump_values) then + ! dump a field to a binary file + call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgp2(:,1,:), 'S', noutdump) + call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgpuv(:,nflevg,1,:), 'U', noutdump) + call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgpuv(:,nflevg,2,:), 'V', noutdump) + call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgp3a(:,nflevg,1,:), 'T', noutdump) + endif + + !================================================================================================= + ! Do direct transform + !================================================================================================= + + ztstep2(jstep) = timef() + + call gstats(5,0) + if (lvordiv) then + call dir_trans(kresol=nresol1, kproma=nproma, & + & pgp2=zgmvs(:,1:1,:), & + & pgpuv=zgpuv(:,:,1:2,:), & + & pgp3a=zgp3a(:,:,1:nfld,:), & + & pspvor=zspvor, & + & pspdiv=zspdiv, & + & pspsc2=zspsc2, & + & pspsc3a=zspsc3a, & + & kvsetuv=ivset, & + & kvsetsc2=ivsetsc, & + & kvsetsc3a=ivset) + else + call dir_trans(kresol=nresol1, kproma=nproma, & + & pgp2=zgmvs(:,1:1,:), & + & pgp3a=zgp3a(:,:,1:nfld,:), & + & pspsc2=zspsc2, & + & pspsc3a=zspsc3a, & + & kvsetsc2=ivsetsc, & + & kvsetsc3a=ivset) + endif + call gstats(5,1) + ztstep2(jstep) = (timef() - ztstep2(jstep))/1000.0_jprd + + !================================================================================================= + ! Calculate timings + !================================================================================================= + + ztstep(jstep) = (timef() - ztstep(jstep))/1000.0_jprd + + ztstepavg = ztstepavg + ztstep(jstep) + ztstepmin = min(ztstep(jstep), ztstepmin) + ztstepmax = max(ztstep(jstep), ztstepmax) + + ztstepavg1 = ztstepavg1 + ztstep1(jstep) + ztstepmin1 = min(ztstep1(jstep), ztstepmin1) + ztstepmax1 = max(ztstep1(jstep), ztstepmax1) + + ztstepavg2 = ztstepavg2 + ztstep2(jstep) + ztstepmin2 = min(ztstep2(jstep), ztstepmin2) + ztstepmax2 = max(ztstep2(jstep), ztstepmax2) + + !================================================================================================= + ! Print norms + !================================================================================================= + + if (lprint_norms) then + call gstats(6,0) + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc(1:1), kresol=nresol1) + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset(1:nflevg), kresol=nresol1) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset(1:nflevg), kresol=nresol1) + if (nfld > 0) then + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset(1:nflevg), kresol=nresol1) + endif + + ! Surface pressure + if (myproc == 1) then + zmaxerr(:) = -999.0 + do ifld = 1, 1 + write(nout,*) "znormsp", znormsp + flush(nout) + zerr(1) = abs(znormsp1(ifld)/znormsp(ifld) - 1.0_jprb) + zmaxerr(1) = max(zmaxerr(1), zerr(1)) + enddo + ! Divergence + do ifld = 1, nflevg + zerr(2) = abs(znormdiv1(ifld)/znormdiv(ifld) - 1.0_jprb) + zmaxerr(2) = max(zmaxerr(2), zerr(2)) + enddo + ! Vorticity + do ifld = 1, nflevg + zerr(3) = abs(znormvor1(ifld)/znormvor(ifld) - 1.0_jprb) + zmaxerr(3) = max(zmaxerr(3),zerr(3)) + enddo + ! Temperature + if (nfld > 0) then + do ifld = 1, nflevg + zerr(4) = abs(znormt1(ifld)/znormt(ifld) - 1.0_jprb) + zmaxerr(4) = max(zmaxerr(4), zerr(4)) + enddo + write(nout,'("time step ",i6," took", f8.4," | zspvor max err="e10.3,& + & " | zspdiv max err="e10.3," | zspsc3a max err="e10.3," | zspsc2 max err="e10.3)') & + & jstep, ztstep(jstep), zmaxerr(3), zmaxerr(2), zmaxerr(4), zmaxerr(1) + else + write(nout,'("time step ",i6," took", f8.4," | zspvor max err="e10.3,& + & " | zspdiv max err="e10.3," | zspsc2 max err="e10.3)') & + & jstep, ztstep(jstep), zmaxerr(3), zmaxerr(2), zmaxerr(1) + endif + endif + call gstats(6,1) + else + write(nout,'("Time step ",i6," took", f8.4)') jstep, ztstep(jstep) + endif + call gstats(3,1) +enddo + +!=================================================================================================== + +ztloop = (timef() - ztloop)/1000.0_jprd + +write(nout,'(" ")') +write(nout,'(a)') '======= End of spectral transforms =======' +write(nout,'(" ")') + +if (lprint_norms .or. ncheck > 0) then + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset, kresol=nresol1) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset, kresol=nresol1) + if (nfld > 0) then + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset, kresol=nresol1) + endif + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc, kresol=nresol1) + + if (myproc == 1) then + zmaxerr(:) = -999.0 + do ifld = 1, nflevg + zerr(3) = abs(real(znormvor1(ifld),kind=jprd)/real(znormvor(ifld),kind=jprd) - 1.0_jprd) + zmaxerr(3) = max(zmaxerr(3), zerr(3)) + if (verbosity >= 1) then + write(nout,'("norm zspvor( ",i4,") = ",f20.15," error = ",e10.3)') ifld, znormvor(ifld), zerr(3) + write(nout,'("0x",Z16.16)') znormvor(ifld) + endif + enddo + do ifld = 1, nflevg + zerr(2) = abs(real(znormdiv1(ifld),kind=jprd)/real(znormdiv(ifld),kind=jprd) - 1.0d0) + zmaxerr(2) = max(zmaxerr(2),zerr(2)) + if (verbosity >= 1) then + write(nout,'("norm zspdiv( ",i4,",:) = ",f20.15," error = ",e10.3)') ifld, znormdiv(ifld), zerr(2) + write(nout,'("0x",Z16.16)') znormdiv(ifld) + endif + enddo + if (nfld > 0) then + do ifld = 1, nflevg + zerr(4) = abs(real(znormt1(ifld),kind=jprd)/real(znormt(ifld),kind=jprd) - 1.0d0) + zmaxerr(4) = max(zmaxerr(4), zerr(4)) + if (verbosity >= 1) then + write(nout,'("norm zspsc3a(",i4,",:,1) = ",f20.15," error = ",e10.3)') ifld, znormt(ifld), zerr(4) + write(nout,'("0x",Z16.16)') znormt(ifld) + endif + enddo + endif + do ifld = 1, 1 + zerr(1) = abs(real(znormsp1(ifld),kind=jprd)/real(znormsp(ifld),kind=jprd) - 1.0d0) + zmaxerr(1) = max(zmaxerr(1), zerr(1)) + if (verbosity >= 1) then + write(nout,'("norm zspsc2( ",i4,",:) = ",f20.15," error = ",e10.3)') ifld, znormsp(ifld), zerr(1) + write(nout,'("0x",Z16.16)') znormsp(ifld) + endif + enddo + + ! maximum error across all fields + if (nfld > 0) then + zmaxerrg = max(zmaxerr(1), zmaxerr(2), zmaxerr(3), zmaxerr(4)) + else + zmaxerrg = max(zmaxerr(1), zmaxerr(2), zmaxerr(3)) + endif + + if (verbosity >= 1) write(nout,*) + write(nout,'("max error zspvor(1:nlev,:) = ",e10.3)') zmaxerr(3) + write(nout,'("max error zspdiv(1:nlev,:) = ",e10.3)') zmaxerr(2) + if (nfld > 0) write(nout,'("max error zspsc3a(1:nlev,:,1) = ",e10.3)') zmaxerr(4) + write(nout,'("max error zspsc2(1:1,:) = ",e10.3)') zmaxerr(1) + write(nout,*) + write(nout,'("max error combined = = ",e10.3)') zmaxerrg + write(nout,*) + endif + if (ncheck > 0) then + ierr = 0 + if (myproc == 1) then + ! If the maximum spectral norm error across all fields is greater than 100 times the machine + ! epsilon, fail the test + if (zmaxerrg > real(ncheck, jprb) * epsilon(1.0_jprb)) then + write(nout, '(a)') '*******************************' + write(nout, '(a)') 'Correctness test failed' + write(nout, '(a,1e9.2)') 'Maximum spectral norm error = ', zmaxerrg + write(nout, '(a,1e9.2)') 'Error tolerance = ', real(ncheck, jprb) * epsilon(1.0_jprb) + write(nout, '(a)') '*******************************' + ierr = 1 + endif + endif + + ! Root rank broadcasts the correctness checker result to the other ranks + if (luse_mpi) then + call mpl_broadcast(ierr,kroot=1,ktag=1) + endif + + ! Halt if correctness checker failed + if (ierr == 1) then + error stop + endif + endif +endif + +if (luse_mpi) then + call mpl_allreduce(ztloop, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstep, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepavg, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepmax, 'max', ldreprod=.false.) + call mpl_allreduce(ztstepmin, 'min', ldreprod=.false.) + + call mpl_allreduce(ztstep1, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepavg1, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepmax1, 'max', ldreprod=.false.) + call mpl_allreduce(ztstepmin1, 'min', ldreprod=.false.) + + call mpl_allreduce(ztstep2, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepavg2, 'sum', ldreprod=.false.) + call mpl_allreduce(ztstepmax2, 'max', ldreprod=.false.) + call mpl_allreduce(ztstepmin2, 'min', ldreprod=.false.) +endif + +ztstepavg = (ztstepavg/real(nproc,jprb))/real(iters,jprd) +ztloop = ztloop/real(nproc,jprd) +ztstep(:) = ztstep(:)/real(nproc,jprd) +ztstepmed = get_median(ztstep) + +ztstepavg1 = (ztstepavg1/real(nproc,jprb))/real(iters,jprd) +ztstep1(:) = ztstep1(:)/real(nproc,jprd) +ztstepmed1 = get_median(ztstep1) + +ztstepavg2 = (ztstepavg2/real(nproc,jprb))/real(iters,jprd) +ztstep2(:) = ztstep2(:)/real(nproc,jprd) +ztstepmed2 = get_median(ztstep2) + +write(nout,'(a)') '======= Start of time step stats =======' +write(nout,'(" ")') +write(nout,'("Inverse transforms")') +write(nout,'("------------------")') +write(nout,'("avg (s): ",f8.4)') ztstepavg1 +write(nout,'("min (s): ",f8.4)') ztstepmin1 +write(nout,'("max (s): ",f8.4)') ztstepmax1 +write(nout,'("med (s): ",f8.4)') ztstepmed1 +write(nout,'(" ")') +write(nout,'("Direct transforms")') +write(nout,'("-----------------")') +write(nout,'("avg (s): ",f8.4)') ztstepavg2 +write(nout,'("min (s): ",f8.4)') ztstepmin2 +write(nout,'("max (s): ",f8.4)') ztstepmax2 +write(nout,'("med (s): ",f8.4)') ztstepmed2 +write(nout,'(" ")') +write(nout,'("Inverse-direct transforms")') +write(nout,'("-------------------------")') +write(nout,'("avg (s): ",f8.4)') ztstepavg +write(nout,'("min (s): ",f8.4)') ztstepmin +write(nout,'("max (s): ",f8.4)') ztstepmax +write(nout,'("med (s): ",f8.4)') ztstepmed +write(nout,'("loop (s): ",f8.4)') ztloop +write(nout,'(" ")') +write(nout,'(a)') '======= End of time step stats =======' +write(nout,'(" ")') + +if (lstack) then + ! Gather stack usage statistics + istack = getstackusage() + if (myproc == 1) then + print 9001, istack + 9001 format("Stack utilisation information",/,& + &"=============================",//,& + &"Task size(bytes)",/,& + &"==== ===========",//,& + &" 1",11x,i10) + + do i = 2, nproc + call mpl_recv(istack, ksource=nprcids(i), ktag=i, cdstring='ectrans_benchmark:') + print '(i4,11x,i10)', i, istack + enddo + else + call mpl_send(istack, kdest=nprcids(1), ktag=myproc, cdstring='ectrans_benchmark:') + endif +endif + + +!=================================================================================================== +! Cleanup +!=================================================================================================== + +deallocate(zgmv) +deallocate(zgmvs) +deallocate(sp3d) +deallocate(zspsc2) +deallocate(ivset) +if (lprint_norms .or. ncheck > 0) then + deallocate(znormsp) + deallocate(znormsp1) + deallocate(znormvor) + deallocate(znormvor1) + deallocate(znormdiv) + deallocate(znormdiv1) + deallocate(znormt) + deallocate(znormt1) +endif +deallocate(ztstep) +deallocate(ztstep1) +deallocate(ztstep2) +deallocate(nloen) + +!=================================================================================================== + +call trans_release(nresol1) +call trans_release(nresol2) +call acc_present_dump + +call trans_end + +call acc_present_dump +cgrid = 'O40' +call parse_grid(cgrid, ndgl, nloen) +nsmax = 39 +call setup_trans0(kout=nout, kerr=nerr, kprintlev=merge(2, 0, verbosity == 1), & + & kmax_resol=nmax_resol, kpromatr=npromatr, kprgpns=nprgpns, kprgpew=nprgpew, & + & kprtrw=nprtrw, ldsync_trans=lsync_trans, & + & ldeq_regions=leq_regions, prad=zra, ldalloperm=.true., ldmpoff=.not.luse_mpi,& + & kopt_memory_tr=nopt_mem_tr) call setup_trans(ksmax=nsmax, kdgl=ndgl, kloen=nloen, ldsplit=.true., & & lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & - & lduseflt=luseflt) -call gstats(2, 1) - -call trans_inq(kspec2=nspec2, kspec2g=nspec2g, kgptot=ngptot, kgptotg=ngptotg) + & lduseflt=luseflt,kresol=nresol1) +call trans_inq(kspec2=nspec2, kspec2g=nspec2g, kgptot=ngptot, kgptotg=ngptotg,kresol=nresol1) if (nproma == 0) then ! no blocking (default when not specified) nproma = ngptot @@ -538,12 +1130,12 @@ program ectrans_benchmark allocate(znormt(nflevg)) allocate(znormt1(nflevg)) - call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor1, kvset=ivset(1:nflevg)) - call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv1, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor1, kvset=ivset(1:nflevg), kresol=nresol1) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv1, kvset=ivset(1:nflevg), kresol=nresol1) if (nfld > 0) then - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt1, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt1, kvset=ivset(1:nflevg), kresol=nresol1) endif - call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp1, kvset=ivsetsc) + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp1, kvset=ivsetsc, kresol=nresol1) if (verbosity >= 1 .and. myproc == 1) then do ifld = 1, nflevg @@ -619,7 +1211,7 @@ program ectrans_benchmark ztstep1(jstep) = timef() call gstats(4,0) if (lvordiv) then - call inv_trans(kresol=1, kproma=nproma, & + call inv_trans(kresol=nresol1, kproma=nproma, & & pspsc2=zspsc2, & ! spectral surface pressure & pspvor=zspvor, & ! spectral vorticity & pspdiv=zspdiv, & ! spectral divergence @@ -635,7 +1227,7 @@ program ectrans_benchmark & pgpuv=zgpuv, & & pgp3a=zgp3a) else - call inv_trans(kresol=1, kproma=nproma, & + call inv_trans(kresol=nresol1, kproma=nproma, & & pspsc2=zspsc2, & ! spectral surface pressure & pspsc3a=zspsc3a, & ! spectral scalars & ldscders=lscders, & ! scalar derivatives @@ -668,7 +1260,7 @@ program ectrans_benchmark call gstats(5,0) if (lvordiv) then - call dir_trans(kresol=1, kproma=nproma, & + call dir_trans(kresol=nresol1, kproma=nproma, & & pgp2=zgmvs(:,1:1,:), & & pgpuv=zgpuv(:,:,1:2,:), & & pgp3a=zgp3a(:,:,1:nfld,:), & @@ -680,7 +1272,7 @@ program ectrans_benchmark & kvsetsc2=ivsetsc, & & kvsetsc3a=ivset) else - call dir_trans(kresol=1, kproma=nproma, & + call dir_trans(kresol=nresol1, kproma=nproma, & & pgp2=zgmvs(:,1:1,:), & & pgp3a=zgp3a(:,:,1:nfld,:), & & pspsc2=zspsc2, & @@ -699,11 +1291,11 @@ program ectrans_benchmark if (lprint_norms) then call gstats(6,0) - call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc(1:1)) - call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset(1:nflevg)) - call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc(1:1), kresol=nresol1) + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset(1:nflevg), kresol=nresol1) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset(1:nflevg), kresol=nresol1) if (nfld > 0) then - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset(1:nflevg), kresol=nresol1) endif ! Surface pressure @@ -754,12 +1346,12 @@ program ectrans_benchmark write(nout,'(" ")') if (lprint_norms .or. ncheck > 0) then - call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset) - call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset) + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset, kresol=nresol1) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset, kresol=nresol1) if (nfld > 0) then - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset) + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset, kresol=nresol1) endif - call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc) + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc, kresol=nresol1) if (myproc == 1) then zmaxerr(:) = -999.0 @@ -954,6 +1546,12 @@ program ectrans_benchmark & kcall=1) endif +call trans_release(nresol1) +call trans_release(nresol2) + +call trans_end + + !=================================================================================================== ! Finalize MPI !=================================================================================================== @@ -1326,15 +1924,15 @@ subroutine initialize_2d_spectral_field(nsmax, field) field(:) = 0.0 ! Get zonal wavenumbers this rank is responsible for - call trans_inq(knump=num_my_zon_wns) + call trans_inq(knump=num_my_zon_wns,kresol=nresol1) allocate(my_zon_wns(num_my_zon_wns)) - call trans_inq(kmyms=my_zon_wns) + call trans_inq(kmyms=my_zon_wns,kresol=nresol1) ! If rank is responsible for the chosen zonal wavenumber... if (any(my_zon_wns == m_num) ) then ! Get array of spectral array addresses (this maps (m, n=m) to array index) allocate(nasm0(0:nsmax)) - call trans_inq(kasm0=nasm0) + call trans_inq(kasm0=nasm0,kresol=nresol1) ! Find out local array index of chosen spherical harmonic index = nasm0(m_num) + 2 * (l_num - m_num) + 1 From 91b27591bb40e77771407a2a7428a57ad2791a3a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Wed, 16 Oct 2024 23:28:23 -0700 Subject: [PATCH 04/10] Revert "modify ectrans-benchmark temporarily" This reverts commit 3cde5bc19a797a92a22874d726fe35056da7c533. --- src/programs/ectrans-benchmark.F90 | 644 ++--------------------------- 1 file changed, 23 insertions(+), 621 deletions(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index cc79c3c0f..f9a12a8b3 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -200,7 +200,6 @@ program ectrans_benchmark integer(kind=jpim) :: jend_uder_EW = 0 integer(kind=jpim) :: jbegin_vder_EW = 0 integer(kind=jpim) :: jend_vder_EW = 0 -integer(kind=jpim) :: nresol1, nresol2 logical :: ldump_values = .false. @@ -217,8 +216,6 @@ program ectrans_benchmark #include "setup_trans.h" #include "inv_trans.h" #include "dir_trans.h" -#include "trans_release.h" -#include "trans_end.h" #include "trans_inq.h" #include "specnorm.h" #include "abor1.intfb.h" @@ -400,601 +397,12 @@ program ectrans_benchmark call set_ectrans_gpu_nflev(nflevl) ! We pass nflevl via environment variable in order not to change API ! In long run, ectrans should grow its internal buffers automatically -call setup_trans(ksmax=24, kdgl=38, kloen=nloen, ldsplit=.true., & - & lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & - & lduseflt=luseflt,kresol=nresol2) -call setup_trans(ksmax=nsmax, kdgl=ndgl, kloen=nloen, ldsplit=.true., & - & lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & - & lduseflt=luseflt,kresol=nresol1) -call gstats(2, 1) - -call trans_inq(kspec2=nspec2, kspec2g=nspec2g, kgptot=ngptot, kgptotg=ngptotg,kresol=nresol1) - -if (nproma == 0) then ! no blocking (default when not specified) - nproma = ngptot -endif - -! Calculate number of NPROMA blocks -ngpblks = (ngptot - 1)/nproma+1 - -!=================================================================================================== -! Print information before starting -!=================================================================================================== - -! Print configuration details -if (verbosity >= 0 .and. myproc == 1) then - write(nout,'(" ")') - write(nout,'(a)')'======= Start of runtime parameters =======' - write(nout,'(" ")') - write(nout,'("nsmax ",i0)') nsmax - write(nout,'("grid ",a)') trim(cgrid) - write(nout,'("ndgl ",i0)') ndgl - write(nout,'("nproc ",i0)') nproc - write(nout,'("nthread ",i0)') nthread - write(nout,'("nprgpns ",i0)') nprgpns - write(nout,'("nprgpew ",i0)') nprgpew - write(nout,'("nprtrw ",i0)') nprtrw - write(nout,'("nprtrv ",i0)') nprtrv - write(nout,'("ngptot ",i0)') ngptot - write(nout,'("ngptotg ",i0)') ngptotg - write(nout,'("nfld ",i0)') nfld - write(nout,'("nlev ",i0)') nlev - write(nout,'("nproma ",i0)') nproma - write(nout,'("ngpblks ",i0)') ngpblks - write(nout,'("nspec2 ",i0)') nspec2 - write(nout,'("nspec2g ",i0)') nspec2g - write(nout,'("luseflt ",l1)') luseflt - write(nout,'("nopt_mem_tr",i0)') nopt_mem_tr - write(nout,'("lvordiv ",l1)') lvordiv - write(nout,'("lscders ",l1)') lscders - write(nout,'("luvders ",l1)') luvders - write(nout,'(" ")') - write(nout,'(a)') '======= End of runtime parameters =======' - write(nout,'(" ")') -end if - -!=================================================================================================== -! Allocate and Initialize spectral arrays -!=================================================================================================== - -! Allocate spectral arrays -! Try to mimick IFS layout as much as possible -nullify(zspvor) -nullify(zspdiv) -nullify(zspsc3a) -allocate(sp3d(nflevl,nspec2,2+nfld)) -allocate(zspsc2(1,nspec2)) - -call initialize_spectral_arrays(nsmax, zspsc2, sp3d) - -! Point convenience variables to storage variable sp3d -zspvor => sp3d(:,:,1) -zspdiv => sp3d(:,:,2) -zspsc3a => sp3d(:,:,3:3+(nfld-1)) - -!=================================================================================================== -! Allocate gridpoint arrays -!=================================================================================================== - -allocate(ivset(nflevg)) - -! Compute spectral distribution -ilev = 0 -do jb = 1, nprtrv - do jlev=1, numll(jb) - ilev = ilev + 1 - ivset(ilev) = jb - enddo -enddo - -! Allocate grid-point arrays -if (lvordiv) then - jbegin_uv = 1 - jend_uv = 2 -endif -if (luvders) then - jbegin_uder_EW = jend_uv + 1 - jend_uder_EW = jbegin_uder_EW + 1 - jbegin_vder_EW = jend_uder_EW + 1 - jend_vder_EW = jbegin_vder_EW + 1 -else - jbegin_uder_EW = jend_uv - jend_uder_EW = jend_uv - jbegin_vder_EW = jend_uv - jend_vder_EW = jend_uv -endif - -jbegin_sc = jend_vder_EW + 1 -jend_sc = jend_vder_EW + nfld - -if (lscders) then - ndimgmvs = 3 - jbegin_scder_NS = jend_sc + 1 - jend_scder_NS = jend_sc + nfld - jbegin_scder_EW = jend_scder_NS + 1 - jend_scder_EW = jend_scder_NS + nfld -else - ndimgmvs = 1 - jbegin_scder_NS = jend_sc - jend_scder_NS = jend_sc - jbegin_scder_EW = jend_sc - jend_scder_EW = jend_sc -endif - -ndimgmv = jend_scder_EW - -allocate(zgmv(nproma,nflevg,ndimgmv,ngpblks)) -allocate(zgmvs(nproma,ndimgmvs,ngpblks)) - -zgpuv => zgmv(:,:,1:jend_vder_EW,:) -zgp3a => zgmv(:,:,jbegin_sc:jend_scder_EW,:) -zgp2 => zgmvs(:,:,:) - -!=================================================================================================== -! Allocate norm arrays -!=================================================================================================== - -if (lprint_norms .or. ncheck > 0) then - allocate(znormsp(1)) - allocate(znormsp1(1)) - allocate(znormvor(nflevg)) - allocate(znormvor1(nflevg)) - allocate(znormdiv(nflevg)) - allocate(znormdiv1(nflevg)) - allocate(znormt(nflevg)) - allocate(znormt1(nflevg)) - - call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor1, kvset=ivset(1:nflevg), kresol=nresol1) - call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv1, kvset=ivset(1:nflevg), kresol=nresol1) - if (nfld > 0) then - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt1, kvset=ivset(1:nflevg), kresol=nresol1) - endif - call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp1, kvset=ivsetsc, kresol=nresol1) - - if (verbosity >= 1 .and. myproc == 1) then - do ifld = 1, nflevg - write(nout,'("norm zspvor( ",i4,",:) = ",f20.15)') ifld, znormvor1(ifld) - write(nout,'("0x",Z16.16)') znormvor1(ifld) - enddo - do ifld = 1, nflevg - write(nout,'("norm zspdiv( ",i4,",:) = ",f20.15)') ifld, znormdiv1(ifld) - write(nout,'("0x",Z16.16)') znormdiv1(ifld) - enddo - if (nfld > 0) then - do ifld = 1, nflevg - write(nout,'("norm zspsc3a(",i4,",:,1) = ",f20.15)') ifld, znormt1(ifld) - write(nout,'("0x",Z16.16)') znormt1(ifld) - enddo - endif - do ifld = 1, 1 - write(nout,'("norm zspsc2( ",i4,",:) = ",f20.15)') ifld, znormsp1(ifld) - write(nout,'("0x",Z16.16)') znormsp1(ifld) - enddo - endif -endif - -!=================================================================================================== -! Setup timers -!=================================================================================================== - -ztinit = (timef() - ztinit)/1000.0_jprd - -if (verbosity >= 0 .and. myproc == 1) then - write(nout,'(" ")') - write(nout,'(a,i6,a,f9.2,a)') "ectrans_benchmark initialisation, on",nproc,& - & " tasks, took",ztinit," sec" - write(nout,'(" ")') -endif - -if (iters <= 0) call abor1('ectrans_benchmark:iters <= 0') - -allocate(ztstep(iters+2)) -allocate(ztstep1(iters+2)) -allocate(ztstep2(iters+2)) - -ztstepavg = 0._jprd -ztstepmax = 0._jprd -ztstepmin = 9999999999999999._jprd -ztstepavg1 = 0._jprd -ztstepmax1 = 0._jprd -ztstepmin1 = 9999999999999999._jprd -ztstepavg2 = 0._jprd -ztstepmax2 = 0._jprd -ztstepmin2 = 9999999999999999._jprd - -if (verbosity >= 1 .and. myproc == 1) then - write(nout,'(a)') '======= Start of spectral transforms =======' - write(nout,'(" ")') -endif - -ztloop = timef() - -!=================================================================================================== -! Do spectral transform loop -!=================================================================================================== - -gstats_lstats = .false. - -write(nout,'(a,i5,a)') 'Running for ', iters, ' iterations with 2 extra warm-up iterations' -write(nout,'(" ")') - -do jstep = 1, iters+2 - if (jstep == 3) gstats_lstats = .true. - - call gstats(3,0) - ztstep(jstep) = timef() - - !================================================================================================= - ! Do inverse transform - !================================================================================================= - - ztstep1(jstep) = timef() - call gstats(4,0) - if (lvordiv) then - call inv_trans(kresol=nresol1, kproma=nproma, & - & pspsc2=zspsc2, & ! spectral surface pressure - & pspvor=zspvor, & ! spectral vorticity - & pspdiv=zspdiv, & ! spectral divergence - & pspsc3a=zspsc3a, & ! spectral scalars - & ldscders=lscders, & - & ldvorgp=.false., & ! no gridpoint vorticity - & lddivgp=.false., & ! no gridpoint divergence - & lduvder=luvders, & - & kvsetuv=ivset, & - & kvsetsc2=ivsetsc, & - & kvsetsc3a=ivset, & - & pgp2=zgp2, & - & pgpuv=zgpuv, & - & pgp3a=zgp3a) - else - call inv_trans(kresol=nresol1, kproma=nproma, & - & pspsc2=zspsc2, & ! spectral surface pressure - & pspsc3a=zspsc3a, & ! spectral scalars - & ldscders=lscders, & ! scalar derivatives - & kvsetsc2=ivsetsc, & - & kvsetsc3a=ivset, & - & pgp2=zgp2, & - & pgp3a=zgp3a) - endif - call gstats(4,1) - - ztstep1(jstep) = (timef() - ztstep1(jstep))/1000.0_jprd - - !================================================================================================= - ! While in grid point space, dump the values to disk, for debugging only - !================================================================================================= - - if (ldump_values) then - ! dump a field to a binary file - call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgp2(:,1,:), 'S', noutdump) - call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgpuv(:,nflevg,1,:), 'U', noutdump) - call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgpuv(:,nflevg,2,:), 'V', noutdump) - call dump_gridpoint_field(jstep, myproc, nproma, ngpblks, zgp3a(:,nflevg,1,:), 'T', noutdump) - endif - - !================================================================================================= - ! Do direct transform - !================================================================================================= - - ztstep2(jstep) = timef() - - call gstats(5,0) - if (lvordiv) then - call dir_trans(kresol=nresol1, kproma=nproma, & - & pgp2=zgmvs(:,1:1,:), & - & pgpuv=zgpuv(:,:,1:2,:), & - & pgp3a=zgp3a(:,:,1:nfld,:), & - & pspvor=zspvor, & - & pspdiv=zspdiv, & - & pspsc2=zspsc2, & - & pspsc3a=zspsc3a, & - & kvsetuv=ivset, & - & kvsetsc2=ivsetsc, & - & kvsetsc3a=ivset) - else - call dir_trans(kresol=nresol1, kproma=nproma, & - & pgp2=zgmvs(:,1:1,:), & - & pgp3a=zgp3a(:,:,1:nfld,:), & - & pspsc2=zspsc2, & - & pspsc3a=zspsc3a, & - & kvsetsc2=ivsetsc, & - & kvsetsc3a=ivset) - endif - call gstats(5,1) - ztstep2(jstep) = (timef() - ztstep2(jstep))/1000.0_jprd - - !================================================================================================= - ! Calculate timings - !================================================================================================= - - ztstep(jstep) = (timef() - ztstep(jstep))/1000.0_jprd - - ztstepavg = ztstepavg + ztstep(jstep) - ztstepmin = min(ztstep(jstep), ztstepmin) - ztstepmax = max(ztstep(jstep), ztstepmax) - - ztstepavg1 = ztstepavg1 + ztstep1(jstep) - ztstepmin1 = min(ztstep1(jstep), ztstepmin1) - ztstepmax1 = max(ztstep1(jstep), ztstepmax1) - - ztstepavg2 = ztstepavg2 + ztstep2(jstep) - ztstepmin2 = min(ztstep2(jstep), ztstepmin2) - ztstepmax2 = max(ztstep2(jstep), ztstepmax2) - - !================================================================================================= - ! Print norms - !================================================================================================= - - if (lprint_norms) then - call gstats(6,0) - call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc(1:1), kresol=nresol1) - call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset(1:nflevg), kresol=nresol1) - call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset(1:nflevg), kresol=nresol1) - if (nfld > 0) then - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset(1:nflevg), kresol=nresol1) - endif - - ! Surface pressure - if (myproc == 1) then - zmaxerr(:) = -999.0 - do ifld = 1, 1 - write(nout,*) "znormsp", znormsp - flush(nout) - zerr(1) = abs(znormsp1(ifld)/znormsp(ifld) - 1.0_jprb) - zmaxerr(1) = max(zmaxerr(1), zerr(1)) - enddo - ! Divergence - do ifld = 1, nflevg - zerr(2) = abs(znormdiv1(ifld)/znormdiv(ifld) - 1.0_jprb) - zmaxerr(2) = max(zmaxerr(2), zerr(2)) - enddo - ! Vorticity - do ifld = 1, nflevg - zerr(3) = abs(znormvor1(ifld)/znormvor(ifld) - 1.0_jprb) - zmaxerr(3) = max(zmaxerr(3),zerr(3)) - enddo - ! Temperature - if (nfld > 0) then - do ifld = 1, nflevg - zerr(4) = abs(znormt1(ifld)/znormt(ifld) - 1.0_jprb) - zmaxerr(4) = max(zmaxerr(4), zerr(4)) - enddo - write(nout,'("time step ",i6," took", f8.4," | zspvor max err="e10.3,& - & " | zspdiv max err="e10.3," | zspsc3a max err="e10.3," | zspsc2 max err="e10.3)') & - & jstep, ztstep(jstep), zmaxerr(3), zmaxerr(2), zmaxerr(4), zmaxerr(1) - else - write(nout,'("time step ",i6," took", f8.4," | zspvor max err="e10.3,& - & " | zspdiv max err="e10.3," | zspsc2 max err="e10.3)') & - & jstep, ztstep(jstep), zmaxerr(3), zmaxerr(2), zmaxerr(1) - endif - endif - call gstats(6,1) - else - write(nout,'("Time step ",i6," took", f8.4)') jstep, ztstep(jstep) - endif - call gstats(3,1) -enddo - -!=================================================================================================== - -ztloop = (timef() - ztloop)/1000.0_jprd - -write(nout,'(" ")') -write(nout,'(a)') '======= End of spectral transforms =======' -write(nout,'(" ")') - -if (lprint_norms .or. ncheck > 0) then - call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset, kresol=nresol1) - call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset, kresol=nresol1) - if (nfld > 0) then - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset, kresol=nresol1) - endif - call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc, kresol=nresol1) - - if (myproc == 1) then - zmaxerr(:) = -999.0 - do ifld = 1, nflevg - zerr(3) = abs(real(znormvor1(ifld),kind=jprd)/real(znormvor(ifld),kind=jprd) - 1.0_jprd) - zmaxerr(3) = max(zmaxerr(3), zerr(3)) - if (verbosity >= 1) then - write(nout,'("norm zspvor( ",i4,") = ",f20.15," error = ",e10.3)') ifld, znormvor(ifld), zerr(3) - write(nout,'("0x",Z16.16)') znormvor(ifld) - endif - enddo - do ifld = 1, nflevg - zerr(2) = abs(real(znormdiv1(ifld),kind=jprd)/real(znormdiv(ifld),kind=jprd) - 1.0d0) - zmaxerr(2) = max(zmaxerr(2),zerr(2)) - if (verbosity >= 1) then - write(nout,'("norm zspdiv( ",i4,",:) = ",f20.15," error = ",e10.3)') ifld, znormdiv(ifld), zerr(2) - write(nout,'("0x",Z16.16)') znormdiv(ifld) - endif - enddo - if (nfld > 0) then - do ifld = 1, nflevg - zerr(4) = abs(real(znormt1(ifld),kind=jprd)/real(znormt(ifld),kind=jprd) - 1.0d0) - zmaxerr(4) = max(zmaxerr(4), zerr(4)) - if (verbosity >= 1) then - write(nout,'("norm zspsc3a(",i4,",:,1) = ",f20.15," error = ",e10.3)') ifld, znormt(ifld), zerr(4) - write(nout,'("0x",Z16.16)') znormt(ifld) - endif - enddo - endif - do ifld = 1, 1 - zerr(1) = abs(real(znormsp1(ifld),kind=jprd)/real(znormsp(ifld),kind=jprd) - 1.0d0) - zmaxerr(1) = max(zmaxerr(1), zerr(1)) - if (verbosity >= 1) then - write(nout,'("norm zspsc2( ",i4,",:) = ",f20.15," error = ",e10.3)') ifld, znormsp(ifld), zerr(1) - write(nout,'("0x",Z16.16)') znormsp(ifld) - endif - enddo - - ! maximum error across all fields - if (nfld > 0) then - zmaxerrg = max(zmaxerr(1), zmaxerr(2), zmaxerr(3), zmaxerr(4)) - else - zmaxerrg = max(zmaxerr(1), zmaxerr(2), zmaxerr(3)) - endif - - if (verbosity >= 1) write(nout,*) - write(nout,'("max error zspvor(1:nlev,:) = ",e10.3)') zmaxerr(3) - write(nout,'("max error zspdiv(1:nlev,:) = ",e10.3)') zmaxerr(2) - if (nfld > 0) write(nout,'("max error zspsc3a(1:nlev,:,1) = ",e10.3)') zmaxerr(4) - write(nout,'("max error zspsc2(1:1,:) = ",e10.3)') zmaxerr(1) - write(nout,*) - write(nout,'("max error combined = = ",e10.3)') zmaxerrg - write(nout,*) - endif - if (ncheck > 0) then - ierr = 0 - if (myproc == 1) then - ! If the maximum spectral norm error across all fields is greater than 100 times the machine - ! epsilon, fail the test - if (zmaxerrg > real(ncheck, jprb) * epsilon(1.0_jprb)) then - write(nout, '(a)') '*******************************' - write(nout, '(a)') 'Correctness test failed' - write(nout, '(a,1e9.2)') 'Maximum spectral norm error = ', zmaxerrg - write(nout, '(a,1e9.2)') 'Error tolerance = ', real(ncheck, jprb) * epsilon(1.0_jprb) - write(nout, '(a)') '*******************************' - ierr = 1 - endif - endif - - ! Root rank broadcasts the correctness checker result to the other ranks - if (luse_mpi) then - call mpl_broadcast(ierr,kroot=1,ktag=1) - endif - - ! Halt if correctness checker failed - if (ierr == 1) then - error stop - endif - endif -endif - -if (luse_mpi) then - call mpl_allreduce(ztloop, 'sum', ldreprod=.false.) - call mpl_allreduce(ztstep, 'sum', ldreprod=.false.) - call mpl_allreduce(ztstepavg, 'sum', ldreprod=.false.) - call mpl_allreduce(ztstepmax, 'max', ldreprod=.false.) - call mpl_allreduce(ztstepmin, 'min', ldreprod=.false.) - - call mpl_allreduce(ztstep1, 'sum', ldreprod=.false.) - call mpl_allreduce(ztstepavg1, 'sum', ldreprod=.false.) - call mpl_allreduce(ztstepmax1, 'max', ldreprod=.false.) - call mpl_allreduce(ztstepmin1, 'min', ldreprod=.false.) - - call mpl_allreduce(ztstep2, 'sum', ldreprod=.false.) - call mpl_allreduce(ztstepavg2, 'sum', ldreprod=.false.) - call mpl_allreduce(ztstepmax2, 'max', ldreprod=.false.) - call mpl_allreduce(ztstepmin2, 'min', ldreprod=.false.) -endif - -ztstepavg = (ztstepavg/real(nproc,jprb))/real(iters,jprd) -ztloop = ztloop/real(nproc,jprd) -ztstep(:) = ztstep(:)/real(nproc,jprd) -ztstepmed = get_median(ztstep) - -ztstepavg1 = (ztstepavg1/real(nproc,jprb))/real(iters,jprd) -ztstep1(:) = ztstep1(:)/real(nproc,jprd) -ztstepmed1 = get_median(ztstep1) - -ztstepavg2 = (ztstepavg2/real(nproc,jprb))/real(iters,jprd) -ztstep2(:) = ztstep2(:)/real(nproc,jprd) -ztstepmed2 = get_median(ztstep2) - -write(nout,'(a)') '======= Start of time step stats =======' -write(nout,'(" ")') -write(nout,'("Inverse transforms")') -write(nout,'("------------------")') -write(nout,'("avg (s): ",f8.4)') ztstepavg1 -write(nout,'("min (s): ",f8.4)') ztstepmin1 -write(nout,'("max (s): ",f8.4)') ztstepmax1 -write(nout,'("med (s): ",f8.4)') ztstepmed1 -write(nout,'(" ")') -write(nout,'("Direct transforms")') -write(nout,'("-----------------")') -write(nout,'("avg (s): ",f8.4)') ztstepavg2 -write(nout,'("min (s): ",f8.4)') ztstepmin2 -write(nout,'("max (s): ",f8.4)') ztstepmax2 -write(nout,'("med (s): ",f8.4)') ztstepmed2 -write(nout,'(" ")') -write(nout,'("Inverse-direct transforms")') -write(nout,'("-------------------------")') -write(nout,'("avg (s): ",f8.4)') ztstepavg -write(nout,'("min (s): ",f8.4)') ztstepmin -write(nout,'("max (s): ",f8.4)') ztstepmax -write(nout,'("med (s): ",f8.4)') ztstepmed -write(nout,'("loop (s): ",f8.4)') ztloop -write(nout,'(" ")') -write(nout,'(a)') '======= End of time step stats =======' -write(nout,'(" ")') - -if (lstack) then - ! Gather stack usage statistics - istack = getstackusage() - if (myproc == 1) then - print 9001, istack - 9001 format("Stack utilisation information",/,& - &"=============================",//,& - &"Task size(bytes)",/,& - &"==== ===========",//,& - &" 1",11x,i10) - - do i = 2, nproc - call mpl_recv(istack, ksource=nprcids(i), ktag=i, cdstring='ectrans_benchmark:') - print '(i4,11x,i10)', i, istack - enddo - else - call mpl_send(istack, kdest=nprcids(1), ktag=myproc, cdstring='ectrans_benchmark:') - endif -endif - - -!=================================================================================================== -! Cleanup -!=================================================================================================== - -deallocate(zgmv) -deallocate(zgmvs) -deallocate(sp3d) -deallocate(zspsc2) -deallocate(ivset) -if (lprint_norms .or. ncheck > 0) then - deallocate(znormsp) - deallocate(znormsp1) - deallocate(znormvor) - deallocate(znormvor1) - deallocate(znormdiv) - deallocate(znormdiv1) - deallocate(znormt) - deallocate(znormt1) -endif -deallocate(ztstep) -deallocate(ztstep1) -deallocate(ztstep2) -deallocate(nloen) - -!=================================================================================================== - -call trans_release(nresol1) -call trans_release(nresol2) -call acc_present_dump - -call trans_end - -call acc_present_dump -cgrid = 'O40' -call parse_grid(cgrid, ndgl, nloen) -nsmax = 39 -call setup_trans0(kout=nout, kerr=nerr, kprintlev=merge(2, 0, verbosity == 1), & - & kmax_resol=nmax_resol, kpromatr=npromatr, kprgpns=nprgpns, kprgpew=nprgpew, & - & kprtrw=nprtrw, ldsync_trans=lsync_trans, & - & ldeq_regions=leq_regions, prad=zra, ldalloperm=.true., ldmpoff=.not.luse_mpi,& - & kopt_memory_tr=nopt_mem_tr) call setup_trans(ksmax=nsmax, kdgl=ndgl, kloen=nloen, ldsplit=.true., & & lduserpnm=luserpnm, ldkeeprpnm=lkeeprpnm, & - & lduseflt=luseflt,kresol=nresol1) -call trans_inq(kspec2=nspec2, kspec2g=nspec2g, kgptot=ngptot, kgptotg=ngptotg,kresol=nresol1) + & lduseflt=luseflt) +call gstats(2, 1) + +call trans_inq(kspec2=nspec2, kspec2g=nspec2g, kgptot=ngptot, kgptotg=ngptotg) if (nproma == 0) then ! no blocking (default when not specified) nproma = ngptot @@ -1130,12 +538,12 @@ program ectrans_benchmark allocate(znormt(nflevg)) allocate(znormt1(nflevg)) - call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor1, kvset=ivset(1:nflevg), kresol=nresol1) - call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv1, kvset=ivset(1:nflevg), kresol=nresol1) + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor1, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv1, kvset=ivset(1:nflevg)) if (nfld > 0) then - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt1, kvset=ivset(1:nflevg), kresol=nresol1) + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt1, kvset=ivset(1:nflevg)) endif - call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp1, kvset=ivsetsc, kresol=nresol1) + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp1, kvset=ivsetsc) if (verbosity >= 1 .and. myproc == 1) then do ifld = 1, nflevg @@ -1211,7 +619,7 @@ program ectrans_benchmark ztstep1(jstep) = timef() call gstats(4,0) if (lvordiv) then - call inv_trans(kresol=nresol1, kproma=nproma, & + call inv_trans(kresol=1, kproma=nproma, & & pspsc2=zspsc2, & ! spectral surface pressure & pspvor=zspvor, & ! spectral vorticity & pspdiv=zspdiv, & ! spectral divergence @@ -1227,7 +635,7 @@ program ectrans_benchmark & pgpuv=zgpuv, & & pgp3a=zgp3a) else - call inv_trans(kresol=nresol1, kproma=nproma, & + call inv_trans(kresol=1, kproma=nproma, & & pspsc2=zspsc2, & ! spectral surface pressure & pspsc3a=zspsc3a, & ! spectral scalars & ldscders=lscders, & ! scalar derivatives @@ -1260,7 +668,7 @@ program ectrans_benchmark call gstats(5,0) if (lvordiv) then - call dir_trans(kresol=nresol1, kproma=nproma, & + call dir_trans(kresol=1, kproma=nproma, & & pgp2=zgmvs(:,1:1,:), & & pgpuv=zgpuv(:,:,1:2,:), & & pgp3a=zgp3a(:,:,1:nfld,:), & @@ -1272,7 +680,7 @@ program ectrans_benchmark & kvsetsc2=ivsetsc, & & kvsetsc3a=ivset) else - call dir_trans(kresol=nresol1, kproma=nproma, & + call dir_trans(kresol=1, kproma=nproma, & & pgp2=zgmvs(:,1:1,:), & & pgp3a=zgp3a(:,:,1:nfld,:), & & pspsc2=zspsc2, & @@ -1291,11 +699,11 @@ program ectrans_benchmark if (lprint_norms) then call gstats(6,0) - call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc(1:1), kresol=nresol1) - call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset(1:nflevg), kresol=nresol1) - call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset(1:nflevg), kresol=nresol1) + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc(1:1)) + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset(1:nflevg)) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset(1:nflevg)) if (nfld > 0) then - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset(1:nflevg), kresol=nresol1) + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset(1:nflevg)) endif ! Surface pressure @@ -1346,12 +754,12 @@ program ectrans_benchmark write(nout,'(" ")') if (lprint_norms .or. ncheck > 0) then - call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset, kresol=nresol1) - call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset, kresol=nresol1) + call specnorm(pspec=zspvor(1:nflevl,:), pnorm=znormvor, kvset=ivset) + call specnorm(pspec=zspdiv(1:nflevl,:), pnorm=znormdiv, kvset=ivset) if (nfld > 0) then - call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset, kresol=nresol1) + call specnorm(pspec=zspsc3a(1:nflevl,:,1), pnorm=znormt, kvset=ivset) endif - call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc, kresol=nresol1) + call specnorm(pspec=zspsc2(1:1,:), pnorm=znormsp, kvset=ivsetsc) if (myproc == 1) then zmaxerr(:) = -999.0 @@ -1546,12 +954,6 @@ program ectrans_benchmark & kcall=1) endif -call trans_release(nresol1) -call trans_release(nresol2) - -call trans_end - - !=================================================================================================== ! Finalize MPI !=================================================================================================== @@ -1924,15 +1326,15 @@ subroutine initialize_2d_spectral_field(nsmax, field) field(:) = 0.0 ! Get zonal wavenumbers this rank is responsible for - call trans_inq(knump=num_my_zon_wns,kresol=nresol1) + call trans_inq(knump=num_my_zon_wns) allocate(my_zon_wns(num_my_zon_wns)) - call trans_inq(kmyms=my_zon_wns,kresol=nresol1) + call trans_inq(kmyms=my_zon_wns) ! If rank is responsible for the chosen zonal wavenumber... if (any(my_zon_wns == m_num) ) then ! Get array of spectral array addresses (this maps (m, n=m) to array index) allocate(nasm0(0:nsmax)) - call trans_inq(kasm0=nasm0,kresol=nresol1) + call trans_inq(kasm0=nasm0) ! Find out local array index of chosen spherical harmonic index = nasm0(m_num) + 2 * (l_num - m_num) + 1 From 9be2c8913710a05a7795b5a27fb2eafac1a11e93 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 17 Oct 2024 08:40:41 -0700 Subject: [PATCH 05/10] including sams review --- src/trans/gpu/external/gpnorm_trans_gpu.F90 | 6 +++--- src/trans/gpu/internal/cdmap_mod.F90 | 2 +- src/trans/gpu/internal/fsc_mod.F90 | 4 ++-- src/trans/gpu/internal/spnsde_mod.F90 | 7 +++---- src/trans/gpu/internal/trmtol_pack_unpack.F90 | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/trans/gpu/external/gpnorm_trans_gpu.F90 b/src/trans/gpu/external/gpnorm_trans_gpu.F90 index 36f87875d..e6b280ca0 100755 --- a/src/trans/gpu/external/gpnorm_trans_gpu.F90 +++ b/src/trans/gpu/external/gpnorm_trans_gpu.F90 @@ -197,14 +197,14 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) IBEG=1 IEND=D%NDGL_FS -ASSOCIATE(D_NSTAGTF=>D%NSTAGTF,D_NPTRLS=>D%NPTRLS,G_NLOEN=>G%NLOEN) +ASSOCIATE(D_NSTAGTF=>D%NSTAGTF,D_NPTRLS=>D%NPTRLS,G_NLOEN=>G%NLOEN,F_RW=>F%RW) CALL GSTATS(1429,0) IF( IF_FS > 0 )THEN #ifdef ACCGPU !$ACC DATA & - !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN) & + !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN,F,F_RW) & !$ACC& PRESENT(ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN) #endif #ifdef OMPGPU @@ -274,7 +274,7 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) DO JGL=IBEG,IEND IGL = D_NPTRLS(MYSETW) + JGL - 1 DO JF=1,IF_FS - ZAVE(JF,JGL)=ZAVE(JF,JGL)*F%RW(IGL)/G_NLOEN(IGL) + ZAVE(JF,JGL)=ZAVE(JF,JGL)*F_RW(IGL)/G_NLOEN(IGL) !write(iunit,*) 'aver inside ',JF,IF_FS,IGL,ZAVE(JF,JGL), F%RW(IGL), G_NLOEN(IGL),ZMINGPN(JF),ZMAXGPN(JF) ENDDO ENDDO diff --git a/src/trans/gpu/internal/cdmap_mod.F90 b/src/trans/gpu/internal/cdmap_mod.F90 index 2c1210632..98aac868a 100755 --- a/src/trans/gpu/internal/cdmap_mod.F90 +++ b/src/trans/gpu/internal/cdmap_mod.F90 @@ -81,7 +81,7 @@ SUBROUTINE CDMAP(KM,KMLOC,KSL,KSLO,PEPSNM, KDIR, KDGNH, KDGNHD,& ! ------------------------------------------------------------------ -CALL MPL_ABORT("this routine is not supported right now") +CALL MPL_ABORT("CDMAP not yet supported in ecTrans GPU version") !* 1. PERFORM LEGENDRE TRANFORM. ! -------------------------- diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 949cb303f..07dc1c4bc 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -190,7 +190,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_UV,IOFF_UV_EWDER,RET_REAL,RET_COMPLEX,ZACHTE2,JM,JF,KGL) & - !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_EWDER_OFFSET,KUV_OFFSET,KF_FS) ASYNC(1) + !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_EWDER_OFFSET,KUV_OFFSET,KF_FS,ILOEN_MAX) ASYNC(1) #endif DO KGL=IBEG,IEND,IINC DO JF=1,2*KF_UV @@ -231,7 +231,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2,RET_REAL,RET_COMPLEX) & - !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,KF_SCALARS,OFFSET_VAR,KSCALARS_EWDER_OFFSET,KSCALARS_OFFSET,KF_FS) ASYNC(1) + !$ACC& FIRSTPRIVATE(IBEG,IEND,IINC,KF_SCALARS,OFFSET_VAR,KSCALARS_EWDER_OFFSET,KSCALARS_OFFSET,KF_FS,ILOEN_MAX) ASYNC(1) #endif DO KGL=IBEG,IEND,IINC DO JF=1,KF_SCALARS diff --git a/src/trans/gpu/internal/spnsde_mod.F90 b/src/trans/gpu/internal/spnsde_mod.F90 index eb0ae2470..002c93763 100755 --- a/src/trans/gpu/internal/spnsde_mod.F90 +++ b/src/trans/gpu/internal/spnsde_mod.F90 @@ -16,7 +16,6 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT USE TPM_DIM, ONLY: R USE TPM_DISTR, ONLY: D -USE TPM_FIELDS_GPU, ONLY: FG !**** *SPNSDE* - Compute North-South derivative in spectral space @@ -82,7 +81,7 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) ! LOCAL INTEGER SCALARS INTEGER(KIND=JPIM) :: IJ, ISKIP, J, JN, JI, IR, II -ASSOCIATE(D_NUMP=>D%NUMP, R_NTMAX=>R%NTMAX, D_MYMS=>D%MYMS, ZEPSNM=>FG%ZEPSNM) +ASSOCIATE(D_NUMP=>D%NUMP, R_NTMAX=>R%NTMAX, D_MYMS=>D%MYMS) #ifdef ACCGPU !$ACC DATA & @@ -91,7 +90,7 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) #endif #ifdef OMPGPU !$OMP TARGET DATA & -!$OMP& MAP(PRESENT,ALLOC:ZN) +!$OMP& MAP(PRESENT,PEPSNM,ALLOC:ZN) #endif ! ------------------------------------------------------------------ @@ -105,7 +104,7 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) #ifdef OMPGPU !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO !! DEFAULT(NONE) PRIVATE(IJ) & - !!$OMP& SHARED(KM,ZN,ZEPSNM,KMLOC) + !!$OMP& SHARED(KM,ZN,KMLOC) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IR,II,JI) & diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index 00d4b6408..82cc907a4 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -263,7 +263,7 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN #endif #ifdef ACCGPU !$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,RET_REAL,RET_COMPLEX) FIRSTPRIVATE(KF_CURRENT,& -!$ACC& KF_TOTAL,OFFSET_VAR) DEFAULT(NONE) & +!$ACC& KF_TOTAL,OFFSET_VAR,ILOEN_MAX) DEFAULT(NONE) & !$ACC& ASYNC(1) TILE(32,16,1) #endif DO KGL=1,D_NDGL_FS From c9d0e9b21c65bc9af513a2f8ae8fb463a15af00e Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Thu, 17 Oct 2024 09:31:13 -0700 Subject: [PATCH 06/10] reset d/f/fg/r/g after s because deallocation of s can use d in some cases --- src/trans/gpu/internal/dealloc_resol_mod.F90 | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/trans/gpu/internal/dealloc_resol_mod.F90 b/src/trans/gpu/internal/dealloc_resol_mod.F90 index c59c5c3f5..1d10cc96a 100755 --- a/src/trans/gpu/internal/dealloc_resol_mod.F90 +++ b/src/trans/gpu/internal/dealloc_resol_mod.F90 @@ -91,13 +91,6 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) #ifdef OMPGPU #endif - ! Empty all fields (none of them has pointers; allocatable arrays implicitly deallocate) - D = D_ - F = F_ - FG = FG_ - R = R_ - G = G_ - ! TPM_FLD is more complex because it has pointers IF( ALLOCATED(S%FA) ) THEN DO JMLOC=1,D%NUMP,NPRTRV ! +++++++++++++++++++++ JMLOC LOOP ++++++++++ @@ -123,6 +116,13 @@ SUBROUTINE DEALLOC_RESOL(KRESOL) ENDIF S = S_ + ! Empty all fields (none of them has pointers; allocatable arrays implicitly deallocate) + D = D_ + F = F_ + FG = FG_ + R = R_ + G = G_ + CALL CLEAN_FFT(KRESOL) CALL CLEAN_GEMM(KRESOL) From 2e299cfeeccf1a587addaac42a2c0ecdbb4d1df2 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Wed, 30 Oct 2024 01:58:46 -0700 Subject: [PATCH 07/10] most of the review comments addressed --- src/trans/gpu/algor/growing_allocator_mod.F90 | 2 +- src/trans/gpu/algor/hicblas_mod.F90 | 1 - src/trans/gpu/algor/hicfft.hip.cpp | 2 +- src/trans/gpu/external/setup_trans.F90 | 10 +++++----- src/trans/gpu/internal/ledir_mod.F90 | 2 +- src/trans/gpu/internal/leinv_mod.F90 | 4 ++-- src/trans/gpu/internal/prfi1b_mod.F90 | 2 +- src/trans/gpu/internal/spnsde_mod.F90 | 2 +- src/trans/gpu/internal/trltom_pack_unpack.F90 | 6 +++--- src/trans/gpu/internal/trmtol_pack_unpack.F90 | 4 ++-- src/trans/gpu/internal/updspb_mod.F90 | 2 +- src/trans/gpu/internal/uvtvd_mod.F90 | 2 +- src/trans/gpu/internal/vdtuv_mod.F90 | 2 +- 13 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/trans/gpu/algor/growing_allocator_mod.F90 b/src/trans/gpu/algor/growing_allocator_mod.F90 index 7e7919b47..db869e480 100644 --- a/src/trans/gpu/algor/growing_allocator_mod.F90 +++ b/src/trans/gpu/algor/growing_allocator_mod.F90 @@ -88,7 +88,7 @@ SUBROUTINE DESTROY_GROWING_ALLOCATOR(ALLOC) IMPLICIT NONE TYPE(GROWING_ALLOCATION_TYPE) :: ALLOC INTEGER :: I - IF (ALLOCATED(ALLOC%PTR)) THEN + IF (ASSOCIATED(ALLOC%PTR)) THEN DO I = 1, ALLOC%FREE_FUNCS_SZ CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%PTR, & SIZE(ALLOC%PTR, 1, C_SIZE_T)) diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index fecf93243..5341cff27 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -64,7 +64,6 @@ SUBROUTINE HIP_SGEMM_BATCHED( & END SUBROUTINE HIP_SGEMM_BATCHED END INTERFACE INTERFACE - PUBLIC CLEAN_GEMM SUBROUTINE CLEAN_GEMM(RESOL_ID) BIND(C, NAME="clean_gemm") USE ISO_C_BINDING INTEGER(KIND=C_INT), INTENT(IN), VALUE :: RESOL_ID diff --git a/src/trans/gpu/algor/hicfft.hip.cpp b/src/trans/gpu/algor/hicfft.hip.cpp index a1329372c..488610d72 100644 --- a/src/trans/gpu/algor/hicfft.hip.cpp +++ b/src/trans/gpu/algor/hicfft.hip.cpp @@ -60,7 +60,7 @@ template class hicfft_plan { hicfft_plan(hipfftHandle handle_, int offset_) : handle_ptr(new hipfftHandle{handle_}, [](auto ptr) { - fftSafeCall(cufftDestroy(*ptr)); + fftSafeCall(hipfftDestroy(*ptr)); delete ptr; }), offset(offset_) {} diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 591ef25ab..84d60f29b 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -506,11 +506,11 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& #ifdef OMPGPU WRITE(NOUT,*) 'Using OpenMP offloading' #endif -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', SIZEOF(FG%ZAS) -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', SIZEOF(FG%ZAA) -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', SIZEOF(FG%ZAS0) -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', SIZEOF(FG%ZAA0) -WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', SIZEOF(FG%ZEPSNM) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS', C_SIZEOF(FG%ZAS(1,1,1))*SIZE(FG%ZAS) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA', C_SIZEOF(FG%ZAA(1,1,1))*SIZE(FG%ZAA) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAS0', C_SIZEOF(FG%ZAS0(1,1))*SIZE(FG%ZAS0) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZAA0', C_SIZEOF(FG%ZAA0(1,1))*SIZE(FG%ZAA0) +WRITE(NOUT,'(A10,":",I9,"B")') 'FG%ZEPSNM', C_SIZEOF(FG%ZEPSNM(1,1))*SIZE(FG%ZEPSNM) IF (IMLOC0(1) > 0) THEN #ifdef ACCGPU diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 110f00dc3..0c1ee9e3e 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -158,7 +158,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) #ifdef ACCGPU !$ACC DATA & !$ACC& PRESENT(ZINPS,ZINPA,ZOUT,ZINPS0,ZINPA0,ZOUT0) & - !$ACC& PRESENT(D_MYMS,D_NUMP,R_NTMAX,R_NSMAX,G_NDGLU) & + !$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NTMAX,R_NSMAX) & !$ACC& PRESENT(ZAA,ZAS,POA1,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2) #endif diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 0ef6b569e..7d7d378cc 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -153,10 +153,10 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) #ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC DATA PRESENT(D_MYMS,D_NUMP,G_NDGLU) & + !$ACC DATA PRESENT(D,D_MYMS,D_NUMP) & !$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & !$ACC& PRESENT(ZAA,ZAS,PIA) & - !$ACC& PRESENT(R_NSMAX,G_NDGLU,D_OFFSETS_GEMM2) + !$ACC& PRESENT(R,R_NSMAX,D_OFFSETS_GEMM2) #endif ! READ 2:NSMAX+3 diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index 23ddd21b8..83b7b8b9a 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -81,7 +81,7 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) #ifdef ACCGPU !$ACC DATA & - !$ACC& PRESENT(D_NUMP,R_NSMAX,D_MYMS,D_NASM0) & + !$ACC& PRESENT(D,D_NUMP,R,R_NSMAX,D_MYMS,D_NASM0) & !$ACC& PRESENT(PIA) & !$ACC& PRESENT(PSPEC) ASYNC(1) #endif diff --git a/src/trans/gpu/internal/spnsde_mod.F90 b/src/trans/gpu/internal/spnsde_mod.F90 index 002c93763..e7c7283b6 100755 --- a/src/trans/gpu/internal/spnsde_mod.F90 +++ b/src/trans/gpu/internal/spnsde_mod.F90 @@ -85,7 +85,7 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) #ifdef ACCGPU !$ACC DATA & -!$ACC& PRESENT (R_NTMAX, D_MYMS) & +!$ACC& PRESENT (R,R_NTMAX, D,D_MYMS) & !$ACC& PRESENT (D_NUMP,PEPSNM, PF, PNSD) ASYNC(1) #endif #ifdef OMPGPU diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index 0a157a991..0a6bb4c28 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -98,7 +98,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) #ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC DATA PRESENT(G_NMEN,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS,G_NLOEN, R_NSMAX) ASYNC(1) + !$ACC DATA PRESENT(G,G_NMEN,D,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS,G_NLOEN, R,R_NSMAX) ASYNC(1) #endif ! scale results and move into next transformation buffer @@ -232,8 +232,8 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP #ifdef ACCGPU !$ACC DATA & !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & - !$ACC& PRESENT(F_RW,F_RACTHE) & - !$ACC& PRESENT(D_MYMS,D_NUMP,R_NDGNH,R_NDGL,G_NDGLU) & + !$ACC& PRESENT(F,F_RW,F_RACTHE) & + !$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NDGNH,R_NDGL,G_NDGLU) & !$ACC& PRESENT(D_NPNTGTB1,D_OFFSETS_GEMM1,FOUBUF) !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) & diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index 82cc907a4..0dd9d3a8b 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -127,7 +127,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I #ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC DATA PRESENT(D_MYMS,D_NPNTGTB1,D_NUMP,G_NDGLU,R_NDGNH,R_NDGL) & + !$ACC DATA PRESENT(D,D_MYMS,D_NPNTGTB1,D_NUMP,G,G_NDGLU,R,R_NDGNH,R_NDGL) & !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,D_OFFSETS_GEMM1) #endif @@ -254,7 +254,7 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN #ifdef OMPGPU #endif #ifdef ACCGPU -!$ACC DATA PRESENT(G_NLOEN,G_NMEN,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS) ASYNC(1) +!$ACC DATA PRESENT(G,G_NLOEN,G_NMEN,D,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS) ASYNC(1) #endif OFFSET_VAR=D_NPTRLS(MYSETW) diff --git a/src/trans/gpu/internal/updspb_mod.F90 b/src/trans/gpu/internal/updspb_mod.F90 index 9daf794e4..938a42fc9 100755 --- a/src/trans/gpu/internal/updspb_mod.F90 +++ b/src/trans/gpu/internal/updspb_mod.F90 @@ -96,7 +96,7 @@ SUBROUTINE UPDSPB(KFIELD,POA,PSPEC,KFLDPTR) !loop over wavenumber #ifdef ACCGPU - !$ACC DATA PRESENT(PSPEC,POA,R_NTMAX,D_NUMP,D_MYMS,D_NASM0) ASYNC(1) + !$ACC DATA PRESENT(PSPEC,POA,R,R_NTMAX,D,D_NUMP,D_MYMS,D_NASM0) ASYNC(1) #endif #ifdef OMPGPU !WARNING: following line should be PRESENT,ALLOC but causes issues with AMD compiler! diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index a920bb533..421ebb16f 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -86,7 +86,7 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) #ifdef ACCGPU !$ACC DATA & -!$ACC& PRESENT(D_MYMS,D_NUMP,R_NTMAX) & +!$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NTMAX) & !$ACC& PRESENT(ZEPSNM,PU,PV,PVOR,PDIV) ASYNC(1) #endif #ifdef OMPGPU diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index eeb687f11..d9e90053c 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -89,7 +89,7 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) #ifdef ACCGPU !$ACC DATA & -!$ACC& PRESENT(R_NTMAX,D_MYMS,D_NUMP,F_RLAPIN) & +!$ACC& PRESENT(R,R_NTMAX,D,D_MYMS,D_NUMP,F,F_RLAPIN) & !$ACC& PRESENT(PEPSNM, PVOR, PDIV) & !$ACC& PRESENT(PU, PV) #endif From 01619b317339ef8a45d33fcd084dd0f9e72aa87a Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Wed, 30 Oct 2024 02:01:14 -0700 Subject: [PATCH 08/10] Move around code to address compilation issue [ 97%] Building Fortran object ectrans/src/programs/CMakeFiles/ectrans-benchmark-gpu-dp.dir/ectrans-benchmark.F90.o Global is external, but doesn't have external or weak linkage! ptr @"str2int$ectrans_benchmark_" Global is external, but doesn't have external or weak linkage! ptr @"print_help$ectrans_benchmark_" LLVM ERROR: Broken module found, compilation aborted! ftn-2116 ftn: INTERNAL --- src/programs/ectrans-benchmark-ifs.F90 | 166 ++++++++++++------------- 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/src/programs/ectrans-benchmark-ifs.F90 b/src/programs/ectrans-benchmark-ifs.F90 index fd9fcc8de..5187553ff 100644 --- a/src/programs/ectrans-benchmark-ifs.F90 +++ b/src/programs/ectrans-benchmark-ifs.F90 @@ -1094,6 +1094,17 @@ subroutine parse_grid(cgrid,ndgl,nloen) !=================================================================================================== +subroutine str2int(str, int, stat) + + character(len=*), intent(in) :: str + integer, intent(out) :: int + integer, intent(out) :: stat + read(str, *, iostat=stat) int + +end subroutine str2int + +!=================================================================================================== + function get_int_value(cname, iarg) result(value) integer :: value @@ -1130,6 +1141,78 @@ function get_str_value(cname, iarg) result(value) !=================================================================================================== +subroutine print_help(unit) + + integer, optional :: unit + integer :: nout = 6 + if (present(unit)) then + nout = unit + endif + + write(nout, "(a)") "" + + if (jprb == jprd) then + write(nout, "(a)") "NAME ectrans-benchmark-dp" + else + write(nout, "(a)") "NAME ectrans-benchmark-sp" + end if + write(nout, "(a)") "" + + write(nout, "(a)") "DESCRIPTION" + write(nout, "(a)") " This program tests ecTrans by transforming fields back and forth& + & between spectral " + if (jprb == jprd) then + write(nout, "(a)") " space and grid-point space (double-precision version)" + else + write(nout, "(a)") " space and grid-point space (single-precision version)" + end if + write(nout, "(a)") "" + + write(nout, "(a)") "USAGE" + if (jprb == jprd) then + write(nout, "(a)") " ectrans-benchmark-dp [options]" + else + write(nout, "(a)") " ectrans-benchmark-sp [options]" + end if + write(nout, "(a)") "" + + write(nout, "(a)") "OPTIONS" + write(nout, "(a)") " -h, --help Print this message" + write(nout, "(a)") " -v Run with verbose output" + write(nout, "(a)") " -t, --truncation T Run with this triangular spectral truncation& + & (default = 79)" + write(nout, "(a)") " -g, --grid GRID Run with this grid. Possible values: O, F" + write(nout, "(a)") " If not specified, O is used with N=truncation+1& + & (cubic relation)" + write(nout, "(a)") " -n, --niter NITER Run for this many inverse/direct transform& + & iterations (default = 10)" + write(nout, "(a)") " -f, --nfld NFLD Number of scalar fields (default = 1)" + write(nout, "(a)") " -l, --nlev NLEV Number of vertical levels (default = 1)" + write(nout, "(a)") " --vordiv Also transform vorticity-divergence to wind" + write(nout, "(a)") " --scders Compute scalar derivatives (default off)" + write(nout, "(a)") " --uvders Compute uv East-West derivatives (default off). Only& + & when also --vordiv is given" + write(nout, "(a)") " --flt Run with fast Legendre transforms (default off)" + write(nout, "(a)") " --nproma NPROMA Run with NPROMA (default no blocking: NPROMA=ngptot)" + write(nout, "(a)") " --norms Calculate and print spectral norms of transformed& + & fields" + write(nout, "(a)") " The computation of spectral norms will skew overall& + & timings" + write(nout, "(a)") " --meminfo Show diagnostic information from FIAT's ec_meminfo& + & subroutine on memory usage, thread-binding etc." + write(nout, "(a)") " --nprtrv Size of V set in spectral decomposition" + write(nout, "(a)") " --nprtrw Size of W set in spectral decomposition" + write(nout, "(a)") " -c, --check VALUE The multiplier of the machine epsilon used as a& + & tolerance for correctness checking" + write(nout, "(a)") "" + write(nout, "(a)") "DEBUGGING" + write(nout, "(a)") " --dump-values Output gridpoint fields in unformatted binary file" + write(nout, "(a)") "" + +end subroutine print_help + +!=================================================================================================== + subroutine parsing_failed(message) character(len=*), intent(in) :: message @@ -1240,17 +1323,6 @@ function cubic_octahedral_gaussian_grid(nsmax) result(cgrid) !=================================================================================================== -subroutine str2int(str, int, stat) - - character(len=*), intent(in) :: str - integer, intent(out) :: int - integer, intent(out) :: stat - read(str, *, iostat=stat) int - -end subroutine str2int - -!=================================================================================================== - subroutine sort(a, n) real(kind=jprd), intent(inout) :: a(n) @@ -1275,78 +1347,6 @@ end subroutine sort !=================================================================================================== -subroutine print_help(unit) - - integer, optional :: unit - integer :: nout = 6 - if (present(unit)) then - nout = unit - endif - - write(nout, "(a)") "" - - if (jprb == jprd) then - write(nout, "(a)") "NAME ectrans-benchmark-dp" - else - write(nout, "(a)") "NAME ectrans-benchmark-sp" - end if - write(nout, "(a)") "" - - write(nout, "(a)") "DESCRIPTION" - write(nout, "(a)") " This program tests ecTrans by transforming fields back and forth& - & between spectral " - if (jprb == jprd) then - write(nout, "(a)") " space and grid-point space (double-precision version)" - else - write(nout, "(a)") " space and grid-point space (single-precision version)" - end if - write(nout, "(a)") "" - - write(nout, "(a)") "USAGE" - if (jprb == jprd) then - write(nout, "(a)") " ectrans-benchmark-dp [options]" - else - write(nout, "(a)") " ectrans-benchmark-sp [options]" - end if - write(nout, "(a)") "" - - write(nout, "(a)") "OPTIONS" - write(nout, "(a)") " -h, --help Print this message" - write(nout, "(a)") " -v Run with verbose output" - write(nout, "(a)") " -t, --truncation T Run with this triangular spectral truncation& - & (default = 79)" - write(nout, "(a)") " -g, --grid GRID Run with this grid. Possible values: O, F" - write(nout, "(a)") " If not specified, O is used with N=truncation+1& - & (cubic relation)" - write(nout, "(a)") " -n, --niter NITER Run for this many inverse/direct transform& - & iterations (default = 10)" - write(nout, "(a)") " -f, --nfld NFLD Number of scalar fields (default = 1)" - write(nout, "(a)") " -l, --nlev NLEV Number of vertical levels (default = 1)" - write(nout, "(a)") " --vordiv Also transform vorticity-divergence to wind" - write(nout, "(a)") " --scders Compute scalar derivatives (default off)" - write(nout, "(a)") " --uvders Compute uv East-West derivatives (default off). Only& - & when also --vordiv is given" - write(nout, "(a)") " --flt Run with fast Legendre transforms (default off)" - write(nout, "(a)") " --nproma NPROMA Run with NPROMA (default no blocking: NPROMA=ngptot)" - write(nout, "(a)") " --norms Calculate and print spectral norms of transformed& - & fields" - write(nout, "(a)") " The computation of spectral norms will skew overall& - & timings" - write(nout, "(a)") " --meminfo Show diagnostic information from FIAT's ec_meminfo& - & subroutine on memory usage, thread-binding etc." - write(nout, "(a)") " --nprtrv Size of V set in spectral decomposition" - write(nout, "(a)") " --nprtrw Size of W set in spectral decomposition" - write(nout, "(a)") " -c, --check VALUE The multiplier of the machine epsilon used as a& - & tolerance for correctness checking" - write(nout, "(a)") "" - write(nout, "(a)") "DEBUGGING" - write(nout, "(a)") " --dump-values Output gridpoint fields in unformatted binary file" - write(nout, "(a)") "" - -end subroutine print_help - -!=================================================================================================== - subroutine initialize_spectral_arrays(nsmax, zsp, sp3d) integer, intent(in) :: nsmax ! Spectral truncation From 1c5f52e82584d0d7973839f28bfd98e68f99c4e4 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Wed, 30 Oct 2024 02:04:23 -0700 Subject: [PATCH 09/10] use acc_init instead --- src/programs/ectrans-benchmark.F90 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index a939e9aeb..8d619d290 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -1069,6 +1069,9 @@ subroutine parsing_failed(message) subroutine get_command_line_arguments(nsmax, cgrid, iters, iters_warmup, nfld, nlev, lvordiv, lscders, luvders, & & luseflt, nopt_mem_tr, nproma, verbosity, ldump_values, lprint_norms, & & lmeminfo, nprtrv, nprtrw, ncheck) +#ifdef _OPENACC + use openacc +#endif integer, intent(inout) :: nsmax ! Spectral truncation character(len=16), intent(inout) :: cgrid ! Spectral truncation @@ -1096,7 +1099,7 @@ subroutine get_command_line_arguments(nsmax, cgrid, iters, iters_warmup, nfld, n integer :: iarg = 1 ! Argument index #ifdef _OPENACC - !$acc init + call acc_init(acc_get_device_type()) #endif do while (iarg <= command_argument_count()) From 62c4091ac7b42987092af15cf80d911d724d1df8 Mon Sep 17 00:00:00 2001 From: Lukas Mosimann Date: Mon, 18 Nov 2024 00:25:20 -0800 Subject: [PATCH 10/10] suggestions from sam --- src/trans/gpu/external/setup_trans.F90 | 2 +- src/trans/gpu/internal/fsc_mod.F90 | 2 +- src/trans/gpu/internal/trltom_pack_unpack.F90 | 2 +- src/trans/gpu/internal/uvtvd_mod.F90 | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 84d60f29b..82670fea0 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -103,7 +103,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !ifndef INTERFACE -USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_INT, C_ASSOCIATED, C_SIZE_T +USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_PTR, C_INT, C_ASSOCIATED, C_SIZE_T, C_SIZEOF USE EC_ENV_MOD, ONLY: EC_GETENV USE TPM_GEN, ONLY: NOUT, MSETUP0, NCUR_RESOL, NDEF_RESOL, & & NMAX_RESOL, NPRINTLEV, LENABLED, NERR diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 07dc1c4bc..65c3a6bfb 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -105,7 +105,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE #ifdef ACCGPU !$ACC DATA & -!$ACC& PRESENT(D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F_RACTHE,G_NMEN,G_NLOEN, R_NSMAX) +!$ACC& PRESENT(D,D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F,F_RACTHE,G,G_NMEN,G_NLOEN,R,R_NSMAX) #endif #ifdef OMPGPU !$OMP TARGET DATA MAP(PRESENT,ALLOC:ZGTF) & diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index 0a6bb4c28..f0937afac 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -233,7 +233,7 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP !$ACC DATA & !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) & !$ACC& PRESENT(F,F_RW,F_RACTHE) & - !$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NDGNH,R_NDGL,G_NDGLU) & + !$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NDGNH,R_NDGL,G,G_NDGLU) & !$ACC& PRESENT(D_NPNTGTB1,D_OFFSETS_GEMM1,FOUBUF) !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) & diff --git a/src/trans/gpu/internal/uvtvd_mod.F90 b/src/trans/gpu/internal/uvtvd_mod.F90 index 421ebb16f..27f5e0368 100755 --- a/src/trans/gpu/internal/uvtvd_mod.F90 +++ b/src/trans/gpu/internal/uvtvd_mod.F90 @@ -87,7 +87,7 @@ SUBROUTINE UVTVD(KF_UV,PU,PV,PVOR,PDIV) #ifdef ACCGPU !$ACC DATA & !$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NTMAX) & -!$ACC& PRESENT(ZEPSNM,PU,PV,PVOR,PDIV) ASYNC(1) +!$ACC& PRESENT(FG,ZEPSNM,PU,PV,PVOR,PDIV) ASYNC(1) #endif #ifdef OMPGPU !WARNING: following line should be PRESENT,ALLOC but causes issues with AMD compiler!