From 743904e064fb75c9b63deab725e90b87fd27254f Mon Sep 17 00:00:00 2001 From: Trey White Date: Mon, 15 Jul 2024 12:49:26 -0400 Subject: [PATCH 1/5] Successful run of standalone Homme with CPE 23.12 and Rocm 5.7.1. --- cime_config/machines/config_machines.xml | 13 ++--- .../machine-files/crusher-scream-cpu.cmake | 6 -- .../machine-files/crusher-scream-gpu.cmake | 17 ------ .../machine-files/frontier-scream-gpu.cmake | 2 +- .../homme/cmake/machineFiles/frontier.cmake | 55 +++++++++++++++++++ .../cxx/HyperviscosityFunctorImpl.hpp | 5 -- .../homme/src/share/cxx/ExecSpaceDefs.cpp | 28 +--------- .../theta-l_kokkos/cxx/CaarFunctorImpl.hpp | 5 -- .../src/theta-l_kokkos/cxx/LimiterFunctor.hpp | 5 -- 9 files changed, 63 insertions(+), 73 deletions(-) delete mode 100644 components/eamxx/cmake/machine-files/crusher-scream-cpu.cmake delete mode 100644 components/eamxx/cmake/machine-files/crusher-scream-gpu.cmake create mode 100644 components/homme/cmake/machineFiles/frontier.cmake diff --git a/cime_config/machines/config_machines.xml b/cime_config/machines/config_machines.xml index a1cf8744de83..e471a4756e68 100644 --- a/cime_config/machines/config_machines.xml +++ b/cime_config/machines/config_machines.xml @@ -1441,22 +1441,19 @@ /usr/share/lmod/lmod/libexec/lmod python - PrgEnv-cray + cpe/23.12 craype-accel-amd-gfx90a - rocm/5.4.0 + rocm/5.7.1 libunwind/1.6.2 - cce/15.0.1 - craype craype/2.7.20 - cray-mpich cray-mpich/8.1.26 cray-python/3.9.13.1 subversion/1.14.1 git/2.36.1 cmake/3.21.3 - cray-hdf5-parallel/1.12.2.1 - cray-netcdf-hdf5parallel/4.9.0.1 - cray-parallel-netcdf/1.12.3.1 + cray-hdf5-parallel + cray-netcdf-hdf5parallel + cray-parallel-netcdf darshan-runtime diff --git a/components/eamxx/cmake/machine-files/crusher-scream-cpu.cmake b/components/eamxx/cmake/machine-files/crusher-scream-cpu.cmake deleted file mode 100644 index 58f15881e31c..000000000000 --- a/components/eamxx/cmake/machine-files/crusher-scream-cpu.cmake +++ /dev/null @@ -1,6 +0,0 @@ -include(${CMAKE_CURRENT_LIST_DIR}/common.cmake) -common_setup() - -include (${EKAT_MACH_FILES_PATH}/kokkos/serial.cmake) -include (${EKAT_MACH_FILES_PATH}/mpi/srun.cmake) - diff --git a/components/eamxx/cmake/machine-files/crusher-scream-gpu.cmake b/components/eamxx/cmake/machine-files/crusher-scream-gpu.cmake deleted file mode 100644 index 391a9b8d6880..000000000000 --- a/components/eamxx/cmake/machine-files/crusher-scream-gpu.cmake +++ /dev/null @@ -1,17 +0,0 @@ -include(${CMAKE_CURRENT_LIST_DIR}/common.cmake) -common_setup() - -#serial is needed, but maybe it is always on? -include (${EKAT_MACH_FILES_PATH}/kokkos/mi250.cmake) -include (${EKAT_MACH_FILES_PATH}/kokkos/hip.cmake) -include (${EKAT_MACH_FILES_PATH}/mpi/srun.cmake) - -SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.16/ofi/crayclang/10.0" CACHE STRING "") - -set(CMAKE_CXX_FLAGS "--amdgpu-target=gfx90a -fno-gpu-rdc -I${MPICH_DIR}/include" CACHE STRING "" FORCE) -set(CMAKE_Fortran_FLAGS "-I${MPICH_DIR}/include" CACHE STRING "" FORCE) -set(CMAKE_C_FLAGS "-I${MPICH_DIR}/include" CACHE STRING "" FORCE) -set(CMAKE_EXE_LINKER_FLAGS "-L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.16/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE) - - - diff --git a/components/eamxx/cmake/machine-files/frontier-scream-gpu.cmake b/components/eamxx/cmake/machine-files/frontier-scream-gpu.cmake index 14d1d501160d..bb1f4455a534 100644 --- a/components/eamxx/cmake/machine-files/frontier-scream-gpu.cmake +++ b/components/eamxx/cmake/machine-files/frontier-scream-gpu.cmake @@ -6,4 +6,4 @@ include (${EKAT_MACH_FILES_PATH}/kokkos/hip.cmake) set(SCREAM_MPIRUN_EXE "srun" CACHE STRING "") set(SCREAM_MACHINE "frontier-scream-gpu" CACHE STRING "") -set(CMAKE_CXX_FLAGS "--amdgpu-target=gfx90a -fno-gpu-rdc -I$ENV{MPICH_DIR}/include" CACHE STRING "" FORCE) +set(CMAKE_CXX_FLAGS "--offload-arch=gfx90a -fno-gpu-rdc -I$ENV{MPICH_DIR}/include" CACHE STRING "" FORCE) diff --git a/components/homme/cmake/machineFiles/frontier.cmake b/components/homme/cmake/machineFiles/frontier.cmake new file mode 100644 index 000000000000..84a334ad5ce5 --- /dev/null +++ b/components/homme/cmake/machineFiles/frontier.cmake @@ -0,0 +1,55 @@ +SET (HOMME_MACHINE "frontier" CACHE STRING "") + +SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "") + +SET (NETCDF_DIR $ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX} CACHE FILEPATH "") +SET (HDF5_DIR $ENV{CRAY_HDF5_PARALLEL_PREFIX} CACHE FILEPATH "") +SET (CPRNC_DIR /ccs/proj/cli115/software/frontier/cprnc CACHE FILEPATH "") + +SET(BUILD_HOMME_WITHOUT_PIOLIBRARY TRUE CACHE BOOL "") + +SET(HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "") + +SET(WITH_PNETCDF FALSE CACHE FILEPATH "") + +SET(USE_QUEUING FALSE CACHE BOOL "") + +SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "") +SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") + +SET (HOMMEXX_BFB_TESTING FALSE CACHE BOOL "") + +SET(USE_TRILINOS OFF CACHE BOOL "") + +SET(HIP_BUILD TRUE CACHE BOOL "") + +SET(Kokkos_ENABLE_SERIAL ON CACHE BOOL "") +SET(Kokkos_ENABLE_DEBUG OFF CACHE BOOL "") +SET(Kokkos_ARCH_VEGA90A ON CACHE BOOL "") +SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "") +SET(Kokkos_ENABLE_HIP ON CACHE BOOL "") +SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "") + +SET(CMAKE_C_COMPILER "cc" CACHE STRING "") +SET(CMAKE_Fortran_COMPILER "ftn" CACHE STRING "") +#SET(CMAKE_Fortran_FLAGS "--gcc-toolchain=$ENV{MEMBERWORK}/cli115/workaround" CACHE STRING "") +SET(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "") + +SET(Extrae_LIBRARY "-I$ENV{CRAY_MPICH_DIR}/include -L$ENV{CRAY_MPICH_DIR}/lib -lmpi $ENV{PE_MPICH_GTL_DIR_amd_gfx90a} $ENV{PE_MPICH_GTL_LIBS_amd_gfx90a}" CACHE STRING "") + +#SET(ADD_Fortran_FLAGS "-G2 --gcc-toolchain=$ENV{MEMBERWORK}/cli115/workaround ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_C_FLAGS "-g -O ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_CXX_FLAGS "-g -std=c++14 -O3 -munsafe-fp-atomics --offload-arch=gfx90a -fno-gpu-rdc -Wno-unused-command-line-argument -Wno-unsupported-floating-point-opt -Wno-#pragma-messages ${Extrae_LIBRARY}" CACHE STRING "") +SET(ADD_LINKER_FLAGS "${Extrae_LIBRARY}" CACHE STRING "") + + +set (ENABLE_OPENMP OFF CACHE BOOL "") +set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "") +set (ENABLE_HORIZ_OPENMP OFF CACHE BOOL "") + +set (HOMME_TESTING_PROFILE "short" CACHE STRING "") +SET (BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "") + +set (USE_NUM_PROCS 8 CACHE STRING "") + +SET (USE_MPI_OPTIONS "-c7 --gpu-bind=closest --gpus-per-task=1" CACHE FILEPATH "") diff --git a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp index 688007e27f24..79263b98720b 100644 --- a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp +++ b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp @@ -297,13 +297,8 @@ class HyperviscosityFunctorImpl SphereOperators m_sphere_ops; // Policies -#ifndef NDEBUG - template - using TeamPolicyType = Kokkos::TeamPolicy,Tag>; -#else template using TeamPolicyType = Kokkos::TeamPolicy; -#endif Kokkos::RangePolicy m_policy_update_states; TeamPolicyType m_policy_first_laplace; TeamPolicyType m_policy_pre_exchange; diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index 8d496bff5d16..7e762a7a6763 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -6,7 +6,6 @@ #include -#include #include #include "ExecSpaceDefs.hpp" @@ -35,32 +34,9 @@ void initialize_kokkos () { // provides, as that algorithm is hardcoded in Kokkos::initialize(int& narg, // char* arg[]). Once the behavior is exposed in the InitArguments version of // initialize, we can remove this string code. - // If for some reason we're running on a GPU platform, have Cuda enabled, - // but are using a different execution space, this initialization is still - // OK. The rank gets a GPU assigned and simply will ignore it. -#ifdef KOKKOS_ENABLE_CUDA - int nd; - const auto ret = cudaGetDeviceCount(&nd); - if (ret != cudaSuccess) { - // It isn't a big deal if we can't get the device count. - nd = 1; - } -#elif defined(KOKKOS_ENABLE_HIP) - int nd; - const auto ret = hipGetDeviceCount(&nd); - if (ret != hipSuccess) { - // It isn't a big deal if we can't get the device count. - nd = 1; - } -#endif #ifdef HOMMEXX_ENABLE_GPU - std::stringstream ss; - ss << "--kokkos-num-devices=" << nd; - const auto key = ss.str(); - std::vector str(key.size()+1); - std::copy(key.begin(), key.end(), str.begin()); - str.back() = 0; - args.push_back(const_cast(str.data())); + const char *const map_device = "--kokkos-map-device-id-by=mpi_rank"; + args.push_back(const_cast(map_device)); #endif diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp index 38f9dc8573d8..87e570b568e4 100644 --- a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp @@ -107,13 +107,8 @@ struct CaarFunctorImpl { struct TagPostExchange {}; // Policies -#ifndef NDEBUG - template - using TeamPolicyType = Kokkos::TeamPolicy,Tag>; -#else template using TeamPolicyType = Kokkos::TeamPolicy; -#endif TeamPolicyType m_policy_pre; diff --git a/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp b/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp index cd3bf7c32526..5dd16d792b9f 100644 --- a/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp @@ -49,13 +49,8 @@ struct LimiterFunctor { struct TagDp3dLimiter {}; // Policies -#ifndef NDEBUG - template - using TeamPolicyType = Kokkos::TeamPolicy,Tag>; -#else template using TeamPolicyType = Kokkos::TeamPolicy; -#endif TeamPolicyType m_policy_dp3d_lim; From b25528861ec0ba970ec75a8bcac24d2197355221 Mon Sep 17 00:00:00 2001 From: Trey White Date: Mon, 15 Jul 2024 20:42:12 -0400 Subject: [PATCH 2/5] Added CaarFunctorImpl.cpp and new SphereOperators. But not turned on. --- .../homme/src/share/compose/compose_homme.hpp | 2 +- .../homme/src/share/cxx/SphereOperators.hpp | 267 ++++++++ .../src/share/cxx/utilities/ViewUtils.hpp | 36 + .../homme/src/theta-l_kokkos/CMakeLists.txt | 1 + .../theta-l_kokkos/cxx/CaarFunctorImpl.cpp | 638 ++++++++++++++++++ .../theta-l_kokkos/cxx/CaarFunctorImpl.hpp | 50 +- .../theta-l_kokkos/cxx/EquationOfState.hpp | 6 + 7 files changed, 988 insertions(+), 12 deletions(-) create mode 100644 components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp diff --git a/components/homme/src/share/compose/compose_homme.hpp b/components/homme/src/share/compose/compose_homme.hpp index a3b40a204a9a..e4a826318f40 100644 --- a/components/homme/src/share/compose/compose_homme.hpp +++ b/components/homme/src/share/compose/compose_homme.hpp @@ -200,7 +200,7 @@ struct HommeFormatArray { std::vector ie_data_ptr; const Int nlev, qsized, ntimelev; -#ifdef COMPOSE_ENABLE_GPU +#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__ COMPOSE_INLINE_FUNCTION static T& unused () { static T unused = 0; assert(0); diff --git a/components/homme/src/share/cxx/SphereOperators.hpp b/components/homme/src/share/cxx/SphereOperators.hpp index c227d97ea708..0b528c1f1b5b 100644 --- a/components/homme/src/share/cxx/SphereOperators.hpp +++ b/components/homme/src/share/cxx/SphereOperators.hpp @@ -1176,6 +1176,273 @@ class SphereOperators Real m_scale_factor_inv, m_laplacian_rigid_factor; }; +using TeamPolicy = Kokkos::TeamPolicy; +using Team = TeamPolicy::member_type; + +static constexpr int NPNP = NP * NP; +static constexpr int WARP_SIZE = warpSize; + +static constexpr int SPHERE_BLOCK_LEV = WARP_SIZE; +static constexpr int SPHERE_BLOCKS_PER_COL = (NUM_PHYSICAL_LEV - 1) / SPHERE_BLOCK_LEV + 1; +static constexpr int SPHERE_BLOCK = NPNP * SPHERE_BLOCK_LEV; + +struct SphereGlobal { + + const ExecViewManaged d; + const ExecViewManaged dinv; + const ExecViewManaged dvv; + const ExecViewManaged metdet; + const Real scale_factor_inv; + + SphereGlobal(const SphereOperators &op): + d(op.m_d), + dinv(op.m_dinv), + dvv(op.dvv), + metdet(op.m_metdet), + scale_factor_inv(op.m_scale_factor_inv) + {} +}; + +using SphereBlockScratchView = Kokkos::View< + Real[SPHERE_BLOCK_LEV][NP][NP], + ExecSpace::scratch_memory_space, + Kokkos::MemoryTraits + >; + +using SphereBlockScratchSubview = Kokkos::Subview< + SphereBlockScratchView, int, + std::remove_const_t, + std::remove_const_t + >; + +struct SphereBlockOps; + +struct SphereBlockScratch { + SphereBlockScratchView v; + SphereBlockScratchSubview sv; + KOKKOS_INLINE_FUNCTION SphereBlockScratch(const SphereBlockOps &b); +}; + +struct SphereBlockOps { + const SphereGlobal &g; + const Team &t; + Real scale_factor_inv; + Real metdet; + Real rrdmd; + Real dinv[2][2]; + Real dvvx[NP]; + Real dvvy[NP]; + int e,x,y,z; + + KOKKOS_INLINE_FUNCTION SphereBlockOps(const SphereGlobal &sg, const Team &team): + g(sg), + t(team), + scale_factor_inv(g.scale_factor_inv), + rrdmd(0) + { + const int lr = t.league_rank(); + e = lr / SPHERE_BLOCKS_PER_COL; + const int iw = lr % SPHERE_BLOCKS_PER_COL; + const int tr = t.team_rank(); + const int ixy = tr / SPHERE_BLOCK_LEV; + x = ixy / NP; + y = ixy % NP; + const int dz = tr % SPHERE_BLOCK_LEV; + z = dz + iw * SPHERE_BLOCK_LEV; + + metdet = g.metdet(e,x,y); + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dinv[i][j] = g.dinv(e,i,j,x,y); + for (int j = 0; j < NP; j++) { + dvvx[j] = g.dvv(x,j); + dvvy[j] = g.dvv(y,j); + } + } + + KOKKOS_INLINE_FUNCTION void barrier() const + { + t.team_barrier(); + } + + KOKKOS_INLINE_FUNCTION Real div(const SphereBlockScratch &t0, const SphereBlockScratch &t1) + { + // Separately compute du and dv to match Fortran bfb + Real du = 0; + Real dv = 0; + for (int j = 0; j < NP; j++) { + du += dvvy[j] * t0.sv(x,j); + dv += dvvx[j] * t1.sv(j,y); + } + if (rrdmd == 0) rrdmd = (1.0 / metdet) * scale_factor_inv; + return (du + dv) * rrdmd; + } + + KOKKOS_INLINE_FUNCTION void divInit(SphereBlockScratch &t0, SphereBlockScratch &t1, const Real v0, const Real v1) const + { + t0.sv(x,y) = (dinv[0][0] * v0 + dinv[1][0] * v1) * metdet; + t1.sv(x,y) = (dinv[0][1] * v0 + dinv[1][1] * v1) * metdet; + } + + KOKKOS_INLINE_FUNCTION void grad(Real &g0, Real &g1, const SphereBlockScratch &t) const + { + Real s0 = 0; + Real s1 = 0; + for (int j = 0; j < NP; j++) { + s0 += dvvy[j] * t.sv(x,j); + s1 += dvvx[j] * t.sv(j,y); + } + s0 *= scale_factor_inv; + s1 *= scale_factor_inv; + g0 = dinv[0][0] * s0 + dinv[0][1] * s1; + g1 = dinv[1][0] * s0 + dinv[1][1] * s1; + } + + KOKKOS_INLINE_FUNCTION void gradInit(SphereBlockScratch &t0, const Real v0) + { + t0.sv(x,y) = v0; + } + + KOKKOS_INLINE_FUNCTION bool skip() const { return (z >= NUM_PHYSICAL_LEV); } + + KOKKOS_INLINE_FUNCTION Real vort(const SphereBlockScratch &t0, const SphereBlockScratch &t1) + { + Real du = 0; + Real dv = 0; + for (int j = 0; j < NP; j++) { + du += dvvx[j] * t0.sv(j,y); + dv += dvvy[j] * t1.sv(x,j); + } + if (rrdmd == 0) rrdmd = (1.0 / metdet) * scale_factor_inv; + return (dv - du) * rrdmd; + } + + KOKKOS_INLINE_FUNCTION void vortInit(SphereBlockScratch &t0, SphereBlockScratch &t1, const Real v0, const Real v1) const + { + t0.sv(x,y) = g.d(e,0,0,x,y) * v0 + g.d(e,0,1,x,y) * v1; + t1.sv(x,y) = g.d(e,1,0,x,y) * v0 + g.d(e,1,1,x,y) * v1; + } + + static TeamPolicy policy(const int num_elems, const int num_scratch) + { + return TeamPolicy(num_elems * SPHERE_BLOCKS_PER_COL, SPHERE_BLOCK). + set_scratch_size(0, Kokkos::PerTeam(num_scratch * SphereBlockScratchView::shmem_size())); + } + +}; + +KOKKOS_INLINE_FUNCTION SphereBlockScratch::SphereBlockScratch(const SphereBlockOps &b): + v(b.t.team_scratch(0)), + sv(Kokkos::subview(v, b.z % SPHERE_BLOCK_LEV, Kokkos::ALL, Kokkos::ALL)) +{} + +struct SphereCol { + const Team &t; + int e,x,y,z; + + KOKKOS_INLINE_FUNCTION SphereCol(const Team &team): + t(team) + { + const int lr = t.league_rank(); + e = lr / NPNP; + const int xy = lr % NPNP; + x = xy / NP; + y = xy % NP; + z = t.team_rank(); + } + static TeamPolicy policy(const int num_elems, const int num_lev) + { + return TeamPolicy(num_elems * NPNP, num_lev); + } +}; + +struct SphereColOps: SphereCol { + Real scale_factor_inv; + Real dinv[2][2]; + Real dvvx[NP]; + Real dvvy[NP]; + + KOKKOS_INLINE_FUNCTION SphereColOps(const SphereGlobal &sg, const Team &team): + SphereCol(team) + { + scale_factor_inv = sg.scale_factor_inv; + for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dinv[i][j] = sg.dinv(e,i,j,x,y); + for (int j = 0; j < NP; j++) { + dvvx[j] = sg.dvv(x,j); + dvvy[j] = sg.dvv(y,j); + } + } + + template + KOKKOS_INLINE_FUNCTION void grad(OutView &out, const InView &in, const int n) const + { + Real s0 = 0; + Real s1 = 0; + for (int j = 0; j < NP; j++) { + s0 += dvvy[j] * in(e,n,x,j,z); + s1 += dvvx[j] * in(e,n,j,y,z); + } + s0 *= scale_factor_inv; + s1 *= scale_factor_inv; + out(e,0,x,y,z) = dinv[0][0] * s0 + dinv[0][1] * s1; + out(e,1,x,y,z) = dinv[1][0] * s0 + dinv[1][1] * s1; + } +}; + +struct SphereScanOps { + const Team &t; + int e,x,y; + + KOKKOS_INLINE_FUNCTION SphereScanOps(const Team &team): + t(team) + { + e = t.league_rank(); + const int tr = t.team_rank(); + x = tr / NP; + y = tr % NP; + } + + template + KOKKOS_INLINE_FUNCTION void nacs(OutView &out, const int n, const InView &in, const EndView &end) const + { + Dispatch<>::parallel_scan( + t, NUM_PHYSICAL_LEV, + [&](const int a, Real &sum, const bool last) { + if (a == 0) out(e,n,x,y,NUM_PHYSICAL_LEV) = sum = end(e,x,y); + const int z = NUM_PHYSICAL_LEV-a-1; + sum += in(e,x,y,z); + if (last) out(e,n,x,y,z) = sum; + }); + } + + template + KOKKOS_INLINE_FUNCTION void scan(OutView &out, const InView &in, const Real zero) const + { + Dispatch<>::parallel_scan( + t, NUM_PHYSICAL_LEV, + [&](const int z, Real &sum, const bool last) { + if (z == 0) out(e,x,y,0) = sum = zero; + sum += in(e,x,y,z); + if (last) out(e,x,y,z+1) = sum; + }); + } + + template + KOKKOS_INLINE_FUNCTION void scan(OutView &out, const InView &in, const int n, const Real zero) const + { + Dispatch<>::parallel_scan( + t, NUM_PHYSICAL_LEV, + [&](const int z, Real &sum, const bool last) { + if (z == 0) out(e,x,y,0) = sum = zero; + sum += in(e,n,x,y,z); + if (last) out(e,x,y,z+1) = sum; + }); + } + + static TeamPolicy policy(const int num_elems) + { + return TeamPolicy(num_elems, NPNP, WARP_SIZE); + } +}; + } // namespace Homme #endif // HOMMEXX_SPHERE_OPERATORS_HPP diff --git a/components/homme/src/share/cxx/utilities/ViewUtils.hpp b/components/homme/src/share/cxx/utilities/ViewUtils.hpp index 1b3efcd91020..b749e99e096a 100644 --- a/components/homme/src/share/cxx/utilities/ViewUtils.hpp +++ b/components/homme/src/share/cxx/utilities/ViewUtils.hpp @@ -67,6 +67,18 @@ viewAsReal(ViewType v_in) { return ReturnView(reinterpret_cast(v_in.data())); } +template +KOKKOS_INLINE_FUNCTION +typename +std::enable_if::type,Scalar>::value, + Unmanaged*[DIM1][DIM2][DIM3*VECTOR_SIZE],Properties...>> + >::type +viewAsReal(ViewType v_in) { + using ReturnST = RealType; + using ReturnView = Unmanaged*[DIM1][DIM2][DIM3*VECTOR_SIZE],Properties...>>; + return ReturnView(reinterpret_cast(v_in.data())); +} + template KOKKOS_INLINE_FUNCTION typename @@ -79,6 +91,30 @@ viewAsReal(ViewType v_in) { return ReturnView(reinterpret_cast(v_in.data())); } +template +KOKKOS_INLINE_FUNCTION +typename +std::enable_if::type,Scalar>::value, + Unmanaged*[DIM1][DIM2][DIM3][DIM4*VECTOR_SIZE],Properties...>> + >::type +viewAsReal(ViewType v_in) { + using ReturnST = RealType; + using ReturnView = Unmanaged*[DIM1][DIM2][DIM3][DIM4*VECTOR_SIZE],Properties...>>; + return ReturnView(reinterpret_cast(v_in.data())); +} + +template +KOKKOS_INLINE_FUNCTION +typename +std::enable_if::type,Scalar>::value, + Unmanaged*[DIM1][DIM2][DIM3][DIM4][DIM5*VECTOR_SIZE],Properties...>> + >::type +viewAsReal(ViewType v_in) { + using ReturnST = RealType; + using ReturnView = Unmanaged*[DIM1][DIM2][DIM3][DIM4][DIM5*VECTOR_SIZE],Properties...>>; + return ReturnView(reinterpret_cast(v_in.data())); +} + // Structure to define the type of a view that has a const data type, // given the type of an input view template diff --git a/components/homme/src/theta-l_kokkos/CMakeLists.txt b/components/homme/src/theta-l_kokkos/CMakeLists.txt index 739866960e36..a727ebfac62b 100644 --- a/components/homme/src/theta-l_kokkos/CMakeLists.txt +++ b/components/homme/src/theta-l_kokkos/CMakeLists.txt @@ -141,6 +141,7 @@ MACRO(THETAL_KOKKOS_SETUP) ) SET(THETAL_DEPS_CXX + ${TARGET_DIR}/cxx/CaarFunctorImpl.cpp ${TARGET_DIR}/cxx/CamForcing.cpp ${TARGET_DIR}/cxx/Diagnostics.cpp ${TARGET_DIR}/cxx/ElementsForcing.cpp diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp new file mode 100644 index 000000000000..13d4e5090c43 --- /dev/null +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp @@ -0,0 +1,638 @@ +#include "CaarFunctorImpl.hpp" + +namespace Homme { + +template +void CaarFunctorImpl::epoch1_blockOps() +{ + auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens); + auto buffers_exner = viewAsReal(m_buffers.exner); + auto buffers_phi = viewAsReal(m_buffers.phi); + auto buffers_pnh = viewAsReal(m_buffers.pnh); + auto buffers_theta_tens = viewAsReal(m_buffers.theta_tens); + auto buffers_vdp = viewAsReal(m_buffers.vdp); + + const Real data_eta_ave_w = m_data.eta_ave_w; + const int data_n0 = m_data.n0; + + auto derived_vn0 = viewAsReal(m_derived.m_vn0); + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = viewAsReal(m_state.m_dp3d); + auto state_phinh_i = viewAsReal(m_state.m_phinh_i); + auto state_v = viewAsReal(m_state.m_v); + auto state_vtheta_dp= viewAsReal(m_state.m_vtheta_dp); + + Kokkos::parallel_for( + "__PRETTY_FUNCTION__", + SphereBlockOps::policy(m_num_elems, 4), + KOKKOS_LAMBDA(const Team &team) { + + SphereBlockOps b(sg, team); + if (b.skip()) return; + + if (!HYDROSTATIC) { + + const Real dphi = state_phinh_i(b.e,data_n0,b.x,b.y,b.z+1) - state_phinh_i(b.e,data_n0,b.x,b.y,b.z); + const Real vtheta_dp = state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z); + if ((vtheta_dp < 0) || (dphi > 0)) abort(); + EquationOfState::compute_pnh_and_exner(vtheta_dp, dphi, buffers_pnh(b.e,b.x,b.y,b.z), buffers_exner(b.e,b.x,b.y,b.z)); + + buffers_phi(b.e,b.x,b.y,b.z) = 0.5 * (state_phinh_i(b.e,data_n0,b.x,b.y,b.z) + state_phinh_i(b.e,data_n0,b.x,b.y,b.z+1)); + } + + const Real v0 = state_v(b.e,data_n0,0,b.x,b.y,b.z) * state_dp3d(b.e,data_n0,b.x,b.y,b.z); + const Real v1 = state_v(b.e,data_n0,1,b.x,b.y,b.z) * state_dp3d(b.e,data_n0,b.x,b.y,b.z); + buffers_vdp(b.e,0,b.x,b.y,b.z) = v0; + buffers_vdp(b.e,1,b.x,b.y,b.z) = v1; + derived_vn0(b.e,0,b.x,b.y,b.z) += data_eta_ave_w * v0; + derived_vn0(b.e,1,b.x,b.y,b.z) += data_eta_ave_w * v1; + + SphereBlockScratch ttmp0(b); + SphereBlockScratch ttmp1(b); + b.divInit(ttmp0, ttmp1, v0, v1); + + SphereBlockScratch ttmp2(b); + SphereBlockScratch ttmp3(b); + Real vtheta = 0; + + if (CONSERVATIVE) { + const Real sv0 = state_v(b.e,data_n0,0,b.x,b.y,b.z) * state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z); + const Real sv1 = state_v(b.e,data_n0,1,b.x,b.y,b.z) * state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z); + b.divInit(ttmp2, ttmp3, sv0, sv1); + } else { + vtheta = state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z) / state_dp3d(b.e,data_n0,b.x,b.y,b.z); + b.gradInit(ttmp2, vtheta); + } + + b.barrier(); + + const Real dvdp = b.div(ttmp0, ttmp1); + buffers_dp_tens(b.e,b.x,b.y,b.z) = dvdp; + + if (CONSERVATIVE) { + buffers_theta_tens(b.e,b.x,b.y,b.z) = b.div(ttmp2, ttmp3); + } else { + Real grad0, grad1; + b.grad(grad0, grad1, ttmp2); + Real theta_tens = dvdp * vtheta; + theta_tens += grad0 * v0; + theta_tens += grad1 * v1; + buffers_theta_tens(b.e,b.x,b.y,b.z) = theta_tens; + } + }); +} + +template +void CaarFunctorImpl::epoch2_scanOps() +{ + auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens); + auto buffers_dp_i = viewAsReal(m_buffers.dp_i); + auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn); + auto buffers_w_tens = viewAsReal(m_buffers.w_tens); + + const int data_n0 = m_data.n0; + auto &hvcoord_hybrid_bi = m_hvcoord.hybrid_bi; + const Real pi_i00 = m_hvcoord.ps0 * m_hvcoord.hybrid_ai0; + auto state_dp3d = viewAsReal(m_state.m_dp3d); + + Kokkos::parallel_for( + "__PRETTY_FUNCTION__", + SphereScanOps::policy(m_num_elems), + KOKKOS_LAMBDA(const Team &team) { + + const SphereScanOps s(team); + + s.scan(buffers_dp_i, state_dp3d, data_n0, pi_i00); + s.scan(buffers_w_tens, buffers_dp_tens, 0); + + if (RSPLIT_ZERO) { + + s.scan(buffers_eta_dot_dpdn, buffers_dp_tens, 0); + + const Real last = buffers_eta_dot_dpdn(s.e,s.x,s.y,NUM_PHYSICAL_LEV); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(s.t, 1, NUM_PHYSICAL_LEV), + [&](const int z) { + Real eta_dot_dpdn = -buffers_eta_dot_dpdn(s.e,s.x,s.y,z); + eta_dot_dpdn += hvcoord_hybrid_bi(z) * last; + buffers_eta_dot_dpdn(s.e,s.x,s.y,z) = eta_dot_dpdn; + }); + + Kokkos::single( + Kokkos::PerThread(team), + [&]() { + buffers_eta_dot_dpdn(s.e,s.x,s.y,0) = buffers_eta_dot_dpdn(s.e,s.x,s.y,NUM_PHYSICAL_LEV) = 0; + }); + } + }); +} + +template +void CaarFunctorImpl::epoch3_blockOps() +{ + auto buffers_dp_i = viewAsReal(m_buffers.dp_i); + auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens); + auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn); + auto buffers_exner = viewAsReal(m_buffers.exner); + auto buffers_omega_p = viewAsReal(m_buffers.omega_p); + auto buffers_pnh = viewAsReal(m_buffers.pnh); + auto buffers_temp = viewAsReal(m_buffers.temp); + auto buffers_v_tens = viewAsReal(m_buffers.v_tens); + auto buffers_w_tens = viewAsReal(m_buffers.w_tens); + + const Real data_eta_ave_w = m_data.eta_ave_w; + const int data_n0 = m_data.n0; + + auto derived_eta_dot_dpdn = viewAsReal(m_derived.m_eta_dot_dpdn); + auto derived_omega_p = viewAsReal(m_derived.m_omega_p); + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = viewAsReal(m_state.m_dp3d); + auto state_v = viewAsReal(m_state.m_v); + auto state_vtheta_dp= viewAsReal(m_state.m_vtheta_dp); + auto state_w_i = viewAsReal(m_state.m_w_i); + + Kokkos::parallel_for( + "__PRETTY_FUNCTION__", + SphereBlockOps::policy(m_num_elems, 1), + KOKKOS_LAMBDA(const Team &team) { + + SphereBlockOps b(sg, team); + if (b.skip()) return; + + const Real pi = 0.5 * (buffers_dp_i(b.e,b.x,b.y,b.z) + buffers_dp_i(b.e,b.x,b.y,b.z+1)); + SphereBlockScratch tmp0(b); + b.gradInit(tmp0, pi); + + if (HYDROSTATIC) { + Real exner = pi; + EquationOfState::pressure_to_exner(exner); + buffers_exner(b.e,b.x,b.y,b.z) = exner; + buffers_pnh(b.e,b.x,b.y,b.z) = EquationOfState::compute_dphi(state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z), exner, pi); + } + + b.barrier(); + + Real grad0, grad1; + b.grad(grad0, grad1, tmp0); + + Real omega = -0.5 * (buffers_w_tens(b.e,b.x,b.y,b.z) + buffers_w_tens(b.e,b.x,b.y,b.z+1)); + const Real uz = state_v(b.e,data_n0,0,b.x,b.y,b.z); + const Real vz = state_v(b.e,data_n0,1,b.x,b.y,b.z); + omega += uz * grad0 + vz * grad1; + buffers_omega_p(b.e,b.x,b.y,b.z) = omega; + + derived_omega_p(b.e,b.x,b.y,b.z) += data_eta_ave_w * omega; + + if (RSPLIT_ZERO) { + + const Real dp = state_dp3d(b.e,data_n0,b.x,b.y,b.z); + const Real etap = buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z+1); + const Real etaz = buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z); + + buffers_dp_tens(b.e,b.x,b.y,b.z) += etap - etaz; + + derived_eta_dot_dpdn(b.e,b.x,b.y,b.z) += data_eta_ave_w * etaz; + + if (!HYDROSTATIC) { + const Real dw = state_w_i(b.e,data_n0,b.x,b.y,b.z+1) - state_w_i(b.e,data_n0,b.x,b.y,b.z); + const Real eta = 0.5 * (etaz + etap); + buffers_temp(b.e,b.x,b.y,b.z) = dw * eta; + } + + Real u = 0; + Real v = 0; + if (b.z < NUM_PHYSICAL_LEV-1) { + const Real facp = 0.5 * etap / dp; + u = facp * (state_v(b.e,data_n0,0,b.x,b.y,b.z+1) - uz); + v = facp * (state_v(b.e,data_n0,1,b.x,b.y,b.z+1) - vz); + } + if (b.z > 0) { + const Real facm = 0.5 * etaz / dp; + u += facm * (uz - state_v(b.e,data_n0,0,b.x,b.y,b.z-1)); + v += facm * (vz - state_v(b.e,data_n0,1,b.x,b.y,b.z-1)); + } + buffers_v_tens(b.e,0,b.x,b.y,b.z) = u; + buffers_v_tens(b.e,1,b.x,b.y,b.z) = v; + } + }); +} + +void CaarFunctorImpl::epoch4_scanOps() +{ + auto buffers_phi = viewAsReal(m_buffers.phi); + auto buffers_pnh = viewAsReal(m_buffers.pnh); + + const int data_n0 = m_data.n0; + + auto &geometry_phis = m_geometry.m_phis; + + auto state_phinh_i = viewAsReal(m_state.m_phinh_i); + + Kokkos::parallel_for( + "__PRETTY_FUNCTION__", + SphereScanOps::policy(m_num_elems), + KOKKOS_LAMBDA(const Team &team) { + const SphereScanOps s(team); + s.nacs(state_phinh_i, data_n0, buffers_pnh, geometry_phis); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(s.t, NUM_PHYSICAL_LEV), + [&](const int z) { + buffers_phi(s.e,s.x,s.y,z) = 0.5 * (state_phinh_i(s.e,data_n0,s.x,s.y,z) + state_phinh_i(s.e,data_n0,s.x,s.y,z+1)); + }); + }); +} + +template +void CaarFunctorImpl::epoch5_colOps() +{ + auto buffers_dp_i = viewAsReal(m_buffers.dp_i); + auto buffers_dpnh_dp_i = viewAsReal(m_buffers.dpnh_dp_i); + auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn); + auto buffers_exner = viewAsReal(m_buffers.exner); + auto buffers_grad_phinh_i = viewAsReal(m_buffers.grad_phinh_i); + auto buffers_grad_w_i = viewAsReal(m_buffers.grad_w_i); + auto buffers_phi = viewAsReal(m_buffers.phi); + auto buffers_phi_tens = viewAsReal(m_buffers.phi_tens); + auto buffers_pnh = viewAsReal(m_buffers.pnh); + auto buffers_temp = viewAsReal(m_buffers.temp); + auto buffers_v_i = viewAsReal(m_buffers.v_i); + auto buffers_vtheta_i = viewAsReal(m_buffers.vtheta_i); + auto buffers_w_tens = viewAsReal(m_buffers.w_tens); + + const int data_n0 = m_data.n0; + const Real dscale = m_data.scale1 - m_data.scale2; + + auto &geometry_gradphis = m_geometry.m_gradphis; + + const Real gscale1 = m_data.scale1 * PhysicalConstants::g; + const Real gscale2 = m_data.scale2 * PhysicalConstants::g; + + auto hvcoord_hybrid_bi_packed = viewAsReal(m_hvcoord.hybrid_bi_packed); + + const Real ndata_scale1 = -m_data.scale1; + const Real pi_i00 = m_hvcoord.ps0 * m_hvcoord.hybrid_ai0; + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = viewAsReal(m_state.m_dp3d); + auto state_phinh_i = viewAsReal(m_state.m_phinh_i); + auto state_v = viewAsReal(m_state.m_v); + auto state_w_i = viewAsReal(m_state.m_w_i); + + Kokkos::parallel_for( + "__PRETTY_FUNCTION__", + SphereColOps::policy(m_num_elems, NUM_INTERFACE_LEV), + KOKKOS_LAMBDA(const Team &team) { + + const SphereColOps c(sg, team); + + c.grad(buffers_grad_phinh_i, state_phinh_i, data_n0); + if (!HYDROSTATIC) c.grad(buffers_grad_w_i, state_w_i, data_n0); + + const Real dm = (c.z == 0) ? 0 : state_dp3d(c.e,data_n0,c.x,c.y,c.z-1); + const Real dz = (c.z == NUM_PHYSICAL_LEV) ? 0 : state_dp3d(c.e,data_n0,c.x,c.y,c.z); + const Real dp_i = (c.z == 0) ? dz : (c.z == NUM_PHYSICAL_LEV) ? dm : 0.5 * (dz + dm); + buffers_dp_i(c.e,c.x,c.y,c.z) = dp_i; + + if (!HYDROSTATIC) { + + const Real v0m = (c.z == 0) ? 0 : state_v(c.e,data_n0,0,c.x,c.y,c.z-1); + const Real v0z = (c.z == NUM_PHYSICAL_LEV) ? 0 : state_v(c.e,data_n0,0,c.x,c.y,c.z); + const Real v_i0 = (c.z == 0) ? v0z : (c.z == NUM_PHYSICAL_LEV) ? v0m : (dz * v0z + dm * v0m) / (dm + dz); + buffers_v_i(c.e,0,c.x,c.y,c.z) = v_i0; + + const Real v1m = (c.z == 0) ? 0 : state_v(c.e,data_n0,1,c.x,c.y,c.z-1); + const Real v1z = (c.z == NUM_PHYSICAL_LEV) ? 0 : state_v(c.e,data_n0,1,c.x,c.y,c.z); + const Real v_i1 = (c.z == 0) ? v1z : (c.z == NUM_PHYSICAL_LEV) ? v1m :(dz * v1z + dm * v1m) / (dm + dz); + buffers_v_i(c.e,1,c.x,c.y,c.z) = v_i1; + + const Real pm = (c.z == 0) ? pi_i00 : buffers_pnh(c.e,c.x,c.y,c.z-1); + const Real pz = (c.z == NUM_PHYSICAL_LEV) ? pm + 0.5 * dm : buffers_pnh(c.e,c.x,c.y,c.z); + buffers_dpnh_dp_i(c.e,c.x,c.y,c.z) = 2.0 * (pz - pm) / (dm + dz); + } + + if (RSPLIT_ZERO) { + + const Real phim = (c.z == 0) ? 0 : buffers_phi(c.e,c.x,c.y,c.z-1); + const Real phiz = (c.z == NUM_PHYSICAL_LEV) ? 0 : buffers_phi(c.e,c.x,c.y,c.z); + + if (!HYDROSTATIC) { + const Real phi_vadv = ((c.z == 0) || (c.z == NUM_PHYSICAL_LEV)) ? 0 : (phiz - phim) * buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z) / dp_i; + buffers_phi_tens(c.e,c.x,c.y,c.z) = phi_vadv; + } + + Real vtheta_i = phiz - phim; + if (!(c.z == 0) && !(c.z == NUM_PHYSICAL_LEV)) { + const Real dexner = buffers_exner(c.e,c.x,c.y,c.z) - buffers_exner(c.e,c.x,c.y,c.z-1); + vtheta_i /= dexner; + } + vtheta_i /= -PhysicalConstants::cp; + if (!HYDROSTATIC) vtheta_i *= buffers_dpnh_dp_i(c.e,c.x,c.y,c.z); + buffers_vtheta_i(c.e,c.x,c.y,c.z) = vtheta_i; + } + + if (!HYDROSTATIC) { + + Real w_tens = 0; + if (RSPLIT_ZERO) { + const Real tempm = (c.z == 0) ? 0 : buffers_temp(c.e,c.x,c.y,c.z-1); + const Real tempz = (c.z == NUM_PHYSICAL_LEV) ? 0 : buffers_temp(c.e,c.x,c.y,c.z); + const Real dw = (c.z == 0) ? tempz : (c.z == NUM_PHYSICAL_LEV) ? tempm : 0.5 * (tempz + tempm); + w_tens = dw / dp_i; + } + w_tens += buffers_v_i(c.e,0,c.x,c.y,c.z) * buffers_grad_w_i(c.e,0,c.x,c.y,c.z) + buffers_v_i(c.e,1,c.x,c.y,c.z) * buffers_grad_w_i(c.e,1,c.x,c.y,c.z); + w_tens *= ndata_scale1; + const Real gscale = (c.z == NUM_PHYSICAL_LEV) ? gscale1 : gscale2; + w_tens += (buffers_dpnh_dp_i(c.e,c.x,c.y,c.z)-Real(1)) * gscale; + buffers_w_tens(c.e,c.x,c.y,c.z) = w_tens; + + Real phi_tens = (RSPLIT_ZERO) ? buffers_phi_tens(c.e,c.x,c.y,c.z) : 0; + phi_tens += buffers_v_i(c.e,0,c.x,c.y,c.z) * buffers_grad_phinh_i(c.e,0,c.x,c.y,c.z) + buffers_v_i(c.e,1,c.x,c.y,c.z) * buffers_grad_phinh_i(c.e,1,c.x,c.y,c.z); + phi_tens *= ndata_scale1; + phi_tens += state_w_i(c.e,data_n0,c.x,c.y,c.z) * gscale; + + if (dscale) phi_tens += dscale * (buffers_v_i(c.e,0,c.x,c.y,c.z) * geometry_gradphis(c.e,0,c.x,c.y) + buffers_v_i(c.e,1,c.x,c.y,c.z) * geometry_gradphis(c.e,1,c.x,c.y)) * hvcoord_hybrid_bi_packed(c.z); + + buffers_phi_tens(c.e,c.x,c.y,c.z) = phi_tens; + } + }); +} + +template +void CaarFunctorImpl::epoch6_blockOps() +{ + auto buffers_dpnh_dp_i = viewAsReal(m_buffers.dpnh_dp_i); + auto buffers_exner = viewAsReal(m_buffers.exner); + auto buffers_grad_phinh_i = viewAsReal(m_buffers.grad_phinh_i); + auto buffers_grad_w_i = viewAsReal(m_buffers.grad_w_i); + auto buffers_v_tens = viewAsReal(m_buffers.v_tens); + + const int data_n0 = m_data.n0; + + auto &geometry_fcor = m_geometry.m_fcor; + + const SphereGlobal sg(m_sphere_ops); + + auto state_dp3d = viewAsReal(m_state.m_dp3d); + auto state_v = viewAsReal(m_state.m_v); + auto state_vtheta_dp= viewAsReal(m_state.m_vtheta_dp); + auto state_w_i = viewAsReal(m_state.m_w_i); + + Kokkos::parallel_for( + "caar compute_v_tens", + SphereBlockOps::policy(m_num_elems, 6), + KOKKOS_LAMBDA(const Team &team) { + + SphereBlockOps b(sg, team); + if (b.skip()) return; + + SphereBlockScratch ttmp0(b); + const Real w2 = (HYDROSTATIC) ? 0 : 0.25 * (state_w_i(b.e,data_n0,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + state_w_i(b.e,data_n0,b.x,b.y,b.z+1) * state_w_i(b.e,data_n0,b.x,b.y,b.z+1)); + b.gradInit(ttmp0, w2); + + SphereBlockScratch ttmp1(b); + const Real exneriz = buffers_exner(b.e,b.x,b.y,b.z); + b.gradInit(ttmp1, exneriz); + + SphereBlockScratch ttmp2(b); + const Real log_exneriz = (PGRAD_CORRECTION) ? log(exneriz) : 0; + b.gradInit(ttmp2, log_exneriz); + + const Real v0 = state_v(b.e,data_n0,0,b.x,b.y,b.z); + const Real v1 = state_v(b.e,data_n0,1,b.x,b.y,b.z); + + SphereBlockScratch ttmp3(b); + SphereBlockScratch ttmp4(b); + b.vortInit(ttmp3, ttmp4, v0, v1); + + SphereBlockScratch ttmp5(b); + b.gradInit(ttmp5, 0.5 * (v0 * v0 + v1 * v1)); + + b.barrier(); + + Real grad_v0, grad_v1; + b.grad(grad_v0, grad_v1, ttmp5); + + Real u_tens = (RSPLIT_ZERO) ? buffers_v_tens(b.e,0,b.x,b.y,b.z) : 0; + Real v_tens = (RSPLIT_ZERO) ? buffers_v_tens(b.e,1,b.x,b.y,b.z) : 0; + u_tens += grad_v0; + v_tens += grad_v1; + + const Real cp_vtheta = PhysicalConstants::cp * (state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z) / state_dp3d(b.e,data_n0,b.x,b.y,b.z)); + + Real grad_exner0, grad_exner1; + b.grad(grad_exner0, grad_exner1, ttmp1); + + u_tens += cp_vtheta * grad_exner0; + v_tens += cp_vtheta * grad_exner1; + + Real mgrad_x, mgrad_y; + if (HYDROSTATIC) { + + mgrad_x = 0.5 * (buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z) + buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z+1)); + mgrad_y = 0.5 * (buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z) + buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z+1)); + + } else { + + mgrad_x = 0.5 * (buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z) + buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z+1) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z+1)); + mgrad_y = 0.5 * (buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z) + buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z+1) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z+1)); + + } + + if (PGRAD_CORRECTION) { + + Real grad_lexner0, grad_lexner1; + b.grad(grad_lexner0, grad_lexner1, ttmp2); + + namespace PC = PhysicalConstants; + constexpr Real cpt0 = PC::cp * (PC::Tref - PC::Tref_lapse_rate * PC::Tref * PC::cp / PC::g); + mgrad_x += cpt0 * (grad_lexner0 - grad_exner0 / exneriz); + mgrad_y += cpt0 * (grad_lexner1 - grad_exner1 / exneriz); + } + + Real wvor_x = 0; + Real wvor_y = 0; + if (!HYDROSTATIC) { + b.grad(wvor_x, wvor_y, ttmp0); + wvor_x -= 0.5 * (buffers_grad_w_i(b.e,0,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + buffers_grad_w_i(b.e,0,b.x,b.y,b.z+1) * state_w_i(b.e,data_n0,b.x,b.y,b.z+1)); + wvor_y -= 0.5 * (buffers_grad_w_i(b.e,1,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + buffers_grad_w_i(b.e,1,b.x,b.y,b.z+1) * state_w_i(b.e,data_n0,b.x,b.y,b.z+1)); + } + + u_tens += mgrad_x + wvor_x; + v_tens += mgrad_y + wvor_y; + + const Real vort = b.vort(ttmp3, ttmp4) + geometry_fcor(b.e,b.x,b.y); + u_tens -= v1 * vort; + v_tens += v0 * vort; + + buffers_v_tens(b.e,0,b.x,b.y,b.z) = u_tens; + buffers_v_tens(b.e,1,b.x,b.y,b.z) = v_tens; + }); +} + +template +void CaarFunctorImpl::epoch7_col() +{ + auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens); + auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn); + auto buffers_phi_tens = viewAsReal(m_buffers.phi_tens); + auto buffers_theta_tens = viewAsReal(m_buffers.theta_tens); + auto buffers_v_tens = viewAsReal(m_buffers.v_tens); + auto buffers_vtheta_i = viewAsReal(m_buffers.vtheta_i); + auto buffers_w_tens = viewAsReal(m_buffers.w_tens); + + const Real data_dt = m_data.dt; + const int data_nm1 = m_data.nm1; + const int data_np1 = m_data.np1; + const Real data_scale3 = m_data.scale3; + + auto &geometry_spheremp = m_geometry.m_spheremp; + + const Real scale1_dt = m_data.scale1 * m_data.dt; + + auto state_dp3d = viewAsReal(m_state.m_dp3d); + auto state_phinh_i = viewAsReal(m_state.m_phinh_i); + auto state_v = viewAsReal(m_state.m_v); + auto state_vtheta_dp = viewAsReal(m_state.m_vtheta_dp); + auto state_w_i = viewAsReal(m_state.m_w_i); + + Kokkos::parallel_for( + "__PRETTY_FUNCTION__", + SphereCol::policy(m_num_elems, NUM_PHYSICAL_LEV), + KOKKOS_LAMBDA(const Team &team) { + + const SphereCol c(team); + + const Real spheremp = geometry_spheremp(c.e,c.x,c.y); + const Real scale1_dt_spheremp = scale1_dt * spheremp; + const Real scale3_spheremp = data_scale3 * spheremp; + + Real dp_tens = buffers_dp_tens(c.e,c.x,c.y,c.z); + dp_tens *= scale1_dt_spheremp; + Real dp_np1 = scale3_spheremp * state_dp3d(c.e,data_nm1,c.x,c.y,c.z); + dp_np1 -= dp_tens; + state_dp3d(c.e,data_np1,c.x,c.y,c.z) = dp_np1; + + Real theta_tens = buffers_theta_tens(c.e,c.x,c.y,c.z); + if (RSPLIT_ZERO) { + const Real etap = buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z+1); + const Real etaz = buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z); + const Real thetap = etap * buffers_vtheta_i(c.e,c.x,c.y,c.z+1); + const Real thetaz = etaz * buffers_vtheta_i(c.e,c.x,c.y,c.z); + theta_tens += thetap - thetaz; + } + theta_tens *= -scale1_dt_spheremp; + Real vtheta_np1 = state_vtheta_dp(c.e,data_nm1,c.x,c.y,c.z); + vtheta_np1 *= scale3_spheremp; + vtheta_np1 += theta_tens; + state_vtheta_dp(c.e,data_np1,c.x,c.y,c.z) = vtheta_np1; + + Real u_tens = buffers_v_tens(c.e,0,c.x,c.y,c.z); + u_tens *= -scale1_dt_spheremp; + Real u_np1 = state_v(c.e,data_nm1,0,c.x,c.y,c.z); + u_np1 *= scale3_spheremp; + u_np1 += u_tens; + state_v(c.e,data_np1,0,c.x,c.y,c.z) = u_np1; + + Real v_tens = buffers_v_tens(c.e,1,c.x,c.y,c.z); + v_tens *= -scale1_dt_spheremp; + Real v_np1 = state_v(c.e,data_nm1,1,c.x,c.y,c.z); + v_np1 *= scale3_spheremp; + v_np1 += v_tens; + state_v(c.e,data_np1,1,c.x,c.y,c.z) = v_np1; + + if (!HYDROSTATIC) { + + const Real dt_spheremp = data_dt * spheremp; + + Real phi_tens = buffers_phi_tens(c.e,c.x,c.y,c.z); + phi_tens *= dt_spheremp; + Real phi_np1 = state_phinh_i(c.e,data_nm1,c.x,c.y,c.z); + phi_np1 *= scale3_spheremp; + phi_np1 += phi_tens; + state_phinh_i(c.e,data_np1,c.x,c.y,c.z) = phi_np1; + + Real w_tens = buffers_w_tens(c.e,c.x,c.y,c.z); + w_tens *= dt_spheremp; + Real w_np1 = state_w_i(c.e,data_nm1,c.x,c.y,c.z); + w_np1 *= scale3_spheremp; + w_np1 += w_tens; + state_w_i(c.e,data_np1,c.x,c.y,c.z) = w_np1; + + if (c.z == NUM_PHYSICAL_LEV-1) { + Real w_tens = buffers_w_tens(c.e,c.x,c.y,NUM_PHYSICAL_LEV); + w_tens *= dt_spheremp; + buffers_w_tens(c.e,c.x,c.y,NUM_PHYSICAL_LEV) = w_tens; + Real w_np1 = state_w_i(c.e,data_nm1,c.x,c.y,NUM_PHYSICAL_LEV); + w_np1 *= scale3_spheremp; + w_np1 += w_tens; + state_w_i(c.e,data_np1,c.x,c.y,NUM_PHYSICAL_LEV) = w_np1; + } + } + }); +} + +void CaarFunctorImpl::caar_compute() +{ + if (m_theta_hydrostatic_mode) { + if (m_theta_advection_form == AdvectionForm::Conservative) epoch1_blockOps(); + else epoch1_blockOps(); + } else { + if (m_theta_advection_form == AdvectionForm::Conservative) epoch1_blockOps(); + else epoch1_blockOps(); + } + + if (m_rsplit == 0) epoch2_scanOps(); + else epoch2_scanOps(); + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) epoch3_blockOps(); + else epoch3_blockOps(); + } else { + if (m_rsplit == 0) epoch3_blockOps(); + else epoch3_blockOps(); + } + + if (m_theta_hydrostatic_mode) epoch4_scanOps(); + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) epoch5_colOps(); + else epoch5_colOps(); + } else { + if (m_rsplit == 0) epoch5_colOps(); + else epoch5_colOps(); + } + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) { + if (m_pgrad_correction) epoch6_blockOps(); + else epoch6_blockOps(); + } else { + if (m_pgrad_correction) epoch6_blockOps(); + else epoch6_blockOps(); + } + } else { + if (m_rsplit == 0) { + if (m_pgrad_correction) epoch6_blockOps(); + else epoch6_blockOps(); + } else { + if (m_pgrad_correction) epoch6_blockOps(); + else epoch6_blockOps(); + } + } + + if (m_theta_hydrostatic_mode) { + if (m_rsplit == 0) epoch7_col(); + else epoch7_col(); + } else { + if (m_rsplit == 0) epoch7_col(); + else epoch7_col(); + } +} + +} diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp index 87e570b568e4..5f634b83b951 100644 --- a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp @@ -118,6 +118,17 @@ struct CaarFunctorImpl { Kokkos::Array, NUM_TIME_LEVELS> m_bes; + private: + template void epoch1_blockOps(); + template void epoch2_scanOps(); + template void epoch3_blockOps(); + void epoch4_scanOps(); + template void epoch5_colOps(); + template void epoch6_blockOps(); + template void epoch7_col(); + void caar_compute(); + + public: CaarFunctorImpl(const Elements &elements, const Tracers &/* tracers */, const ReferenceElement &ref_FE, const HybridVCoord &hvcoord, const SphereOperators &sphere_ops, const SimulationParams& params) @@ -187,7 +198,7 @@ struct CaarFunctorImpl { if (m_theta_hydrostatic_mode) { // pi=pnh, and no wtens/phitens num_scalar_mid_buf -= 1; - num_scalar_int_buf -= 3; + num_scalar_int_buf -= 2; // No grad_w_i/v_i num_vector_int_buf -= 2; @@ -197,7 +208,7 @@ struct CaarFunctorImpl { num_scalar_int_buf -=2; if (m_theta_hydrostatic_mode) { // No dp_i - num_scalar_int_buf -= 1; + //num_scalar_int_buf -= 1; } } @@ -255,10 +266,10 @@ struct CaarFunctorImpl { mem += m_buffers.v_tens.size(); // Interface scalars - if (!m_theta_hydrostatic_mode || m_rsplit==0) { + //if (!m_theta_hydrostatic_mode || m_rsplit==0) { m_buffers.dp_i = decltype(m_buffers.dp_i)(mem,nslots); mem += m_buffers.dp_i.size(); - } + //} if (!m_theta_hydrostatic_mode) { m_buffers.dpnh_dp_i = decltype(m_buffers.dpnh_dp_i)(mem,nslots); @@ -275,9 +286,9 @@ struct CaarFunctorImpl { if (!m_theta_hydrostatic_mode) { m_buffers.phi_tens = decltype(m_buffers.phi_tens )(mem,nslots); mem += m_buffers.phi_tens.size(); - m_buffers.w_tens = decltype(m_buffers.w_tens )(mem,nslots); - mem += m_buffers.w_tens.size(); } + m_buffers.w_tens = decltype(m_buffers.w_tens )(mem,nslots); + mem += m_buffers.w_tens.size(); // Interface vectors if (!m_theta_hydrostatic_mode) { @@ -341,6 +352,17 @@ struct CaarFunctorImpl { profiling_resume(); GPTLstart("caar compute"); + +#undef TREY +//#define TREY +#ifdef TREY + + caar_compute(); + Kokkos::fence(); + GPTLstop("caar compute"); + +#else + int nerr; Kokkos::parallel_reduce("caar loop pre-boundary exchange", m_policy_pre, *this, nerr); Kokkos::fence(); @@ -348,6 +370,8 @@ struct CaarFunctorImpl { if (nerr > 0) check_print_abort_on_bad_elems("CaarFunctorImpl::run TagPreExchange", data.n0); +#endif + GPTLstart("caar_bexchV"); m_bes[data.np1]->exchange(m_geometry.m_rspheremp); Kokkos::fence(); @@ -539,8 +563,10 @@ struct CaarFunctorImpl { auto dp = Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0,igp,jgp); auto div_vdp = Homme::subview(m_buffers.div_vdp,kv.team_idx,igp,jgp); auto pi = Homme::subview(m_buffers.pi,kv.team_idx,igp,jgp); - auto omega_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp); - auto pi_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,1,igp,jgp); + //auto omega_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp); + auto omega_i = Homme::subview(m_buffers.w_tens,kv.team_idx,igp,jgp); + //auto pi_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,1,igp,jgp); + auto pi_i = Homme::subview(m_buffers.dp_i,kv.team_idx,igp,jgp); Kokkos::single(Kokkos::PerThread(kv.team),[&]() { pi_i(0)[0] = m_hvcoord.ps0*m_hvcoord.hybrid_ai0; @@ -561,11 +587,13 @@ struct CaarFunctorImpl { kv.team_barrier(); ColumnOps::column_scan_mid_to_int(kv,div_vdp,omega_i); + kv.team_barrier(); // Average omega_i to midpoints, and change sign, since later // omega=v*grad(pi)-average(omega_i) auto omega = Homme::subview(m_buffers.omega_p,kv.team_idx,igp,jgp); ColumnOps::compute_midpoint_values(kv,omega_i,omega,-1.0); }); + kv.team_barrier(); // Compute grad(pi) @@ -637,6 +665,7 @@ struct CaarFunctorImpl { // Compute interface horiz velocity auto u_i = Homme::subview(m_buffers.v_i,kv.team_idx,0,igp,jgp); auto v_i = Homme::subview(m_buffers.v_i,kv.team_idx,1,igp,jgp); + ColumnOps::compute_interface_values(kv.team,dp,dp_i,u,u_i); ColumnOps::compute_interface_values(kv.team,dp,dp_i,v,v_i); @@ -1133,7 +1162,6 @@ struct CaarFunctorImpl { ColumnOps::compute_midpoint_values(kv, w_sq, Homme::subview(m_buffers.temp,kv.team_idx,igp,jgp),0.5); }); kv.team_barrier(); - // Compute grad(average(w^2/2)). Store in wvor. m_sphere_ops.gradient_sphere(kv, Homme::subview(m_buffers.temp,kv.team_idx), wvor); @@ -1174,8 +1202,8 @@ struct CaarFunctorImpl { if (!m_theta_hydrostatic_mode) { // Compute wvor = grad(average(w^2/2)) - average(w*grad(w)) // Note: vtens is already storing grad(avg(w^2/2)) - auto gradw_x = Homme::subview(m_buffers.v_i,kv.team_idx,0,igp,jgp); - auto gradw_y = Homme::subview(m_buffers.v_i,kv.team_idx,1,igp,jgp); + auto gradw_x = Homme::subview(m_buffers.grad_w_i,kv.team_idx,0,igp,jgp); + auto gradw_y = Homme::subview(m_buffers.grad_w_i,kv.team_idx,1,igp,jgp); auto w_i = Homme::subview(m_state.m_w_i,kv.ie,m_data.n0,igp,jgp); const auto w_gradw_x = [&gradw_x,&w_i] (const int ilev) -> Scalar { diff --git a/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp b/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp index dd97720f1be2..b22592db5617 100644 --- a/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp @@ -248,6 +248,12 @@ class EquationOfState { ColumnOps::column_scan_mid_to_int(kv,integrand_provider,phi_i); } + KOKKOS_INLINE_FUNCTION static + Real compute_dphi (const Real vtheta_dp, const Real exner, const Real p) { + return PhysicalConstants::Rgas*vtheta_dp*exner/p; + } + + public: bool m_theta_hydrostatic_mode; From 91165b2daba4a6b2b7351380847ec72f55c7e5fa Mon Sep 17 00:00:00 2001 From: Trey White Date: Mon, 22 Jul 2024 17:59:52 -0400 Subject: [PATCH 3/5] Changed #ifdef to #if 1 in CaarFunctorImpl.hpp. --- components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp index 5f634b83b951..ddddeb775496 100644 --- a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp @@ -353,9 +353,7 @@ struct CaarFunctorImpl { GPTLstart("caar compute"); -#undef TREY -//#define TREY -#ifdef TREY +#if 1 caar_compute(); Kokkos::fence(); From 308adf792fd07f2d13ad7932ad6f83108bdf5d57 Mon Sep 17 00:00:00 2001 From: "White, Trey" Date: Mon, 29 Jul 2024 13:21:51 -0400 Subject: [PATCH 4/5] Added #ifdef for WARP_SIZE in SphereOperators. --- components/homme/src/share/cxx/SphereOperators.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/components/homme/src/share/cxx/SphereOperators.hpp b/components/homme/src/share/cxx/SphereOperators.hpp index 0b528c1f1b5b..9af9dc604b7c 100644 --- a/components/homme/src/share/cxx/SphereOperators.hpp +++ b/components/homme/src/share/cxx/SphereOperators.hpp @@ -1180,7 +1180,13 @@ using TeamPolicy = Kokkos::TeamPolicy; using Team = TeamPolicy::member_type; static constexpr int NPNP = NP * NP; +#if defined(KOKKOS_ENABLE_HIP) static constexpr int WARP_SIZE = warpSize; +#elif defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) +static constexpr int WARP_SIZE = 32; +#else +static constexpr int WARP_SIZE = 1; +#endif static constexpr int SPHERE_BLOCK_LEV = WARP_SIZE; static constexpr int SPHERE_BLOCKS_PER_COL = (NUM_PHYSICAL_LEV - 1) / SPHERE_BLOCK_LEV + 1; From e8b2fa87b02ef93efcdf33eb69d5b15514000cb5 Mon Sep 17 00:00:00 2001 From: "White, Trey" Date: Mon, 29 Jul 2024 14:53:09 -0400 Subject: [PATCH 5/5] Ensured CaarFunctorImpl buffers have at least m_num_elem nslots. --- .../homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp | 12 ++++++------ .../homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp | 5 +++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp index 13d4e5090c43..f319bffce31d 100644 --- a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp @@ -25,7 +25,7 @@ void CaarFunctorImpl::epoch1_blockOps() auto state_vtheta_dp= viewAsReal(m_state.m_vtheta_dp); Kokkos::parallel_for( - "__PRETTY_FUNCTION__", + __PRETTY_FUNCTION__, SphereBlockOps::policy(m_num_elems, 4), KOKKOS_LAMBDA(const Team &team) { @@ -98,7 +98,7 @@ void CaarFunctorImpl::epoch2_scanOps() auto state_dp3d = viewAsReal(m_state.m_dp3d); Kokkos::parallel_for( - "__PRETTY_FUNCTION__", + __PRETTY_FUNCTION__, SphereScanOps::policy(m_num_elems), KOKKOS_LAMBDA(const Team &team) { @@ -157,7 +157,7 @@ void CaarFunctorImpl::epoch3_blockOps() auto state_w_i = viewAsReal(m_state.m_w_i); Kokkos::parallel_for( - "__PRETTY_FUNCTION__", + __PRETTY_FUNCTION__, SphereBlockOps::policy(m_num_elems, 1), KOKKOS_LAMBDA(const Team &team) { @@ -234,7 +234,7 @@ void CaarFunctorImpl::epoch4_scanOps() auto state_phinh_i = viewAsReal(m_state.m_phinh_i); Kokkos::parallel_for( - "__PRETTY_FUNCTION__", + __PRETTY_FUNCTION__, SphereScanOps::policy(m_num_elems), KOKKOS_LAMBDA(const Team &team) { const SphereScanOps s(team); @@ -285,7 +285,7 @@ void CaarFunctorImpl::epoch5_colOps() auto state_w_i = viewAsReal(m_state.m_w_i); Kokkos::parallel_for( - "__PRETTY_FUNCTION__", + __PRETTY_FUNCTION__, SphereColOps::policy(m_num_elems, NUM_INTERFACE_LEV), KOKKOS_LAMBDA(const Team &team) { @@ -502,7 +502,7 @@ void CaarFunctorImpl::epoch7_col() auto state_w_i = viewAsReal(m_state.m_w_i); Kokkos::parallel_for( - "__PRETTY_FUNCTION__", + __PRETTY_FUNCTION__, SphereCol::policy(m_num_elems, NUM_PHYSICAL_LEV), KOKKOS_LAMBDA(const Team &team) { diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp index ddddeb775496..fc3c93f12ad3 100644 --- a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp +++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp @@ -30,6 +30,7 @@ #include "profiling.hpp" #include "ErrorDefs.hpp" +#include #include namespace Homme { @@ -186,7 +187,7 @@ struct CaarFunctorImpl { int requested_buffer_size () const { // Ask the buffers manager to allocate enough buffers to satisfy Caar's needs - const int nslots = m_tu.get_num_ws_slots(); + const int nslots = std::max(m_tu.get_num_ws_slots(), m_num_elems); int num_scalar_mid_buf = Buffers::num_3d_scalar_mid_buf; int num_scalar_int_buf = Buffers::num_3d_scalar_int_buf; @@ -222,7 +223,7 @@ struct CaarFunctorImpl { Errors::runtime_check(fbm.allocated_size()>=requested_buffer_size(), "Error! Buffers size not sufficient.\n"); Scalar* mem = reinterpret_cast(fbm.get_memory()); - const int nslots = m_tu.get_num_ws_slots(); + const int nslots = std::max(m_tu.get_num_ws_slots(), m_num_elems); // Midpoints scalars m_buffers.pnh = decltype(m_buffers.pnh )(mem,nslots);