diff --git a/cime_config/machines/config_machines.xml b/cime_config/machines/config_machines.xml
index a1cf8744de83..e471a4756e68 100644
--- a/cime_config/machines/config_machines.xml
+++ b/cime_config/machines/config_machines.xml
@@ -1441,22 +1441,19 @@
/usr/share/lmod/lmod/libexec/lmod python
- PrgEnv-cray
+ cpe/23.12
craype-accel-amd-gfx90a
- rocm/5.4.0
+ rocm/5.7.1
libunwind/1.6.2
- cce/15.0.1
- craype craype/2.7.20
- cray-mpich cray-mpich/8.1.26
cray-python/3.9.13.1
subversion/1.14.1
git/2.36.1
cmake/3.21.3
- cray-hdf5-parallel/1.12.2.1
- cray-netcdf-hdf5parallel/4.9.0.1
- cray-parallel-netcdf/1.12.3.1
+ cray-hdf5-parallel
+ cray-netcdf-hdf5parallel
+ cray-parallel-netcdf
darshan-runtime
diff --git a/components/eamxx/cmake/machine-files/crusher-scream-cpu.cmake b/components/eamxx/cmake/machine-files/crusher-scream-cpu.cmake
deleted file mode 100644
index 58f15881e31c..000000000000
--- a/components/eamxx/cmake/machine-files/crusher-scream-cpu.cmake
+++ /dev/null
@@ -1,6 +0,0 @@
-include(${CMAKE_CURRENT_LIST_DIR}/common.cmake)
-common_setup()
-
-include (${EKAT_MACH_FILES_PATH}/kokkos/serial.cmake)
-include (${EKAT_MACH_FILES_PATH}/mpi/srun.cmake)
-
diff --git a/components/eamxx/cmake/machine-files/crusher-scream-gpu.cmake b/components/eamxx/cmake/machine-files/crusher-scream-gpu.cmake
deleted file mode 100644
index 391a9b8d6880..000000000000
--- a/components/eamxx/cmake/machine-files/crusher-scream-gpu.cmake
+++ /dev/null
@@ -1,17 +0,0 @@
-include(${CMAKE_CURRENT_LIST_DIR}/common.cmake)
-common_setup()
-
-#serial is needed, but maybe it is always on?
-include (${EKAT_MACH_FILES_PATH}/kokkos/mi250.cmake)
-include (${EKAT_MACH_FILES_PATH}/kokkos/hip.cmake)
-include (${EKAT_MACH_FILES_PATH}/mpi/srun.cmake)
-
-SET(MPICH_DIR "/opt/cray/pe/mpich/8.1.16/ofi/crayclang/10.0" CACHE STRING "")
-
-set(CMAKE_CXX_FLAGS "--amdgpu-target=gfx90a -fno-gpu-rdc -I${MPICH_DIR}/include" CACHE STRING "" FORCE)
-set(CMAKE_Fortran_FLAGS "-I${MPICH_DIR}/include" CACHE STRING "" FORCE)
-set(CMAKE_C_FLAGS "-I${MPICH_DIR}/include" CACHE STRING "" FORCE)
-set(CMAKE_EXE_LINKER_FLAGS "-L${MPICH_DIR}/lib -lmpi -L/opt/cray/pe/mpich/8.1.16/gtl/lib -lmpi_gtl_hsa" CACHE STRING "" FORCE)
-
-
-
diff --git a/components/eamxx/cmake/machine-files/frontier-scream-gpu.cmake b/components/eamxx/cmake/machine-files/frontier-scream-gpu.cmake
index 14d1d501160d..bb1f4455a534 100644
--- a/components/eamxx/cmake/machine-files/frontier-scream-gpu.cmake
+++ b/components/eamxx/cmake/machine-files/frontier-scream-gpu.cmake
@@ -6,4 +6,4 @@ include (${EKAT_MACH_FILES_PATH}/kokkos/hip.cmake)
set(SCREAM_MPIRUN_EXE "srun" CACHE STRING "")
set(SCREAM_MACHINE "frontier-scream-gpu" CACHE STRING "")
-set(CMAKE_CXX_FLAGS "--amdgpu-target=gfx90a -fno-gpu-rdc -I$ENV{MPICH_DIR}/include" CACHE STRING "" FORCE)
+set(CMAKE_CXX_FLAGS "--offload-arch=gfx90a -fno-gpu-rdc -I$ENV{MPICH_DIR}/include" CACHE STRING "" FORCE)
diff --git a/components/homme/cmake/machineFiles/frontier.cmake b/components/homme/cmake/machineFiles/frontier.cmake
new file mode 100644
index 000000000000..84a334ad5ce5
--- /dev/null
+++ b/components/homme/cmake/machineFiles/frontier.cmake
@@ -0,0 +1,55 @@
+SET (HOMME_MACHINE "frontier" CACHE STRING "")
+
+SET (HOMMEXX_CUDA_MAX_WARP_PER_TEAM "16" CACHE STRING "")
+
+SET (NETCDF_DIR $ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX} CACHE FILEPATH "")
+SET (HDF5_DIR $ENV{CRAY_HDF5_PARALLEL_PREFIX} CACHE FILEPATH "")
+SET (CPRNC_DIR /ccs/proj/cli115/software/frontier/cprnc CACHE FILEPATH "")
+
+SET(BUILD_HOMME_WITHOUT_PIOLIBRARY TRUE CACHE BOOL "")
+
+SET(HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "")
+
+SET(WITH_PNETCDF FALSE CACHE FILEPATH "")
+
+SET(USE_QUEUING FALSE CACHE BOOL "")
+
+SET(BUILD_HOMME_PREQX_KOKKOS TRUE CACHE BOOL "")
+SET(BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "")
+
+SET (HOMMEXX_BFB_TESTING FALSE CACHE BOOL "")
+
+SET(USE_TRILINOS OFF CACHE BOOL "")
+
+SET(HIP_BUILD TRUE CACHE BOOL "")
+
+SET(Kokkos_ENABLE_SERIAL ON CACHE BOOL "")
+SET(Kokkos_ENABLE_DEBUG OFF CACHE BOOL "")
+SET(Kokkos_ARCH_VEGA90A ON CACHE BOOL "")
+SET(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "")
+SET(Kokkos_ENABLE_HIP ON CACHE BOOL "")
+SET(Kokkos_ENABLE_EXPLICIT_INSTANTIATION OFF CACHE BOOL "")
+
+SET(CMAKE_C_COMPILER "cc" CACHE STRING "")
+SET(CMAKE_Fortran_COMPILER "ftn" CACHE STRING "")
+#SET(CMAKE_Fortran_FLAGS "--gcc-toolchain=$ENV{MEMBERWORK}/cli115/workaround" CACHE STRING "")
+SET(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "")
+
+SET(Extrae_LIBRARY "-I$ENV{CRAY_MPICH_DIR}/include -L$ENV{CRAY_MPICH_DIR}/lib -lmpi $ENV{PE_MPICH_GTL_DIR_amd_gfx90a} $ENV{PE_MPICH_GTL_LIBS_amd_gfx90a}" CACHE STRING "")
+
+#SET(ADD_Fortran_FLAGS "-G2 --gcc-toolchain=$ENV{MEMBERWORK}/cli115/workaround ${Extrae_LIBRARY}" CACHE STRING "")
+SET(ADD_C_FLAGS "-g -O ${Extrae_LIBRARY}" CACHE STRING "")
+SET(ADD_CXX_FLAGS "-g -std=c++14 -O3 -munsafe-fp-atomics --offload-arch=gfx90a -fno-gpu-rdc -Wno-unused-command-line-argument -Wno-unsupported-floating-point-opt -Wno-#pragma-messages ${Extrae_LIBRARY}" CACHE STRING "")
+SET(ADD_LINKER_FLAGS "${Extrae_LIBRARY}" CACHE STRING "")
+
+
+set (ENABLE_OPENMP OFF CACHE BOOL "")
+set (ENABLE_COLUMN_OPENMP OFF CACHE BOOL "")
+set (ENABLE_HORIZ_OPENMP OFF CACHE BOOL "")
+
+set (HOMME_TESTING_PROFILE "short" CACHE STRING "")
+SET (BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "")
+
+set (USE_NUM_PROCS 8 CACHE STRING "")
+
+SET (USE_MPI_OPTIONS "-c7 --gpu-bind=closest --gpus-per-task=1" CACHE FILEPATH "")
diff --git a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp
index 688007e27f24..79263b98720b 100644
--- a/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp
+++ b/components/homme/src/preqx_kokkos/cxx/HyperviscosityFunctorImpl.hpp
@@ -297,13 +297,8 @@ class HyperviscosityFunctorImpl
SphereOperators m_sphere_ops;
// Policies
-#ifndef NDEBUG
- template
- using TeamPolicyType = Kokkos::TeamPolicy,Tag>;
-#else
template
using TeamPolicyType = Kokkos::TeamPolicy;
-#endif
Kokkos::RangePolicy m_policy_update_states;
TeamPolicyType m_policy_first_laplace;
TeamPolicyType m_policy_pre_exchange;
diff --git a/components/homme/src/share/compose/compose_homme.hpp b/components/homme/src/share/compose/compose_homme.hpp
index a3b40a204a9a..e4a826318f40 100644
--- a/components/homme/src/share/compose/compose_homme.hpp
+++ b/components/homme/src/share/compose/compose_homme.hpp
@@ -200,7 +200,7 @@ struct HommeFormatArray {
std::vector ie_data_ptr;
const Int nlev, qsized, ntimelev;
-#ifdef COMPOSE_ENABLE_GPU
+#if defined __CUDA_ARCH__ || defined __HIP_DEVICE_COMPILE__
COMPOSE_INLINE_FUNCTION static T& unused () {
static T unused = 0;
assert(0);
diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp
index 8d496bff5d16..7e762a7a6763 100644
--- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp
+++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp
@@ -6,7 +6,6 @@
#include
-#include
#include
#include "ExecSpaceDefs.hpp"
@@ -35,32 +34,9 @@ void initialize_kokkos () {
// provides, as that algorithm is hardcoded in Kokkos::initialize(int& narg,
// char* arg[]). Once the behavior is exposed in the InitArguments version of
// initialize, we can remove this string code.
- // If for some reason we're running on a GPU platform, have Cuda enabled,
- // but are using a different execution space, this initialization is still
- // OK. The rank gets a GPU assigned and simply will ignore it.
-#ifdef KOKKOS_ENABLE_CUDA
- int nd;
- const auto ret = cudaGetDeviceCount(&nd);
- if (ret != cudaSuccess) {
- // It isn't a big deal if we can't get the device count.
- nd = 1;
- }
-#elif defined(KOKKOS_ENABLE_HIP)
- int nd;
- const auto ret = hipGetDeviceCount(&nd);
- if (ret != hipSuccess) {
- // It isn't a big deal if we can't get the device count.
- nd = 1;
- }
-#endif
#ifdef HOMMEXX_ENABLE_GPU
- std::stringstream ss;
- ss << "--kokkos-num-devices=" << nd;
- const auto key = ss.str();
- std::vector str(key.size()+1);
- std::copy(key.begin(), key.end(), str.begin());
- str.back() = 0;
- args.push_back(const_cast(str.data()));
+ const char *const map_device = "--kokkos-map-device-id-by=mpi_rank";
+ args.push_back(const_cast(map_device));
#endif
diff --git a/components/homme/src/share/cxx/SphereOperators.hpp b/components/homme/src/share/cxx/SphereOperators.hpp
index c227d97ea708..9af9dc604b7c 100644
--- a/components/homme/src/share/cxx/SphereOperators.hpp
+++ b/components/homme/src/share/cxx/SphereOperators.hpp
@@ -1176,6 +1176,279 @@ class SphereOperators
Real m_scale_factor_inv, m_laplacian_rigid_factor;
};
+using TeamPolicy = Kokkos::TeamPolicy;
+using Team = TeamPolicy::member_type;
+
+static constexpr int NPNP = NP * NP;
+#if defined(KOKKOS_ENABLE_HIP)
+static constexpr int WARP_SIZE = warpSize;
+#elif defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL)
+static constexpr int WARP_SIZE = 32;
+#else
+static constexpr int WARP_SIZE = 1;
+#endif
+
+static constexpr int SPHERE_BLOCK_LEV = WARP_SIZE;
+static constexpr int SPHERE_BLOCKS_PER_COL = (NUM_PHYSICAL_LEV - 1) / SPHERE_BLOCK_LEV + 1;
+static constexpr int SPHERE_BLOCK = NPNP * SPHERE_BLOCK_LEV;
+
+struct SphereGlobal {
+
+ const ExecViewManaged d;
+ const ExecViewManaged dinv;
+ const ExecViewManaged dvv;
+ const ExecViewManaged metdet;
+ const Real scale_factor_inv;
+
+ SphereGlobal(const SphereOperators &op):
+ d(op.m_d),
+ dinv(op.m_dinv),
+ dvv(op.dvv),
+ metdet(op.m_metdet),
+ scale_factor_inv(op.m_scale_factor_inv)
+ {}
+};
+
+using SphereBlockScratchView = Kokkos::View<
+ Real[SPHERE_BLOCK_LEV][NP][NP],
+ ExecSpace::scratch_memory_space,
+ Kokkos::MemoryTraits
+ >;
+
+using SphereBlockScratchSubview = Kokkos::Subview<
+ SphereBlockScratchView, int,
+ std::remove_const_t,
+ std::remove_const_t
+ >;
+
+struct SphereBlockOps;
+
+struct SphereBlockScratch {
+ SphereBlockScratchView v;
+ SphereBlockScratchSubview sv;
+ KOKKOS_INLINE_FUNCTION SphereBlockScratch(const SphereBlockOps &b);
+};
+
+struct SphereBlockOps {
+ const SphereGlobal &g;
+ const Team &t;
+ Real scale_factor_inv;
+ Real metdet;
+ Real rrdmd;
+ Real dinv[2][2];
+ Real dvvx[NP];
+ Real dvvy[NP];
+ int e,x,y,z;
+
+ KOKKOS_INLINE_FUNCTION SphereBlockOps(const SphereGlobal &sg, const Team &team):
+ g(sg),
+ t(team),
+ scale_factor_inv(g.scale_factor_inv),
+ rrdmd(0)
+ {
+ const int lr = t.league_rank();
+ e = lr / SPHERE_BLOCKS_PER_COL;
+ const int iw = lr % SPHERE_BLOCKS_PER_COL;
+ const int tr = t.team_rank();
+ const int ixy = tr / SPHERE_BLOCK_LEV;
+ x = ixy / NP;
+ y = ixy % NP;
+ const int dz = tr % SPHERE_BLOCK_LEV;
+ z = dz + iw * SPHERE_BLOCK_LEV;
+
+ metdet = g.metdet(e,x,y);
+ for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dinv[i][j] = g.dinv(e,i,j,x,y);
+ for (int j = 0; j < NP; j++) {
+ dvvx[j] = g.dvv(x,j);
+ dvvy[j] = g.dvv(y,j);
+ }
+ }
+
+ KOKKOS_INLINE_FUNCTION void barrier() const
+ {
+ t.team_barrier();
+ }
+
+ KOKKOS_INLINE_FUNCTION Real div(const SphereBlockScratch &t0, const SphereBlockScratch &t1)
+ {
+ // Separately compute du and dv to match Fortran bfb
+ Real du = 0;
+ Real dv = 0;
+ for (int j = 0; j < NP; j++) {
+ du += dvvy[j] * t0.sv(x,j);
+ dv += dvvx[j] * t1.sv(j,y);
+ }
+ if (rrdmd == 0) rrdmd = (1.0 / metdet) * scale_factor_inv;
+ return (du + dv) * rrdmd;
+ }
+
+ KOKKOS_INLINE_FUNCTION void divInit(SphereBlockScratch &t0, SphereBlockScratch &t1, const Real v0, const Real v1) const
+ {
+ t0.sv(x,y) = (dinv[0][0] * v0 + dinv[1][0] * v1) * metdet;
+ t1.sv(x,y) = (dinv[0][1] * v0 + dinv[1][1] * v1) * metdet;
+ }
+
+ KOKKOS_INLINE_FUNCTION void grad(Real &g0, Real &g1, const SphereBlockScratch &t) const
+ {
+ Real s0 = 0;
+ Real s1 = 0;
+ for (int j = 0; j < NP; j++) {
+ s0 += dvvy[j] * t.sv(x,j);
+ s1 += dvvx[j] * t.sv(j,y);
+ }
+ s0 *= scale_factor_inv;
+ s1 *= scale_factor_inv;
+ g0 = dinv[0][0] * s0 + dinv[0][1] * s1;
+ g1 = dinv[1][0] * s0 + dinv[1][1] * s1;
+ }
+
+ KOKKOS_INLINE_FUNCTION void gradInit(SphereBlockScratch &t0, const Real v0)
+ {
+ t0.sv(x,y) = v0;
+ }
+
+ KOKKOS_INLINE_FUNCTION bool skip() const { return (z >= NUM_PHYSICAL_LEV); }
+
+ KOKKOS_INLINE_FUNCTION Real vort(const SphereBlockScratch &t0, const SphereBlockScratch &t1)
+ {
+ Real du = 0;
+ Real dv = 0;
+ for (int j = 0; j < NP; j++) {
+ du += dvvx[j] * t0.sv(j,y);
+ dv += dvvy[j] * t1.sv(x,j);
+ }
+ if (rrdmd == 0) rrdmd = (1.0 / metdet) * scale_factor_inv;
+ return (dv - du) * rrdmd;
+ }
+
+ KOKKOS_INLINE_FUNCTION void vortInit(SphereBlockScratch &t0, SphereBlockScratch &t1, const Real v0, const Real v1) const
+ {
+ t0.sv(x,y) = g.d(e,0,0,x,y) * v0 + g.d(e,0,1,x,y) * v1;
+ t1.sv(x,y) = g.d(e,1,0,x,y) * v0 + g.d(e,1,1,x,y) * v1;
+ }
+
+ static TeamPolicy policy(const int num_elems, const int num_scratch)
+ {
+ return TeamPolicy(num_elems * SPHERE_BLOCKS_PER_COL, SPHERE_BLOCK).
+ set_scratch_size(0, Kokkos::PerTeam(num_scratch * SphereBlockScratchView::shmem_size()));
+ }
+
+};
+
+KOKKOS_INLINE_FUNCTION SphereBlockScratch::SphereBlockScratch(const SphereBlockOps &b):
+ v(b.t.team_scratch(0)),
+ sv(Kokkos::subview(v, b.z % SPHERE_BLOCK_LEV, Kokkos::ALL, Kokkos::ALL))
+{}
+
+struct SphereCol {
+ const Team &t;
+ int e,x,y,z;
+
+ KOKKOS_INLINE_FUNCTION SphereCol(const Team &team):
+ t(team)
+ {
+ const int lr = t.league_rank();
+ e = lr / NPNP;
+ const int xy = lr % NPNP;
+ x = xy / NP;
+ y = xy % NP;
+ z = t.team_rank();
+ }
+ static TeamPolicy policy(const int num_elems, const int num_lev)
+ {
+ return TeamPolicy(num_elems * NPNP, num_lev);
+ }
+};
+
+struct SphereColOps: SphereCol {
+ Real scale_factor_inv;
+ Real dinv[2][2];
+ Real dvvx[NP];
+ Real dvvy[NP];
+
+ KOKKOS_INLINE_FUNCTION SphereColOps(const SphereGlobal &sg, const Team &team):
+ SphereCol(team)
+ {
+ scale_factor_inv = sg.scale_factor_inv;
+ for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dinv[i][j] = sg.dinv(e,i,j,x,y);
+ for (int j = 0; j < NP; j++) {
+ dvvx[j] = sg.dvv(x,j);
+ dvvy[j] = sg.dvv(y,j);
+ }
+ }
+
+ template
+ KOKKOS_INLINE_FUNCTION void grad(OutView &out, const InView &in, const int n) const
+ {
+ Real s0 = 0;
+ Real s1 = 0;
+ for (int j = 0; j < NP; j++) {
+ s0 += dvvy[j] * in(e,n,x,j,z);
+ s1 += dvvx[j] * in(e,n,j,y,z);
+ }
+ s0 *= scale_factor_inv;
+ s1 *= scale_factor_inv;
+ out(e,0,x,y,z) = dinv[0][0] * s0 + dinv[0][1] * s1;
+ out(e,1,x,y,z) = dinv[1][0] * s0 + dinv[1][1] * s1;
+ }
+};
+
+struct SphereScanOps {
+ const Team &t;
+ int e,x,y;
+
+ KOKKOS_INLINE_FUNCTION SphereScanOps(const Team &team):
+ t(team)
+ {
+ e = t.league_rank();
+ const int tr = t.team_rank();
+ x = tr / NP;
+ y = tr % NP;
+ }
+
+ template
+ KOKKOS_INLINE_FUNCTION void nacs(OutView &out, const int n, const InView &in, const EndView &end) const
+ {
+ Dispatch<>::parallel_scan(
+ t, NUM_PHYSICAL_LEV,
+ [&](const int a, Real &sum, const bool last) {
+ if (a == 0) out(e,n,x,y,NUM_PHYSICAL_LEV) = sum = end(e,x,y);
+ const int z = NUM_PHYSICAL_LEV-a-1;
+ sum += in(e,x,y,z);
+ if (last) out(e,n,x,y,z) = sum;
+ });
+ }
+
+ template
+ KOKKOS_INLINE_FUNCTION void scan(OutView &out, const InView &in, const Real zero) const
+ {
+ Dispatch<>::parallel_scan(
+ t, NUM_PHYSICAL_LEV,
+ [&](const int z, Real &sum, const bool last) {
+ if (z == 0) out(e,x,y,0) = sum = zero;
+ sum += in(e,x,y,z);
+ if (last) out(e,x,y,z+1) = sum;
+ });
+ }
+
+ template
+ KOKKOS_INLINE_FUNCTION void scan(OutView &out, const InView &in, const int n, const Real zero) const
+ {
+ Dispatch<>::parallel_scan(
+ t, NUM_PHYSICAL_LEV,
+ [&](const int z, Real &sum, const bool last) {
+ if (z == 0) out(e,x,y,0) = sum = zero;
+ sum += in(e,n,x,y,z);
+ if (last) out(e,x,y,z+1) = sum;
+ });
+ }
+
+ static TeamPolicy policy(const int num_elems)
+ {
+ return TeamPolicy(num_elems, NPNP, WARP_SIZE);
+ }
+};
+
} // namespace Homme
#endif // HOMMEXX_SPHERE_OPERATORS_HPP
diff --git a/components/homme/src/share/cxx/utilities/ViewUtils.hpp b/components/homme/src/share/cxx/utilities/ViewUtils.hpp
index 1b3efcd91020..b749e99e096a 100644
--- a/components/homme/src/share/cxx/utilities/ViewUtils.hpp
+++ b/components/homme/src/share/cxx/utilities/ViewUtils.hpp
@@ -67,6 +67,18 @@ viewAsReal(ViewType v_in) {
return ReturnView(reinterpret_cast(v_in.data()));
}
+template
+KOKKOS_INLINE_FUNCTION
+typename
+std::enable_if::type,Scalar>::value,
+ Unmanaged*[DIM1][DIM2][DIM3*VECTOR_SIZE],Properties...>>
+ >::type
+viewAsReal(ViewType v_in) {
+ using ReturnST = RealType;
+ using ReturnView = Unmanaged*[DIM1][DIM2][DIM3*VECTOR_SIZE],Properties...>>;
+ return ReturnView(reinterpret_cast(v_in.data()));
+}
+
template
KOKKOS_INLINE_FUNCTION
typename
@@ -79,6 +91,30 @@ viewAsReal(ViewType v_in) {
return ReturnView(reinterpret_cast(v_in.data()));
}
+template
+KOKKOS_INLINE_FUNCTION
+typename
+std::enable_if::type,Scalar>::value,
+ Unmanaged*[DIM1][DIM2][DIM3][DIM4*VECTOR_SIZE],Properties...>>
+ >::type
+viewAsReal(ViewType v_in) {
+ using ReturnST = RealType;
+ using ReturnView = Unmanaged*[DIM1][DIM2][DIM3][DIM4*VECTOR_SIZE],Properties...>>;
+ return ReturnView(reinterpret_cast(v_in.data()));
+}
+
+template
+KOKKOS_INLINE_FUNCTION
+typename
+std::enable_if::type,Scalar>::value,
+ Unmanaged*[DIM1][DIM2][DIM3][DIM4][DIM5*VECTOR_SIZE],Properties...>>
+ >::type
+viewAsReal(ViewType v_in) {
+ using ReturnST = RealType;
+ using ReturnView = Unmanaged*[DIM1][DIM2][DIM3][DIM4][DIM5*VECTOR_SIZE],Properties...>>;
+ return ReturnView(reinterpret_cast(v_in.data()));
+}
+
// Structure to define the type of a view that has a const data type,
// given the type of an input view
template
diff --git a/components/homme/src/theta-l_kokkos/CMakeLists.txt b/components/homme/src/theta-l_kokkos/CMakeLists.txt
index 739866960e36..a727ebfac62b 100644
--- a/components/homme/src/theta-l_kokkos/CMakeLists.txt
+++ b/components/homme/src/theta-l_kokkos/CMakeLists.txt
@@ -141,6 +141,7 @@ MACRO(THETAL_KOKKOS_SETUP)
)
SET(THETAL_DEPS_CXX
+ ${TARGET_DIR}/cxx/CaarFunctorImpl.cpp
${TARGET_DIR}/cxx/CamForcing.cpp
${TARGET_DIR}/cxx/Diagnostics.cpp
${TARGET_DIR}/cxx/ElementsForcing.cpp
diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp
new file mode 100644
index 000000000000..f319bffce31d
--- /dev/null
+++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.cpp
@@ -0,0 +1,638 @@
+#include "CaarFunctorImpl.hpp"
+
+namespace Homme {
+
+template
+void CaarFunctorImpl::epoch1_blockOps()
+{
+ auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens);
+ auto buffers_exner = viewAsReal(m_buffers.exner);
+ auto buffers_phi = viewAsReal(m_buffers.phi);
+ auto buffers_pnh = viewAsReal(m_buffers.pnh);
+ auto buffers_theta_tens = viewAsReal(m_buffers.theta_tens);
+ auto buffers_vdp = viewAsReal(m_buffers.vdp);
+
+ const Real data_eta_ave_w = m_data.eta_ave_w;
+ const int data_n0 = m_data.n0;
+
+ auto derived_vn0 = viewAsReal(m_derived.m_vn0);
+
+ const SphereGlobal sg(m_sphere_ops);
+
+ auto state_dp3d = viewAsReal(m_state.m_dp3d);
+ auto state_phinh_i = viewAsReal(m_state.m_phinh_i);
+ auto state_v = viewAsReal(m_state.m_v);
+ auto state_vtheta_dp= viewAsReal(m_state.m_vtheta_dp);
+
+ Kokkos::parallel_for(
+ __PRETTY_FUNCTION__,
+ SphereBlockOps::policy(m_num_elems, 4),
+ KOKKOS_LAMBDA(const Team &team) {
+
+ SphereBlockOps b(sg, team);
+ if (b.skip()) return;
+
+ if (!HYDROSTATIC) {
+
+ const Real dphi = state_phinh_i(b.e,data_n0,b.x,b.y,b.z+1) - state_phinh_i(b.e,data_n0,b.x,b.y,b.z);
+ const Real vtheta_dp = state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z);
+ if ((vtheta_dp < 0) || (dphi > 0)) abort();
+ EquationOfState::compute_pnh_and_exner(vtheta_dp, dphi, buffers_pnh(b.e,b.x,b.y,b.z), buffers_exner(b.e,b.x,b.y,b.z));
+
+ buffers_phi(b.e,b.x,b.y,b.z) = 0.5 * (state_phinh_i(b.e,data_n0,b.x,b.y,b.z) + state_phinh_i(b.e,data_n0,b.x,b.y,b.z+1));
+ }
+
+ const Real v0 = state_v(b.e,data_n0,0,b.x,b.y,b.z) * state_dp3d(b.e,data_n0,b.x,b.y,b.z);
+ const Real v1 = state_v(b.e,data_n0,1,b.x,b.y,b.z) * state_dp3d(b.e,data_n0,b.x,b.y,b.z);
+ buffers_vdp(b.e,0,b.x,b.y,b.z) = v0;
+ buffers_vdp(b.e,1,b.x,b.y,b.z) = v1;
+ derived_vn0(b.e,0,b.x,b.y,b.z) += data_eta_ave_w * v0;
+ derived_vn0(b.e,1,b.x,b.y,b.z) += data_eta_ave_w * v1;
+
+ SphereBlockScratch ttmp0(b);
+ SphereBlockScratch ttmp1(b);
+ b.divInit(ttmp0, ttmp1, v0, v1);
+
+ SphereBlockScratch ttmp2(b);
+ SphereBlockScratch ttmp3(b);
+ Real vtheta = 0;
+
+ if (CONSERVATIVE) {
+ const Real sv0 = state_v(b.e,data_n0,0,b.x,b.y,b.z) * state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z);
+ const Real sv1 = state_v(b.e,data_n0,1,b.x,b.y,b.z) * state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z);
+ b.divInit(ttmp2, ttmp3, sv0, sv1);
+ } else {
+ vtheta = state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z) / state_dp3d(b.e,data_n0,b.x,b.y,b.z);
+ b.gradInit(ttmp2, vtheta);
+ }
+
+ b.barrier();
+
+ const Real dvdp = b.div(ttmp0, ttmp1);
+ buffers_dp_tens(b.e,b.x,b.y,b.z) = dvdp;
+
+ if (CONSERVATIVE) {
+ buffers_theta_tens(b.e,b.x,b.y,b.z) = b.div(ttmp2, ttmp3);
+ } else {
+ Real grad0, grad1;
+ b.grad(grad0, grad1, ttmp2);
+ Real theta_tens = dvdp * vtheta;
+ theta_tens += grad0 * v0;
+ theta_tens += grad1 * v1;
+ buffers_theta_tens(b.e,b.x,b.y,b.z) = theta_tens;
+ }
+ });
+}
+
+template
+void CaarFunctorImpl::epoch2_scanOps()
+{
+ auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens);
+ auto buffers_dp_i = viewAsReal(m_buffers.dp_i);
+ auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn);
+ auto buffers_w_tens = viewAsReal(m_buffers.w_tens);
+
+ const int data_n0 = m_data.n0;
+ auto &hvcoord_hybrid_bi = m_hvcoord.hybrid_bi;
+ const Real pi_i00 = m_hvcoord.ps0 * m_hvcoord.hybrid_ai0;
+ auto state_dp3d = viewAsReal(m_state.m_dp3d);
+
+ Kokkos::parallel_for(
+ __PRETTY_FUNCTION__,
+ SphereScanOps::policy(m_num_elems),
+ KOKKOS_LAMBDA(const Team &team) {
+
+ const SphereScanOps s(team);
+
+ s.scan(buffers_dp_i, state_dp3d, data_n0, pi_i00);
+ s.scan(buffers_w_tens, buffers_dp_tens, 0);
+
+ if (RSPLIT_ZERO) {
+
+ s.scan(buffers_eta_dot_dpdn, buffers_dp_tens, 0);
+
+ const Real last = buffers_eta_dot_dpdn(s.e,s.x,s.y,NUM_PHYSICAL_LEV);
+
+ Kokkos::parallel_for(
+ Kokkos::ThreadVectorRange(s.t, 1, NUM_PHYSICAL_LEV),
+ [&](const int z) {
+ Real eta_dot_dpdn = -buffers_eta_dot_dpdn(s.e,s.x,s.y,z);
+ eta_dot_dpdn += hvcoord_hybrid_bi(z) * last;
+ buffers_eta_dot_dpdn(s.e,s.x,s.y,z) = eta_dot_dpdn;
+ });
+
+ Kokkos::single(
+ Kokkos::PerThread(team),
+ [&]() {
+ buffers_eta_dot_dpdn(s.e,s.x,s.y,0) = buffers_eta_dot_dpdn(s.e,s.x,s.y,NUM_PHYSICAL_LEV) = 0;
+ });
+ }
+ });
+}
+
+template
+void CaarFunctorImpl::epoch3_blockOps()
+{
+ auto buffers_dp_i = viewAsReal(m_buffers.dp_i);
+ auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens);
+ auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn);
+ auto buffers_exner = viewAsReal(m_buffers.exner);
+ auto buffers_omega_p = viewAsReal(m_buffers.omega_p);
+ auto buffers_pnh = viewAsReal(m_buffers.pnh);
+ auto buffers_temp = viewAsReal(m_buffers.temp);
+ auto buffers_v_tens = viewAsReal(m_buffers.v_tens);
+ auto buffers_w_tens = viewAsReal(m_buffers.w_tens);
+
+ const Real data_eta_ave_w = m_data.eta_ave_w;
+ const int data_n0 = m_data.n0;
+
+ auto derived_eta_dot_dpdn = viewAsReal(m_derived.m_eta_dot_dpdn);
+ auto derived_omega_p = viewAsReal(m_derived.m_omega_p);
+
+ const SphereGlobal sg(m_sphere_ops);
+
+ auto state_dp3d = viewAsReal(m_state.m_dp3d);
+ auto state_v = viewAsReal(m_state.m_v);
+ auto state_vtheta_dp= viewAsReal(m_state.m_vtheta_dp);
+ auto state_w_i = viewAsReal(m_state.m_w_i);
+
+ Kokkos::parallel_for(
+ __PRETTY_FUNCTION__,
+ SphereBlockOps::policy(m_num_elems, 1),
+ KOKKOS_LAMBDA(const Team &team) {
+
+ SphereBlockOps b(sg, team);
+ if (b.skip()) return;
+
+ const Real pi = 0.5 * (buffers_dp_i(b.e,b.x,b.y,b.z) + buffers_dp_i(b.e,b.x,b.y,b.z+1));
+ SphereBlockScratch tmp0(b);
+ b.gradInit(tmp0, pi);
+
+ if (HYDROSTATIC) {
+ Real exner = pi;
+ EquationOfState::pressure_to_exner(exner);
+ buffers_exner(b.e,b.x,b.y,b.z) = exner;
+ buffers_pnh(b.e,b.x,b.y,b.z) = EquationOfState::compute_dphi(state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z), exner, pi);
+ }
+
+ b.barrier();
+
+ Real grad0, grad1;
+ b.grad(grad0, grad1, tmp0);
+
+ Real omega = -0.5 * (buffers_w_tens(b.e,b.x,b.y,b.z) + buffers_w_tens(b.e,b.x,b.y,b.z+1));
+ const Real uz = state_v(b.e,data_n0,0,b.x,b.y,b.z);
+ const Real vz = state_v(b.e,data_n0,1,b.x,b.y,b.z);
+ omega += uz * grad0 + vz * grad1;
+ buffers_omega_p(b.e,b.x,b.y,b.z) = omega;
+
+ derived_omega_p(b.e,b.x,b.y,b.z) += data_eta_ave_w * omega;
+
+ if (RSPLIT_ZERO) {
+
+ const Real dp = state_dp3d(b.e,data_n0,b.x,b.y,b.z);
+ const Real etap = buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z+1);
+ const Real etaz = buffers_eta_dot_dpdn(b.e,b.x,b.y,b.z);
+
+ buffers_dp_tens(b.e,b.x,b.y,b.z) += etap - etaz;
+
+ derived_eta_dot_dpdn(b.e,b.x,b.y,b.z) += data_eta_ave_w * etaz;
+
+ if (!HYDROSTATIC) {
+ const Real dw = state_w_i(b.e,data_n0,b.x,b.y,b.z+1) - state_w_i(b.e,data_n0,b.x,b.y,b.z);
+ const Real eta = 0.5 * (etaz + etap);
+ buffers_temp(b.e,b.x,b.y,b.z) = dw * eta;
+ }
+
+ Real u = 0;
+ Real v = 0;
+ if (b.z < NUM_PHYSICAL_LEV-1) {
+ const Real facp = 0.5 * etap / dp;
+ u = facp * (state_v(b.e,data_n0,0,b.x,b.y,b.z+1) - uz);
+ v = facp * (state_v(b.e,data_n0,1,b.x,b.y,b.z+1) - vz);
+ }
+ if (b.z > 0) {
+ const Real facm = 0.5 * etaz / dp;
+ u += facm * (uz - state_v(b.e,data_n0,0,b.x,b.y,b.z-1));
+ v += facm * (vz - state_v(b.e,data_n0,1,b.x,b.y,b.z-1));
+ }
+ buffers_v_tens(b.e,0,b.x,b.y,b.z) = u;
+ buffers_v_tens(b.e,1,b.x,b.y,b.z) = v;
+ }
+ });
+}
+
+void CaarFunctorImpl::epoch4_scanOps()
+{
+ auto buffers_phi = viewAsReal(m_buffers.phi);
+ auto buffers_pnh = viewAsReal(m_buffers.pnh);
+
+ const int data_n0 = m_data.n0;
+
+ auto &geometry_phis = m_geometry.m_phis;
+
+ auto state_phinh_i = viewAsReal(m_state.m_phinh_i);
+
+ Kokkos::parallel_for(
+ __PRETTY_FUNCTION__,
+ SphereScanOps::policy(m_num_elems),
+ KOKKOS_LAMBDA(const Team &team) {
+ const SphereScanOps s(team);
+ s.nacs(state_phinh_i, data_n0, buffers_pnh, geometry_phis);
+ Kokkos::parallel_for(
+ Kokkos::ThreadVectorRange(s.t, NUM_PHYSICAL_LEV),
+ [&](const int z) {
+ buffers_phi(s.e,s.x,s.y,z) = 0.5 * (state_phinh_i(s.e,data_n0,s.x,s.y,z) + state_phinh_i(s.e,data_n0,s.x,s.y,z+1));
+ });
+ });
+}
+
+template
+void CaarFunctorImpl::epoch5_colOps()
+{
+ auto buffers_dp_i = viewAsReal(m_buffers.dp_i);
+ auto buffers_dpnh_dp_i = viewAsReal(m_buffers.dpnh_dp_i);
+ auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn);
+ auto buffers_exner = viewAsReal(m_buffers.exner);
+ auto buffers_grad_phinh_i = viewAsReal(m_buffers.grad_phinh_i);
+ auto buffers_grad_w_i = viewAsReal(m_buffers.grad_w_i);
+ auto buffers_phi = viewAsReal(m_buffers.phi);
+ auto buffers_phi_tens = viewAsReal(m_buffers.phi_tens);
+ auto buffers_pnh = viewAsReal(m_buffers.pnh);
+ auto buffers_temp = viewAsReal(m_buffers.temp);
+ auto buffers_v_i = viewAsReal(m_buffers.v_i);
+ auto buffers_vtheta_i = viewAsReal(m_buffers.vtheta_i);
+ auto buffers_w_tens = viewAsReal(m_buffers.w_tens);
+
+ const int data_n0 = m_data.n0;
+ const Real dscale = m_data.scale1 - m_data.scale2;
+
+ auto &geometry_gradphis = m_geometry.m_gradphis;
+
+ const Real gscale1 = m_data.scale1 * PhysicalConstants::g;
+ const Real gscale2 = m_data.scale2 * PhysicalConstants::g;
+
+ auto hvcoord_hybrid_bi_packed = viewAsReal(m_hvcoord.hybrid_bi_packed);
+
+ const Real ndata_scale1 = -m_data.scale1;
+ const Real pi_i00 = m_hvcoord.ps0 * m_hvcoord.hybrid_ai0;
+
+ const SphereGlobal sg(m_sphere_ops);
+
+ auto state_dp3d = viewAsReal(m_state.m_dp3d);
+ auto state_phinh_i = viewAsReal(m_state.m_phinh_i);
+ auto state_v = viewAsReal(m_state.m_v);
+ auto state_w_i = viewAsReal(m_state.m_w_i);
+
+ Kokkos::parallel_for(
+ __PRETTY_FUNCTION__,
+ SphereColOps::policy(m_num_elems, NUM_INTERFACE_LEV),
+ KOKKOS_LAMBDA(const Team &team) {
+
+ const SphereColOps c(sg, team);
+
+ c.grad(buffers_grad_phinh_i, state_phinh_i, data_n0);
+ if (!HYDROSTATIC) c.grad(buffers_grad_w_i, state_w_i, data_n0);
+
+ const Real dm = (c.z == 0) ? 0 : state_dp3d(c.e,data_n0,c.x,c.y,c.z-1);
+ const Real dz = (c.z == NUM_PHYSICAL_LEV) ? 0 : state_dp3d(c.e,data_n0,c.x,c.y,c.z);
+ const Real dp_i = (c.z == 0) ? dz : (c.z == NUM_PHYSICAL_LEV) ? dm : 0.5 * (dz + dm);
+ buffers_dp_i(c.e,c.x,c.y,c.z) = dp_i;
+
+ if (!HYDROSTATIC) {
+
+ const Real v0m = (c.z == 0) ? 0 : state_v(c.e,data_n0,0,c.x,c.y,c.z-1);
+ const Real v0z = (c.z == NUM_PHYSICAL_LEV) ? 0 : state_v(c.e,data_n0,0,c.x,c.y,c.z);
+ const Real v_i0 = (c.z == 0) ? v0z : (c.z == NUM_PHYSICAL_LEV) ? v0m : (dz * v0z + dm * v0m) / (dm + dz);
+ buffers_v_i(c.e,0,c.x,c.y,c.z) = v_i0;
+
+ const Real v1m = (c.z == 0) ? 0 : state_v(c.e,data_n0,1,c.x,c.y,c.z-1);
+ const Real v1z = (c.z == NUM_PHYSICAL_LEV) ? 0 : state_v(c.e,data_n0,1,c.x,c.y,c.z);
+ const Real v_i1 = (c.z == 0) ? v1z : (c.z == NUM_PHYSICAL_LEV) ? v1m :(dz * v1z + dm * v1m) / (dm + dz);
+ buffers_v_i(c.e,1,c.x,c.y,c.z) = v_i1;
+
+ const Real pm = (c.z == 0) ? pi_i00 : buffers_pnh(c.e,c.x,c.y,c.z-1);
+ const Real pz = (c.z == NUM_PHYSICAL_LEV) ? pm + 0.5 * dm : buffers_pnh(c.e,c.x,c.y,c.z);
+ buffers_dpnh_dp_i(c.e,c.x,c.y,c.z) = 2.0 * (pz - pm) / (dm + dz);
+ }
+
+ if (RSPLIT_ZERO) {
+
+ const Real phim = (c.z == 0) ? 0 : buffers_phi(c.e,c.x,c.y,c.z-1);
+ const Real phiz = (c.z == NUM_PHYSICAL_LEV) ? 0 : buffers_phi(c.e,c.x,c.y,c.z);
+
+ if (!HYDROSTATIC) {
+ const Real phi_vadv = ((c.z == 0) || (c.z == NUM_PHYSICAL_LEV)) ? 0 : (phiz - phim) * buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z) / dp_i;
+ buffers_phi_tens(c.e,c.x,c.y,c.z) = phi_vadv;
+ }
+
+ Real vtheta_i = phiz - phim;
+ if (!(c.z == 0) && !(c.z == NUM_PHYSICAL_LEV)) {
+ const Real dexner = buffers_exner(c.e,c.x,c.y,c.z) - buffers_exner(c.e,c.x,c.y,c.z-1);
+ vtheta_i /= dexner;
+ }
+ vtheta_i /= -PhysicalConstants::cp;
+ if (!HYDROSTATIC) vtheta_i *= buffers_dpnh_dp_i(c.e,c.x,c.y,c.z);
+ buffers_vtheta_i(c.e,c.x,c.y,c.z) = vtheta_i;
+ }
+
+ if (!HYDROSTATIC) {
+
+ Real w_tens = 0;
+ if (RSPLIT_ZERO) {
+ const Real tempm = (c.z == 0) ? 0 : buffers_temp(c.e,c.x,c.y,c.z-1);
+ const Real tempz = (c.z == NUM_PHYSICAL_LEV) ? 0 : buffers_temp(c.e,c.x,c.y,c.z);
+ const Real dw = (c.z == 0) ? tempz : (c.z == NUM_PHYSICAL_LEV) ? tempm : 0.5 * (tempz + tempm);
+ w_tens = dw / dp_i;
+ }
+ w_tens += buffers_v_i(c.e,0,c.x,c.y,c.z) * buffers_grad_w_i(c.e,0,c.x,c.y,c.z) + buffers_v_i(c.e,1,c.x,c.y,c.z) * buffers_grad_w_i(c.e,1,c.x,c.y,c.z);
+ w_tens *= ndata_scale1;
+ const Real gscale = (c.z == NUM_PHYSICAL_LEV) ? gscale1 : gscale2;
+ w_tens += (buffers_dpnh_dp_i(c.e,c.x,c.y,c.z)-Real(1)) * gscale;
+ buffers_w_tens(c.e,c.x,c.y,c.z) = w_tens;
+
+ Real phi_tens = (RSPLIT_ZERO) ? buffers_phi_tens(c.e,c.x,c.y,c.z) : 0;
+ phi_tens += buffers_v_i(c.e,0,c.x,c.y,c.z) * buffers_grad_phinh_i(c.e,0,c.x,c.y,c.z) + buffers_v_i(c.e,1,c.x,c.y,c.z) * buffers_grad_phinh_i(c.e,1,c.x,c.y,c.z);
+ phi_tens *= ndata_scale1;
+ phi_tens += state_w_i(c.e,data_n0,c.x,c.y,c.z) * gscale;
+
+ if (dscale) phi_tens += dscale * (buffers_v_i(c.e,0,c.x,c.y,c.z) * geometry_gradphis(c.e,0,c.x,c.y) + buffers_v_i(c.e,1,c.x,c.y,c.z) * geometry_gradphis(c.e,1,c.x,c.y)) * hvcoord_hybrid_bi_packed(c.z);
+
+ buffers_phi_tens(c.e,c.x,c.y,c.z) = phi_tens;
+ }
+ });
+}
+
+template
+void CaarFunctorImpl::epoch6_blockOps()
+{
+ auto buffers_dpnh_dp_i = viewAsReal(m_buffers.dpnh_dp_i);
+ auto buffers_exner = viewAsReal(m_buffers.exner);
+ auto buffers_grad_phinh_i = viewAsReal(m_buffers.grad_phinh_i);
+ auto buffers_grad_w_i = viewAsReal(m_buffers.grad_w_i);
+ auto buffers_v_tens = viewAsReal(m_buffers.v_tens);
+
+ const int data_n0 = m_data.n0;
+
+ auto &geometry_fcor = m_geometry.m_fcor;
+
+ const SphereGlobal sg(m_sphere_ops);
+
+ auto state_dp3d = viewAsReal(m_state.m_dp3d);
+ auto state_v = viewAsReal(m_state.m_v);
+ auto state_vtheta_dp= viewAsReal(m_state.m_vtheta_dp);
+ auto state_w_i = viewAsReal(m_state.m_w_i);
+
+ Kokkos::parallel_for(
+ "caar compute_v_tens",
+ SphereBlockOps::policy(m_num_elems, 6),
+ KOKKOS_LAMBDA(const Team &team) {
+
+ SphereBlockOps b(sg, team);
+ if (b.skip()) return;
+
+ SphereBlockScratch ttmp0(b);
+ const Real w2 = (HYDROSTATIC) ? 0 : 0.25 * (state_w_i(b.e,data_n0,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + state_w_i(b.e,data_n0,b.x,b.y,b.z+1) * state_w_i(b.e,data_n0,b.x,b.y,b.z+1));
+ b.gradInit(ttmp0, w2);
+
+ SphereBlockScratch ttmp1(b);
+ const Real exneriz = buffers_exner(b.e,b.x,b.y,b.z);
+ b.gradInit(ttmp1, exneriz);
+
+ SphereBlockScratch ttmp2(b);
+ const Real log_exneriz = (PGRAD_CORRECTION) ? log(exneriz) : 0;
+ b.gradInit(ttmp2, log_exneriz);
+
+ const Real v0 = state_v(b.e,data_n0,0,b.x,b.y,b.z);
+ const Real v1 = state_v(b.e,data_n0,1,b.x,b.y,b.z);
+
+ SphereBlockScratch ttmp3(b);
+ SphereBlockScratch ttmp4(b);
+ b.vortInit(ttmp3, ttmp4, v0, v1);
+
+ SphereBlockScratch ttmp5(b);
+ b.gradInit(ttmp5, 0.5 * (v0 * v0 + v1 * v1));
+
+ b.barrier();
+
+ Real grad_v0, grad_v1;
+ b.grad(grad_v0, grad_v1, ttmp5);
+
+ Real u_tens = (RSPLIT_ZERO) ? buffers_v_tens(b.e,0,b.x,b.y,b.z) : 0;
+ Real v_tens = (RSPLIT_ZERO) ? buffers_v_tens(b.e,1,b.x,b.y,b.z) : 0;
+ u_tens += grad_v0;
+ v_tens += grad_v1;
+
+ const Real cp_vtheta = PhysicalConstants::cp * (state_vtheta_dp(b.e,data_n0,b.x,b.y,b.z) / state_dp3d(b.e,data_n0,b.x,b.y,b.z));
+
+ Real grad_exner0, grad_exner1;
+ b.grad(grad_exner0, grad_exner1, ttmp1);
+
+ u_tens += cp_vtheta * grad_exner0;
+ v_tens += cp_vtheta * grad_exner1;
+
+ Real mgrad_x, mgrad_y;
+ if (HYDROSTATIC) {
+
+ mgrad_x = 0.5 * (buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z) + buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z+1));
+ mgrad_y = 0.5 * (buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z) + buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z+1));
+
+ } else {
+
+ mgrad_x = 0.5 * (buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z) + buffers_grad_phinh_i(b.e,0,b.x,b.y,b.z+1) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z+1));
+ mgrad_y = 0.5 * (buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z) + buffers_grad_phinh_i(b.e,1,b.x,b.y,b.z+1) * buffers_dpnh_dp_i(b.e,b.x,b.y,b.z+1));
+
+ }
+
+ if (PGRAD_CORRECTION) {
+
+ Real grad_lexner0, grad_lexner1;
+ b.grad(grad_lexner0, grad_lexner1, ttmp2);
+
+ namespace PC = PhysicalConstants;
+ constexpr Real cpt0 = PC::cp * (PC::Tref - PC::Tref_lapse_rate * PC::Tref * PC::cp / PC::g);
+ mgrad_x += cpt0 * (grad_lexner0 - grad_exner0 / exneriz);
+ mgrad_y += cpt0 * (grad_lexner1 - grad_exner1 / exneriz);
+ }
+
+ Real wvor_x = 0;
+ Real wvor_y = 0;
+ if (!HYDROSTATIC) {
+ b.grad(wvor_x, wvor_y, ttmp0);
+ wvor_x -= 0.5 * (buffers_grad_w_i(b.e,0,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + buffers_grad_w_i(b.e,0,b.x,b.y,b.z+1) * state_w_i(b.e,data_n0,b.x,b.y,b.z+1));
+ wvor_y -= 0.5 * (buffers_grad_w_i(b.e,1,b.x,b.y,b.z) * state_w_i(b.e,data_n0,b.x,b.y,b.z) + buffers_grad_w_i(b.e,1,b.x,b.y,b.z+1) * state_w_i(b.e,data_n0,b.x,b.y,b.z+1));
+ }
+
+ u_tens += mgrad_x + wvor_x;
+ v_tens += mgrad_y + wvor_y;
+
+ const Real vort = b.vort(ttmp3, ttmp4) + geometry_fcor(b.e,b.x,b.y);
+ u_tens -= v1 * vort;
+ v_tens += v0 * vort;
+
+ buffers_v_tens(b.e,0,b.x,b.y,b.z) = u_tens;
+ buffers_v_tens(b.e,1,b.x,b.y,b.z) = v_tens;
+ });
+}
+
+template
+void CaarFunctorImpl::epoch7_col()
+{
+ auto buffers_dp_tens = viewAsReal(m_buffers.dp_tens);
+ auto buffers_eta_dot_dpdn = viewAsReal(m_buffers.eta_dot_dpdn);
+ auto buffers_phi_tens = viewAsReal(m_buffers.phi_tens);
+ auto buffers_theta_tens = viewAsReal(m_buffers.theta_tens);
+ auto buffers_v_tens = viewAsReal(m_buffers.v_tens);
+ auto buffers_vtheta_i = viewAsReal(m_buffers.vtheta_i);
+ auto buffers_w_tens = viewAsReal(m_buffers.w_tens);
+
+ const Real data_dt = m_data.dt;
+ const int data_nm1 = m_data.nm1;
+ const int data_np1 = m_data.np1;
+ const Real data_scale3 = m_data.scale3;
+
+ auto &geometry_spheremp = m_geometry.m_spheremp;
+
+ const Real scale1_dt = m_data.scale1 * m_data.dt;
+
+ auto state_dp3d = viewAsReal(m_state.m_dp3d);
+ auto state_phinh_i = viewAsReal(m_state.m_phinh_i);
+ auto state_v = viewAsReal(m_state.m_v);
+ auto state_vtheta_dp = viewAsReal(m_state.m_vtheta_dp);
+ auto state_w_i = viewAsReal(m_state.m_w_i);
+
+ Kokkos::parallel_for(
+ __PRETTY_FUNCTION__,
+ SphereCol::policy(m_num_elems, NUM_PHYSICAL_LEV),
+ KOKKOS_LAMBDA(const Team &team) {
+
+ const SphereCol c(team);
+
+ const Real spheremp = geometry_spheremp(c.e,c.x,c.y);
+ const Real scale1_dt_spheremp = scale1_dt * spheremp;
+ const Real scale3_spheremp = data_scale3 * spheremp;
+
+ Real dp_tens = buffers_dp_tens(c.e,c.x,c.y,c.z);
+ dp_tens *= scale1_dt_spheremp;
+ Real dp_np1 = scale3_spheremp * state_dp3d(c.e,data_nm1,c.x,c.y,c.z);
+ dp_np1 -= dp_tens;
+ state_dp3d(c.e,data_np1,c.x,c.y,c.z) = dp_np1;
+
+ Real theta_tens = buffers_theta_tens(c.e,c.x,c.y,c.z);
+ if (RSPLIT_ZERO) {
+ const Real etap = buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z+1);
+ const Real etaz = buffers_eta_dot_dpdn(c.e,c.x,c.y,c.z);
+ const Real thetap = etap * buffers_vtheta_i(c.e,c.x,c.y,c.z+1);
+ const Real thetaz = etaz * buffers_vtheta_i(c.e,c.x,c.y,c.z);
+ theta_tens += thetap - thetaz;
+ }
+ theta_tens *= -scale1_dt_spheremp;
+ Real vtheta_np1 = state_vtheta_dp(c.e,data_nm1,c.x,c.y,c.z);
+ vtheta_np1 *= scale3_spheremp;
+ vtheta_np1 += theta_tens;
+ state_vtheta_dp(c.e,data_np1,c.x,c.y,c.z) = vtheta_np1;
+
+ Real u_tens = buffers_v_tens(c.e,0,c.x,c.y,c.z);
+ u_tens *= -scale1_dt_spheremp;
+ Real u_np1 = state_v(c.e,data_nm1,0,c.x,c.y,c.z);
+ u_np1 *= scale3_spheremp;
+ u_np1 += u_tens;
+ state_v(c.e,data_np1,0,c.x,c.y,c.z) = u_np1;
+
+ Real v_tens = buffers_v_tens(c.e,1,c.x,c.y,c.z);
+ v_tens *= -scale1_dt_spheremp;
+ Real v_np1 = state_v(c.e,data_nm1,1,c.x,c.y,c.z);
+ v_np1 *= scale3_spheremp;
+ v_np1 += v_tens;
+ state_v(c.e,data_np1,1,c.x,c.y,c.z) = v_np1;
+
+ if (!HYDROSTATIC) {
+
+ const Real dt_spheremp = data_dt * spheremp;
+
+ Real phi_tens = buffers_phi_tens(c.e,c.x,c.y,c.z);
+ phi_tens *= dt_spheremp;
+ Real phi_np1 = state_phinh_i(c.e,data_nm1,c.x,c.y,c.z);
+ phi_np1 *= scale3_spheremp;
+ phi_np1 += phi_tens;
+ state_phinh_i(c.e,data_np1,c.x,c.y,c.z) = phi_np1;
+
+ Real w_tens = buffers_w_tens(c.e,c.x,c.y,c.z);
+ w_tens *= dt_spheremp;
+ Real w_np1 = state_w_i(c.e,data_nm1,c.x,c.y,c.z);
+ w_np1 *= scale3_spheremp;
+ w_np1 += w_tens;
+ state_w_i(c.e,data_np1,c.x,c.y,c.z) = w_np1;
+
+ if (c.z == NUM_PHYSICAL_LEV-1) {
+ Real w_tens = buffers_w_tens(c.e,c.x,c.y,NUM_PHYSICAL_LEV);
+ w_tens *= dt_spheremp;
+ buffers_w_tens(c.e,c.x,c.y,NUM_PHYSICAL_LEV) = w_tens;
+ Real w_np1 = state_w_i(c.e,data_nm1,c.x,c.y,NUM_PHYSICAL_LEV);
+ w_np1 *= scale3_spheremp;
+ w_np1 += w_tens;
+ state_w_i(c.e,data_np1,c.x,c.y,NUM_PHYSICAL_LEV) = w_np1;
+ }
+ }
+ });
+}
+
+void CaarFunctorImpl::caar_compute()
+{
+ if (m_theta_hydrostatic_mode) {
+ if (m_theta_advection_form == AdvectionForm::Conservative) epoch1_blockOps();
+ else epoch1_blockOps();
+ } else {
+ if (m_theta_advection_form == AdvectionForm::Conservative) epoch1_blockOps();
+ else epoch1_blockOps();
+ }
+
+ if (m_rsplit == 0) epoch2_scanOps();
+ else epoch2_scanOps();
+
+ if (m_theta_hydrostatic_mode) {
+ if (m_rsplit == 0) epoch3_blockOps();
+ else epoch3_blockOps();
+ } else {
+ if (m_rsplit == 0) epoch3_blockOps();
+ else epoch3_blockOps();
+ }
+
+ if (m_theta_hydrostatic_mode) epoch4_scanOps();
+
+ if (m_theta_hydrostatic_mode) {
+ if (m_rsplit == 0) epoch5_colOps();
+ else epoch5_colOps();
+ } else {
+ if (m_rsplit == 0) epoch5_colOps();
+ else epoch5_colOps();
+ }
+
+ if (m_theta_hydrostatic_mode) {
+ if (m_rsplit == 0) {
+ if (m_pgrad_correction) epoch6_blockOps();
+ else epoch6_blockOps();
+ } else {
+ if (m_pgrad_correction) epoch6_blockOps();
+ else epoch6_blockOps();
+ }
+ } else {
+ if (m_rsplit == 0) {
+ if (m_pgrad_correction) epoch6_blockOps();
+ else epoch6_blockOps();
+ } else {
+ if (m_pgrad_correction) epoch6_blockOps();
+ else epoch6_blockOps();
+ }
+ }
+
+ if (m_theta_hydrostatic_mode) {
+ if (m_rsplit == 0) epoch7_col();
+ else epoch7_col();
+ } else {
+ if (m_rsplit == 0) epoch7_col();
+ else epoch7_col();
+ }
+}
+
+}
diff --git a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp
index 38f9dc8573d8..fc3c93f12ad3 100644
--- a/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp
+++ b/components/homme/src/theta-l_kokkos/cxx/CaarFunctorImpl.hpp
@@ -30,6 +30,7 @@
#include "profiling.hpp"
#include "ErrorDefs.hpp"
+#include
#include
namespace Homme {
@@ -107,13 +108,8 @@ struct CaarFunctorImpl {
struct TagPostExchange {};
// Policies
-#ifndef NDEBUG
- template
- using TeamPolicyType = Kokkos::TeamPolicy,Tag>;
-#else
template
using TeamPolicyType = Kokkos::TeamPolicy;
-#endif
TeamPolicyType m_policy_pre;
@@ -123,6 +119,17 @@ struct CaarFunctorImpl {
Kokkos::Array, NUM_TIME_LEVELS> m_bes;
+ private:
+ template void epoch1_blockOps();
+ template void epoch2_scanOps();
+ template void epoch3_blockOps();
+ void epoch4_scanOps();
+ template void epoch5_colOps();
+ template void epoch6_blockOps();
+ template void epoch7_col();
+ void caar_compute();
+
+ public:
CaarFunctorImpl(const Elements &elements, const Tracers &/* tracers */,
const ReferenceElement &ref_FE, const HybridVCoord &hvcoord,
const SphereOperators &sphere_ops, const SimulationParams& params)
@@ -180,7 +187,7 @@ struct CaarFunctorImpl {
int requested_buffer_size () const {
// Ask the buffers manager to allocate enough buffers to satisfy Caar's needs
- const int nslots = m_tu.get_num_ws_slots();
+ const int nslots = std::max(m_tu.get_num_ws_slots(), m_num_elems);
int num_scalar_mid_buf = Buffers::num_3d_scalar_mid_buf;
int num_scalar_int_buf = Buffers::num_3d_scalar_int_buf;
@@ -192,7 +199,7 @@ struct CaarFunctorImpl {
if (m_theta_hydrostatic_mode) {
// pi=pnh, and no wtens/phitens
num_scalar_mid_buf -= 1;
- num_scalar_int_buf -= 3;
+ num_scalar_int_buf -= 2;
// No grad_w_i/v_i
num_vector_int_buf -= 2;
@@ -202,7 +209,7 @@ struct CaarFunctorImpl {
num_scalar_int_buf -=2;
if (m_theta_hydrostatic_mode) {
// No dp_i
- num_scalar_int_buf -= 1;
+ //num_scalar_int_buf -= 1;
}
}
@@ -216,7 +223,7 @@ struct CaarFunctorImpl {
Errors::runtime_check(fbm.allocated_size()>=requested_buffer_size(), "Error! Buffers size not sufficient.\n");
Scalar* mem = reinterpret_cast(fbm.get_memory());
- const int nslots = m_tu.get_num_ws_slots();
+ const int nslots = std::max(m_tu.get_num_ws_slots(), m_num_elems);
// Midpoints scalars
m_buffers.pnh = decltype(m_buffers.pnh )(mem,nslots);
@@ -260,10 +267,10 @@ struct CaarFunctorImpl {
mem += m_buffers.v_tens.size();
// Interface scalars
- if (!m_theta_hydrostatic_mode || m_rsplit==0) {
+ //if (!m_theta_hydrostatic_mode || m_rsplit==0) {
m_buffers.dp_i = decltype(m_buffers.dp_i)(mem,nslots);
mem += m_buffers.dp_i.size();
- }
+ //}
if (!m_theta_hydrostatic_mode) {
m_buffers.dpnh_dp_i = decltype(m_buffers.dpnh_dp_i)(mem,nslots);
@@ -280,9 +287,9 @@ struct CaarFunctorImpl {
if (!m_theta_hydrostatic_mode) {
m_buffers.phi_tens = decltype(m_buffers.phi_tens )(mem,nslots);
mem += m_buffers.phi_tens.size();
- m_buffers.w_tens = decltype(m_buffers.w_tens )(mem,nslots);
- mem += m_buffers.w_tens.size();
}
+ m_buffers.w_tens = decltype(m_buffers.w_tens )(mem,nslots);
+ mem += m_buffers.w_tens.size();
// Interface vectors
if (!m_theta_hydrostatic_mode) {
@@ -346,6 +353,15 @@ struct CaarFunctorImpl {
profiling_resume();
GPTLstart("caar compute");
+
+#if 1
+
+ caar_compute();
+ Kokkos::fence();
+ GPTLstop("caar compute");
+
+#else
+
int nerr;
Kokkos::parallel_reduce("caar loop pre-boundary exchange", m_policy_pre, *this, nerr);
Kokkos::fence();
@@ -353,6 +369,8 @@ struct CaarFunctorImpl {
if (nerr > 0)
check_print_abort_on_bad_elems("CaarFunctorImpl::run TagPreExchange", data.n0);
+#endif
+
GPTLstart("caar_bexchV");
m_bes[data.np1]->exchange(m_geometry.m_rspheremp);
Kokkos::fence();
@@ -544,8 +562,10 @@ struct CaarFunctorImpl {
auto dp = Homme::subview(m_state.m_dp3d,kv.ie,m_data.n0,igp,jgp);
auto div_vdp = Homme::subview(m_buffers.div_vdp,kv.team_idx,igp,jgp);
auto pi = Homme::subview(m_buffers.pi,kv.team_idx,igp,jgp);
- auto omega_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp);
- auto pi_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,1,igp,jgp);
+ //auto omega_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,0,igp,jgp);
+ auto omega_i = Homme::subview(m_buffers.w_tens,kv.team_idx,igp,jgp);
+ //auto pi_i = Homme::subview(m_buffers.grad_phinh_i,kv.team_idx,1,igp,jgp);
+ auto pi_i = Homme::subview(m_buffers.dp_i,kv.team_idx,igp,jgp);
Kokkos::single(Kokkos::PerThread(kv.team),[&]() {
pi_i(0)[0] = m_hvcoord.ps0*m_hvcoord.hybrid_ai0;
@@ -566,11 +586,13 @@ struct CaarFunctorImpl {
kv.team_barrier();
ColumnOps::column_scan_mid_to_int(kv,div_vdp,omega_i);
+ kv.team_barrier();
// Average omega_i to midpoints, and change sign, since later
// omega=v*grad(pi)-average(omega_i)
auto omega = Homme::subview(m_buffers.omega_p,kv.team_idx,igp,jgp);
ColumnOps::compute_midpoint_values(kv,omega_i,omega,-1.0);
});
+
kv.team_barrier();
// Compute grad(pi)
@@ -642,6 +664,7 @@ struct CaarFunctorImpl {
// Compute interface horiz velocity
auto u_i = Homme::subview(m_buffers.v_i,kv.team_idx,0,igp,jgp);
auto v_i = Homme::subview(m_buffers.v_i,kv.team_idx,1,igp,jgp);
+
ColumnOps::compute_interface_values(kv.team,dp,dp_i,u,u_i);
ColumnOps::compute_interface_values(kv.team,dp,dp_i,v,v_i);
@@ -1138,7 +1161,6 @@ struct CaarFunctorImpl {
ColumnOps::compute_midpoint_values(kv, w_sq, Homme::subview(m_buffers.temp,kv.team_idx,igp,jgp),0.5);
});
kv.team_barrier();
-
// Compute grad(average(w^2/2)). Store in wvor.
m_sphere_ops.gradient_sphere(kv, Homme::subview(m_buffers.temp,kv.team_idx),
wvor);
@@ -1179,8 +1201,8 @@ struct CaarFunctorImpl {
if (!m_theta_hydrostatic_mode) {
// Compute wvor = grad(average(w^2/2)) - average(w*grad(w))
// Note: vtens is already storing grad(avg(w^2/2))
- auto gradw_x = Homme::subview(m_buffers.v_i,kv.team_idx,0,igp,jgp);
- auto gradw_y = Homme::subview(m_buffers.v_i,kv.team_idx,1,igp,jgp);
+ auto gradw_x = Homme::subview(m_buffers.grad_w_i,kv.team_idx,0,igp,jgp);
+ auto gradw_y = Homme::subview(m_buffers.grad_w_i,kv.team_idx,1,igp,jgp);
auto w_i = Homme::subview(m_state.m_w_i,kv.ie,m_data.n0,igp,jgp);
const auto w_gradw_x = [&gradw_x,&w_i] (const int ilev) -> Scalar {
diff --git a/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp b/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp
index dd97720f1be2..b22592db5617 100644
--- a/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp
+++ b/components/homme/src/theta-l_kokkos/cxx/EquationOfState.hpp
@@ -248,6 +248,12 @@ class EquationOfState {
ColumnOps::column_scan_mid_to_int(kv,integrand_provider,phi_i);
}
+ KOKKOS_INLINE_FUNCTION static
+ Real compute_dphi (const Real vtheta_dp, const Real exner, const Real p) {
+ return PhysicalConstants::Rgas*vtheta_dp*exner/p;
+ }
+
+
public:
bool m_theta_hydrostatic_mode;
diff --git a/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp b/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp
index cd3bf7c32526..5dd16d792b9f 100644
--- a/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp
+++ b/components/homme/src/theta-l_kokkos/cxx/LimiterFunctor.hpp
@@ -49,13 +49,8 @@ struct LimiterFunctor {
struct TagDp3dLimiter {};
// Policies
-#ifndef NDEBUG
- template
- using TeamPolicyType = Kokkos::TeamPolicy,Tag>;
-#else
template
using TeamPolicyType = Kokkos::TeamPolicy;
-#endif
TeamPolicyType m_policy_dp3d_lim;