diff --git a/octotiger/unitiger/hydro_impl/hydro_kokkos_kernel.hpp b/octotiger/unitiger/hydro_impl/hydro_kokkos_kernel.hpp index 0a6f6fc00..4681be6d0 100644 --- a/octotiger/unitiger/hydro_impl/hydro_kokkos_kernel.hpp +++ b/octotiger/unitiger/hydro_impl/hydro_kokkos_kernel.hpp @@ -51,8 +51,8 @@ void flux_impl_teamless(hpx::kokkos::executor& executor, // Supported team_sizes need to be the power of two! Team size of 1 is a special case for usage // with the serial kokkos backend: assert((team_size == 1)); - auto policy = Kokkos::Experimental::require(Kokkos::RangePolicy( - executor.instance(), 0, number_blocks), + auto policy = Kokkos::Experimental::require( + Kokkos::RangePolicy(executor.instance(), 0, number_blocks), Kokkos::Experimental::WorkItemProperty::HintLightWeight); // Start kernel using policy (and through it the passed executor): @@ -324,6 +324,7 @@ void flux_impl(hpx::kokkos::executor& executor, const kokkos_b }); } +/// Reconstruct with or without am template void reconstruct_impl(hpx::kokkos::executor& executor, const double omega, const int nf_, const int angmom_index_, const kokkos_int_buffer_t& smooth_field_, @@ -361,6 +362,38 @@ void reconstruct_impl(hpx::kokkos::executor& executor, const d }); } +/// Optimized for reconstruct without am correction +template +void reconstruct_no_amc_impl(hpx::kokkos::executor& executor, const double omega, + const int nf_, const int angmom_index_, const kokkos_int_buffer_t& smooth_field_, + const kokkos_int_buffer_t& disc_detect_, kokkos_buffer_t& combined_q, + const kokkos_buffer_t& combined_x, kokkos_buffer_t& combined_u, kokkos_buffer_t& AM, + const double dx, const kokkos_buffer_t& cdiscs, const int n_species_, const int ndir, + const int nangmom, const Kokkos::Array&& tiling_config) { + const int blocks = q_inx3 / 64 + 1; + auto policy = Kokkos::Experimental::require( + Kokkos::MDRangePolicy>( + executor.instance(), {0, 0, 0}, {blocks, 8, 8}, tiling_config), + Kokkos::Experimental::WorkItemProperty::HintLightWeight); + Kokkos::parallel_for( + "kernel hydro reconstruct", policy, KOKKOS_LAMBDA(int idx, int idy, int idz) { + const int q_i = (idx) *64 + (idy) *8 + (idz); + const int i = ((q_i / q_inx2) + 2) * inx_large * inx_large + + (((q_i % q_inx2) / q_inx) + 2) * inx_large + (((q_i % q_inx2) % q_inx) + 2); + if (q_i < q_inx3) { + for (int d = 0; d < ndir; d++) { + cell_reconstruct_inner_loop_p1(nf_, angmom_index_, smooth_field_, disc_detect_, + combined_q, combined_u, AM, dx, cdiscs, d, i, q_i, ndir, nangmom); + } + // Phase 2 + for (int d = 0; d < ndir; d++) { + cell_reconstruct_inner_loop_p2(omega, angmom_index_, combined_q, combined_x, + combined_u, AM, dx, d, i, q_i, ndir, nangmom, n_species_); + } + } + }); +} + template void hydro_pre_recon_impl(hpx::kokkos::executor& executor, const kokkos_buffer_t& large_x, const double omega, const bool angmom, kokkos_buffer_t& u, @@ -431,8 +464,14 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer q(nf * 27 * q_inx3 + padding); device_buffer AM(NDIM * q_inx3 + padding); - reconstruct_impl(exec, omega, nf, angmom_index, device_smooth_field, device_disc_detect, q, x, - u, AM, dx, disc, n_species, ndir, nangmom, {1, 8, 8}); + + if (angmom_index > -1) { + reconstruct_impl(exec, omega, nf, angmom_index, device_smooth_field, device_disc_detect, q, + x, u, AM, dx, disc, n_species, ndir, nangmom, {1, 8, 8}); + } else { + reconstruct_no_amc_impl(exec, omega, nf, angmom_index, device_smooth_field, + device_disc_detect, q, x, u, AM, dx, disc, n_species, ndir, nangmom, {1, 8, 8}); + } // Flux const device_buffer& masks = @@ -461,7 +500,7 @@ timestep_t device_interface_kokkos_hydro(executor_t& exec, const host_buffer