Skip to content

Commit

Permalink
Allow functors as reducers for nested team parallel reduce (kokkos#6921)
Browse files Browse the repository at this point in the history
* Functor as reducer for TeamThreadRange, ThreadVectorRange and TeamVectorRange

* Fixed typos

* Fix an error in hpx

* Apply suggestions from code review

Co-authored-by: Daniel Arndt <[email protected]>

* Applying the same changes in OpenACC

---------

Co-authored-by: Daniel Arndt <[email protected]>
  • Loading branch information
ldh4 and masterleinad authored Oct 2, 2024
1 parent 859055c commit 4aae49f
Show file tree
Hide file tree
Showing 9 changed files with 904 additions and 239 deletions.
186 changes: 137 additions & 49 deletions core/src/Cuda/Kokkos_Cuda_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,24 +184,37 @@ class CudaTeamMember {
* ( 1 == blockDim.z )
*/
template <typename ReducerType>
KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer_v<ReducerType>>
team_reduce(ReducerType const& reducer) const noexcept {
team_reduce(reducer, reducer.reference());
}

template <typename ReducerType>
KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer_v<ReducerType>>
team_reduce(ReducerType const& reducer,
typename ReducerType::value_type& value) const noexcept {
(void)reducer;
(void)value;

KOKKOS_IF_ON_DEVICE((
typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE, TeamPolicy<Cuda>,
ReducerType, typename ReducerType::value_type>::Reducer
wrapped_reducer(reducer);

impl_team_reduce(wrapped_reducer, value); reducer.reference() = value;))
}

template <typename WrappedReducerType>
KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer_v<WrappedReducerType>>
impl_team_reduce(
WrappedReducerType const& wrapped_reducer,
typename WrappedReducerType::value_type& value) const noexcept {
(void)wrapped_reducer;
(void)value;

KOKKOS_IF_ON_DEVICE(
(typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE, TeamPolicy<Cuda>,
ReducerType, typename ReducerType::value_type>::Reducer
wrapped_reducer(reducer);
cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);
reducer.reference() = value;))
(cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);))
}

//--------------------------------------------------------------------------
Expand Down Expand Up @@ -260,23 +273,42 @@ class CudaTeamMember {
//----------------------------------------

template <typename ReducerType>
KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer<ReducerType>::value>
KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer_v<ReducerType>>
vector_reduce(ReducerType const& reducer) {
vector_reduce(reducer, reducer.reference());
}

template <typename ReducerType>
KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer<ReducerType>::value>
KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer_v<ReducerType>>
vector_reduce(ReducerType const& reducer,
typename ReducerType::value_type& value) {
(void)reducer;
(void)value;

KOKKOS_IF_ON_DEVICE(
(typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE, TeamPolicy<Cuda>,
ReducerType, typename ReducerType::value_type>::Reducer
wrapped_reducer(reducer);

impl_vector_reduce(wrapped_reducer, value);
reducer.reference() = value;))
}

template <typename WrappedReducerType>
KOKKOS_INLINE_FUNCTION static std::enable_if_t<
is_reducer_v<WrappedReducerType>>
impl_vector_reduce(WrappedReducerType const& wrapped_reducer,
typename WrappedReducerType::value_type& value) {
(void)wrapped_reducer;
(void)value;

KOKKOS_IF_ON_DEVICE(
(if (blockDim.x == 1) return;

// Intra vector lane shuffle reduction:
typename ReducerType::value_type tmp(value);
typename ReducerType::value_type tmp2 = tmp;
typename WrappedReducerType::value_type tmp(value);
typename WrappedReducerType::value_type tmp2 = tmp;

unsigned mask =
blockDim.x == 32
Expand All @@ -287,7 +319,7 @@ class CudaTeamMember {
for (int i = blockDim.x; (i >>= 1);) {
Impl::in_place_shfl_down(tmp2, tmp, i, blockDim.x, mask);
if ((int)threadIdx.x < i) {
reducer.join(tmp, tmp2);
wrapped_reducer.join(&tmp, &tmp2);
}
}

Expand All @@ -297,7 +329,7 @@ class CudaTeamMember {
// and thus different threads could have different results.

Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x, mask);
value = tmp2; reducer.reference() = tmp2;))
value = tmp2;))
}

//----------------------------------------
Expand Down Expand Up @@ -487,14 +519,21 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
iType, Impl::CudaTeamMember>& loop_boundaries,
const Closure& closure, const ReducerType& reducer) {
KOKKOS_IF_ON_DEVICE(
(typename ReducerType::value_type value;
(using value_type = typename ReducerType::value_type;
using functor_analysis_type = typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE,
TeamPolicy<typename Impl::CudaTeamMember::execution_space>,
ReducerType, value_type>;
using wrapped_reducer_type = typename functor_analysis_type::Reducer;

reducer.init(value);
wrapped_reducer_type wrapped_reducer(reducer); value_type value;
wrapped_reducer.init(&value);

for (iType i = loop_boundaries.start + threadIdx.y;
i < loop_boundaries.end; i += blockDim.y) { closure(i, value); }

loop_boundaries.member.team_reduce(reducer, value);))
loop_boundaries.member.impl_team_reduce(wrapped_reducer, value);
wrapped_reducer.final(&value); reducer.reference() = value;))
// Avoid bogus warning about reducer value being uninitialized with combined
// reducers
KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure;
Expand All @@ -518,16 +557,25 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
(void)loop_boundaries;
(void)closure;
(void)result;

KOKKOS_IF_ON_DEVICE(
(ValueType val; Kokkos::Sum<ValueType> reducer(val);
(using functor_analysis_type = typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE,
TeamPolicy<typename Impl::CudaTeamMember::execution_space>, Closure,
ValueType>;
using wrapped_reducer_type = typename functor_analysis_type::Reducer;
using value_type = typename wrapped_reducer_type::value_type;

reducer.init(reducer.reference());
wrapped_reducer_type wrapped_reducer(closure); value_type value{};
wrapped_reducer.init(&value);

for (iType i = loop_boundaries.start + threadIdx.y;
i < loop_boundaries.end; i += blockDim.y) { closure(i, val); }
i < loop_boundaries.end; i += blockDim.y) { closure(i, value); }

loop_boundaries.member.team_reduce(reducer, val);
result = reducer.reference();))
loop_boundaries.member.impl_team_reduce(wrapped_reducer, value);
wrapped_reducer.final(&value); result = value;

))
}

template <typename iType, class Closure>
Expand All @@ -548,16 +596,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
iType, Impl::CudaTeamMember>& loop_boundaries,
const Closure& closure, const ReducerType& reducer) {
KOKKOS_IF_ON_DEVICE((typename ReducerType::value_type value;
reducer.init(value);
KOKKOS_IF_ON_DEVICE(
(using value_type = typename ReducerType::value_type;
using functor_analysis_type = typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE,
TeamPolicy<typename Impl::CudaTeamMember::execution_space>,
ReducerType, value_type>;
using wrapped_reducer_type = typename functor_analysis_type::Reducer;

for (iType i = loop_boundaries.start +
threadIdx.y * blockDim.x + threadIdx.x;
i < loop_boundaries.end;
i += blockDim.y * blockDim.x) { closure(i, value); }
wrapped_reducer_type wrapped_reducer(reducer); value_type value;
wrapped_reducer.init(&value);

for (iType i =
loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x;
i < loop_boundaries.end;
i += blockDim.y * blockDim.x) { closure(i, value); }

loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value);
loop_boundaries.member.impl_team_reduce(wrapped_reducer, value);

wrapped_reducer.final(&value); reducer.reference() = value;))

loop_boundaries.member.vector_reduce(reducer, value);
loop_boundaries.member.team_reduce(reducer, value);))
// Avoid bogus warning about reducer value being uninitialized with combined
// reducers
KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure;
Expand All @@ -573,18 +632,27 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
(void)loop_boundaries;
(void)closure;
(void)result;
KOKKOS_IF_ON_DEVICE((ValueType val; Kokkos::Sum<ValueType> reducer(val);

reducer.init(reducer.reference());
KOKKOS_IF_ON_DEVICE(
(using functor_analysis_type = typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE,
TeamPolicy<typename Impl::CudaTeamMember::execution_space>, Closure,
ValueType>;
using wrapped_reducer_type = typename functor_analysis_type::Reducer;
using value_type = typename wrapped_reducer_type::value_type;

for (iType i = loop_boundaries.start +
threadIdx.y * blockDim.x + threadIdx.x;
i < loop_boundaries.end;
i += blockDim.y * blockDim.x) { closure(i, val); }
wrapped_reducer_type wrapped_reducer(closure); value_type value;
wrapped_reducer.init(&value);

loop_boundaries.member.vector_reduce(reducer);
loop_boundaries.member.team_reduce(reducer);
result = reducer.reference();))
for (iType i =
loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x;
i < loop_boundaries.end;
i += blockDim.y * blockDim.x) { closure(i, value); }

loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value);
loop_boundaries.member.impl_team_reduce(wrapped_reducer, value);

wrapped_reducer.final(&value); result = value;))
}

//----------------------------------------------------------------------------
Expand Down Expand Up @@ -632,13 +700,22 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
Closure const& closure, ReducerType const& reducer) {
KOKKOS_IF_ON_DEVICE((

reducer.init(reducer.reference());
using value_type = typename ReducerType::value_type;
using functor_analysis_type = typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE,
TeamPolicy<typename Impl::CudaTeamMember::execution_space>,
ReducerType, value_type>;
using wrapped_reducer_type = typename functor_analysis_type::Reducer;

wrapped_reducer_type wrapped_reducer(reducer); value_type value;
wrapped_reducer.init(&value);

for (iType i = loop_boundaries.start + threadIdx.x;
i < loop_boundaries.end;
i += blockDim.x) { closure(i, reducer.reference()); }
i < loop_boundaries.end; i += blockDim.x) { closure(i, value); }

Impl::CudaTeamMember::vector_reduce(reducer);
Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value);

wrapped_reducer.final(&value); reducer.reference() = value;

))
// Avoid bogus warning about reducer value being uninitialized with combined
Expand Down Expand Up @@ -667,15 +744,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
(void)loop_boundaries;
(void)closure;
(void)result;
KOKKOS_IF_ON_DEVICE(
(result = ValueType();

for (iType i = loop_boundaries.start + threadIdx.x;
i < loop_boundaries.end; i += blockDim.x) { closure(i, result); }
KOKKOS_IF_ON_DEVICE((

using functor_analysis_type = typename Impl::FunctorAnalysis<
Impl::FunctorPatternInterface::REDUCE,
TeamPolicy<typename Impl::CudaTeamMember::execution_space>, Closure,
ValueType>;
using wrapped_reducer_type = typename functor_analysis_type::Reducer;
using value_type = typename wrapped_reducer_type::value_type;

Impl::CudaTeamMember::vector_reduce(Kokkos::Sum<ValueType>(result));
wrapped_reducer_type wrapped_reducer(closure); value_type value;
wrapped_reducer.init(&value);

))
for (iType i = loop_boundaries.start + threadIdx.x;
i < loop_boundaries.end; i += blockDim.x) { closure(i, value); }

Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value);
wrapped_reducer.final(&value); result = value;

))
}

//----------------------------------------------------------------------------
Expand Down
Loading

0 comments on commit 4aae49f

Please sign in to comment.