From a8c4a37805f042684add6a02c3dcda3cf6b6d1e0 Mon Sep 17 00:00:00 2001
From: Atell Krasnopolski
Date: Mon, 18 Mar 2024 13:22:18 +0100
Subject: [PATCH] Add Kokkos unittests

---
 unittests/Kokkos/CMakeLists.txt     |   4 +-
 unittests/Kokkos/ParallelFor.cpp    |  87 +++++++
 unittests/Kokkos/ParallelReduce.cpp |  87 +++++++
 unittests/Kokkos/TestUtils.hpp      |  21 ++
 unittests/Kokkos/ViewBasics.cpp     | 159 +++++++++++++
 unittests/Kokkos/parallel_for.cpp   |  21 --
 unittests/Kokkos/parallel_sum.hpp   | 348 ++++++++++++++++++++++++++++
 7 files changed, 705 insertions(+), 22 deletions(-)
 create mode 100644 unittests/Kokkos/ParallelFor.cpp
 create mode 100644 unittests/Kokkos/ParallelReduce.cpp
 create mode 100644 unittests/Kokkos/TestUtils.hpp
 create mode 100644 unittests/Kokkos/ViewBasics.cpp
 delete mode 100644 unittests/Kokkos/parallel_for.cpp
 create mode 100644 unittests/Kokkos/parallel_sum.hpp

diff --git a/unittests/Kokkos/CMakeLists.txt b/unittests/Kokkos/CMakeLists.txt
index 2f6ee89dc..941f3671b 100644
--- a/unittests/Kokkos/CMakeLists.txt
+++ b/unittests/Kokkos/CMakeLists.txt
@@ -1,6 +1,8 @@
 add_clad_unittest(KokkosTests
-  parallel_for.cpp
   main.cpp
+  ViewBasics.cpp
+  ParallelReduce.cpp
+  ParallelFor.cpp
 )
 
 # If llvm does not require rtti, kokkos does.
diff --git a/unittests/Kokkos/ParallelFor.cpp b/unittests/Kokkos/ParallelFor.cpp
new file mode 100644
index 000000000..8e28031bc
--- /dev/null
+++ b/unittests/Kokkos/ParallelFor.cpp
@@ -0,0 +1,87 @@
+#include
+#include "clad/Differentiator/Differentiator.h"
+#include "gtest/gtest.h"
+// #include "TestUtils.hpp"
+#include "parallel_sum.hpp"
+
+TEST(ParallelFor, HelloWorldLambdaLoopForward) { // NOTE: data race - all 5 iterations write the captured res
+  // // check finite difference and forward mode similarity
+  // const double eps = 1e-5;
+  // const double tau = 1e-6; // tolerance
+
+  std::function _f = [](double x) {
+    double res = 0.;
+    Kokkos::parallel_for("HelloWorld-forward", 5, [&res, x](const int i){ res = x*x; }); // this is not a KOKKOS_LAMBDA because it must capture res by reference
+    // res = x*x;
+    return res;
+  };
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_diff = clad::differentiate(_f, 0/*x*/);
+  // for (double x = -2; x <= 2; x += 1) {
+  //   double f_diff_ex = f_diff.execute(x);
+  //   double dx_f_FD = finite_difference_tangent(_f, x, eps);
+  //   EXPECT_NEAR(f_diff_ex, dx_f_FD, abs(tau*dx_f_FD));
+  // }
+}
+
+TEST(ParallelFor, HelloWorldLambdaLoopReverse) { // NOTE: data race - all 5 iterations write the captured res
+  // // check finite difference and reverse mode similarity
+  // const double eps = 1e-5;
+  // const double tau = 1e-6; // tolerance
+
+  std::function _f = [](double x) {
+    double res = 0.;
+    Kokkos::parallel_for("HelloWorld-reverse", 5, [&res, x](const int i){ res = x*x; }); // this is not a KOKKOS_LAMBDA because it must capture res by reference
+    // res = x*x;
+    return res;
+  };
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_grad = clad::gradient(_f);
+  // for (double x = -2; x <= 2; x += 1) {
+  //   double dx_f_FD = finite_difference_tangent(_f, x, eps);
+  //   double dx = 0; // zero-initialised, same as in ParallelPolynomialReverse
+  //   f_grad.execute(x, &dx);
+  //   EXPECT_NEAR(dx_f_FD, dx, abs(tau*dx));
+  // }
+}
+
+double parallel_polynomial_for(double x) { // NOTE: data race - all 5 iterations accumulate into res(0)
+  Kokkos::View res("res");
+  res(0) = 0;
+  Kokkos::parallel_for("polycalc", 5, KOKKOS_LAMBDA(const int i) {
+    res(0) += pow(x, i+1)/(i+1);
+  });
+  // for (int i = 0; i < 5; ++i) { // serial equivalent of the 5 iterations above
+  //   res(0) += pow(x, i+1)/(i+1);
+  // }
+  return res(0);
+}
+
+TEST(ParallelFor, ParallelPolynomialForward) {
+  // // check true derivative and forward mode similarity
+  // const double tau = 1e-5; // tolerance
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_diff = clad::differentiate(parallel_polynomial_for, "x");
+  // for (double x = -2; x <= 2; x += 1) {
+  //   double f_diff_ex = f_diff.execute(x);
+  //   double dx_f_true = parallel_polynomial_true_derivative(x);
+  //   EXPECT_NEAR(f_diff_ex, dx_f_true, abs(tau*dx_f_true));
+  // }
+}
+
+TEST(ParallelFor, ParallelPolynomialReverse) {
+  // // check true derivative and reverse mode similarity
+  // const double tau = 1e-5; // tolerance
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_grad = clad::gradient(parallel_polynomial_for);
+  // for (double x = -2; x <= 2; x += 1) {
+  //   double dx_f_true = parallel_polynomial_true_derivative(x);
+  //   double dx = 0;
+  //   f_grad.execute(x, &dx);
+  //   EXPECT_NEAR(dx_f_true, dx, abs(tau*dx));
+  // }
+}
\ No newline at end of file
diff --git a/unittests/Kokkos/ParallelReduce.cpp b/unittests/Kokkos/ParallelReduce.cpp
new file mode 100644
index 000000000..95f1ac2b5
--- /dev/null
+++ b/unittests/Kokkos/ParallelReduce.cpp
@@ -0,0 +1,87 @@
+#include
+#include "clad/Differentiator/Differentiator.h"
+#include "gtest/gtest.h"
+// #include "TestUtils.hpp"
+#include "parallel_sum.hpp"
+
+TEST(ParallelReduce, HelloWorldLambdaLoopForward) {
+  // // check finite difference and forward mode similarity
+  // const double eps = 1e-5;
+  // const double tau = 1e-6; // tolerance
+
+  std::function _f = [](double x) {
+    double res = 0.;
+    Kokkos::parallel_reduce("HelloWorld-forward", 5, KOKKOS_LAMBDA(const int& i, double& _res){ _res += x; }, res);
+    // res = 5*x;
+    return res;
+  };
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_diff = clad::differentiate(_f, 0/*x*/);
+  // for (double x = -2; x <= 2; x += 1) {
+  //   double f_diff_ex = f_diff.execute(x);
+  //   double dx_f_FD = finite_difference_tangent(_f, x, eps);
+  //   EXPECT_NEAR(f_diff_ex, dx_f_FD, abs(tau*dx_f_FD));
+  // }
+}
+
+TEST(ParallelReduce, HelloWorldLambdaLoopReverse) {
+  // // check finite difference and reverse mode similarity
+  // const double eps = 1e-5;
+  // const double tau = 1e-6; // tolerance
+
+  std::function _f = [](double x) {
+    double res = 0.;
+    Kokkos::parallel_reduce("HelloWorld-reverse", 5, KOKKOS_LAMBDA(const int& i, double& _res){ _res += x; }, res);
+    // res = 5*x;
+    return res;
+  };
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_grad = clad::gradient(_f);
+  // for (double x = -2; x <= 2; x += 1) {
+  //   double dx_f_FD = finite_difference_tangent(_f, x, eps);
+  //   double dx = 0; // zero-initialised, same as in ParallelPolynomialReverse
+  //   f_grad.execute(x, &dx);
+  //   EXPECT_NEAR(dx_f_FD, dx, abs(tau*dx));
+  // }
+}
+
+double parallel_polynomial_reduce(double x) {
+  Kokkos::View res("res");
+  res(0) = 0;
+  Kokkos::parallel_reduce("polycalc", 5, KOKKOS_LAMBDA(const int& i, double& _res) {
+    _res += pow(x, i+1)/(i+1);
+  }, res(0));
+  // for (int i = 0; i < 5; ++i) { // serial equivalent of the 5 iterations above
+  //   res(0) += pow(x, i+1)/(i+1);
+  // }
+  return res(0);
+}
+
+TEST(ParallelReduce, ParallelPolynomialForward) {
+  // // check true derivative and forward mode similarity
+  // const double tau = 1e-5; // tolerance
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_diff = clad::differentiate(parallel_polynomial_reduce, "x");
+  // for (double x = -2; x <= 2; x += 1) {
+  //   double f_diff_ex = f_diff.execute(x);
+  //   double dx_f_true = parallel_polynomial_true_derivative(x);
+  //   EXPECT_NEAR(f_diff_ex, dx_f_true, abs(tau*dx_f_true));
+  // }
+}
+
+TEST(ParallelReduce, ParallelPolynomialReverse) {
+  // // check true derivative and reverse mode similarity
+  // const double tau = 1e-5; // tolerance
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_grad = clad::gradient(parallel_polynomial_reduce);
+  // for (double x = -2; x <= 2; x += 1) {
+  //   double dx_f_true = parallel_polynomial_true_derivative(x);
+  //   double dx = 0;
+  //   f_grad.execute(x, &dx);
+  //   EXPECT_NEAR(dx_f_true, dx, abs(tau*dx));
+  // }
+}
\ No newline at end of file
diff --git a/unittests/Kokkos/TestUtils.hpp b/unittests/Kokkos/TestUtils.hpp
new file mode 100644
index 000000000..cb72905b7
--- /dev/null
+++ b/unittests/Kokkos/TestUtils.hpp
@@ -0,0 +1,21 @@
+// Shared helpers for the Kokkos unit tests
+
+#ifndef KOKKOS_UNITTEST_UTILS
+#define KOKKOS_UNITTEST_UTILS
+#include <functional>
+template // comparison with the finite difference approx. has been tested in the initial PR for Kokkos-aware Clad by Kim Liegeois
+T finite_difference_tangent(std::function func, const T& x, const T& epsilon) {
+  return (func(x+epsilon)-func(x-epsilon)) / (2 * epsilon);
+}
+
+inline double parallel_polynomial_true_derivative(double x) { // 1 + x + ... + x^4: the true derivative of the 5-term polynomial tested in ParallelFor.cpp and ParallelReduce.cpp
+  double res = 0;
+  double x_c = 1;
+  for (unsigned i = 0; i < 5; ++i) {
+    res += x_c;
+    x_c *= x;
+  }
+  return res;
+}
+
+#endif
\ No newline at end of file
diff --git a/unittests/Kokkos/ViewBasics.cpp b/unittests/Kokkos/ViewBasics.cpp
new file mode 100644
index 000000000..2733348f0
--- /dev/null
+++ b/unittests/Kokkos/ViewBasics.cpp
@@ -0,0 +1,159 @@
+// Very basic Kokkos::View usage test that should work by all means
+// inspired by https://github.com/kliegeois/clad/blob/kokkos-PR/unittests/Kokkos/view_access.cpp
+// it has been modified to match gtest guidelines and improve readability
+
+#include
+#include "clad/Differentiator/Differentiator.h"
+#include "gtest/gtest.h"
+#include "TestUtils.hpp"
+#include "parallel_sum.hpp"
+
+double f(double x, double y) {
+  const int N = 2;
+
+  Kokkos::View a("a", N);
+  Kokkos::View b("b", N);
+
+  a(0,0) = x;
+  b(0,0) = y*x;
+
+  return a(0,0) + a(0,0)*b(0,0) + b(0,0);
+}
+
+TEST(ViewBasics, TestAccessForward) {
+  // // check finite difference and forward mode similarity
+  // const double eps = 1e-5;
+  // const double tau = 1e-6; // tolerance
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_x = clad::differentiate(f, "x");
+  // for (double y = 3; y <= 5; y += 1) {
+  //   std::function f_tmp = [y](double t){ return f(t, y); };
+  //   for (double x = 3; x <= 5; x += 1) {
+  //     double f_x_ex = f_x.execute(x, y);
+  //     double dx_f_FD = finite_difference_tangent(f_tmp, x, eps);
+  //     EXPECT_NEAR(f_x_ex, dx_f_FD, abs(tau*dx_f_FD));
+  //   }
+  // }
+}
+
+TEST(ViewBasics, TestAccessReverse) {
+  // // check finite difference and reverse mode similarity
+  // const double eps = 1e-5;
+  // const double tau = 1e-6; // tolerance
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_grad_exe = clad::gradient(f);
+  // for (double y = 3; y <= 5; y += 1) {
+  //   std::function f_tmp = [y](double t){ return f(t, y); };
+  //   for (double x = 3; x <= 5; x += 1) {
+  //     double dx_f_FD = finite_difference_tangent(f_tmp, x, eps);
+  //     double dx = 0, dy = 0;
+  //     f_grad_exe.execute(x, y, &dx, &dy);
+  //     EXPECT_NEAR(dx_f_FD, dx, abs(tau*dx));
+  //   }
+  // }
+}
+
+double f_2(double x, double y) {
+  const int N = 2;
+
+  Kokkos::View a("a", N);
+  Kokkos::View b("b", N);
+
+  Kokkos::deep_copy(a, 3*x+y);
+  b(0,0) = x;
+  Kokkos::deep_copy(b, a);
+
+  b(0,0) = b(0,0) + a(0,0) * b(0,0);
+
+  return a(0,0); // derivative wrt x is constantly 3
+}
+
+TEST(ViewBasics, TestDeepCopyForward) {
+  // // check finite difference and forward mode similarity
+  // const double eps = 1e-5;
+  // const double tau = 1e-6; // tolerance
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_x = clad::differentiate(f_2, "x");
+  // for (double y = 3; y <= 5; y += 1) {
+  //   std::function f_tmp = [y](double t){ return f_2(t, y); };
+  //   for (double x = 3; x <= 5; x += 1) {
+  //     double f_x_ex = f_x.execute(x, y);
+  //     double dx_f_FD = finite_difference_tangent(f_tmp, x, eps);
+  //     EXPECT_NEAR(f_x_ex, dx_f_FD, abs(tau*dx_f_FD));
+  //   }
+  // }
+}
+
+TEST(ViewBasics, TestDeepCopyReverse) {
+  // // check finite difference and reverse mode similarity
+  // const double eps = 1e-5;
+  // const double tau = 1e-6; // tolerance
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_grad_exe = clad::gradient(f_2);
+  // for (double y = 3; y <= 5; y += 1) {
+  //   std::function f_tmp = [y](double t){ return f_2(t, y); };
+  //   for (double x = 3; x <= 5; x += 1) {
+  //     double dx_f_FD = finite_difference_tangent(f_tmp, x, eps);
+  //     double dx = 0, dy = 0;
+  //     f_grad_exe.execute(x, y, &dx, &dy);
+  //     EXPECT_NEAR(dx_f_FD, dx, abs(tau*dx));
+  //   }
+  // }
+}
+
+double f_3(double x, double y) {
+  const int N = 2;
+
+  Kokkos::View a("a", N);
+  Kokkos::View b("b", N);
+
+  Kokkos::deep_copy(a, 3*y+x+50);
+  b(1) = x*y;
+  Kokkos::deep_copy(b, a);
+
+  b(1) = b(1) + a(0) * b(1);
+
+  a(1) = x*x*x;
+  a(0) += a(1);
+
+  return a(0); // derivative of this wrt y is constantly 3
+}
+
+TEST(ViewBasics, TestDeepCopy2Forward) {
+  // // check finite difference and forward mode similarity
+  // const double eps = 1e-5;
+  // const double tau = 1e-6; // tolerance
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_y = clad::differentiate(f_3, "y");
+  // for (double x = 3; x <= 5; x += 1) {
+  //   std::function f_tmp = [x](double t){ return f_3(x, t); };
+  //   for (double y = 3; y <= 5; y += 1) {
+  //     double f_y_ex = f_y.execute(x, y);
+  //     double dy_f_FD = finite_difference_tangent(f_tmp, y, eps);
+  //     EXPECT_NEAR(f_y_ex, dy_f_FD, abs(tau*dy_f_FD));
+  //   }
+  // }
+}
+
+TEST(ViewBasics, TestDeepCopy2Reverse) {
+  // // check finite difference and reverse mode similarity
+  // const double eps = 1e-5;
+  // const double tau = 1e-6; // tolerance
+
+  // // TODO: uncomment this once it has been implemented
+  // auto f_grad_exe = clad::gradient(f_3);
+  // for (double x = 3; x <= 5; x += 1) {
+  //   std::function f_tmp = [x](double t){ return f_3(x, t); };
+  //   for (double y = 3; y <= 5; y += 1) {
+  //     double dy_f_FD = finite_difference_tangent(f_tmp, y, eps);
+  //     double dx = 0, dy = 0;
+  //     f_grad_exe.execute(x, y, &dx, &dy);
+  //     EXPECT_NEAR(dy_f_FD, dy, abs(tau*dy));
+  //   }
+  // }
+}
\ No newline at end of file
diff --git a/unittests/Kokkos/parallel_for.cpp b/unittests/Kokkos/parallel_for.cpp
deleted file mode 100644
index addade30b..000000000
--- a/unittests/Kokkos/parallel_for.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include
-
-#include "gtest/gtest.h"
-
-#include "clad/Differentiator/Differentiator.h"
-
-struct hello_world_pow2 {
-  double x = 0.;
-  // double result = 0.;
-  KOKKOS_INLINE_FUNCTION void operator()(const int i) const {
-    // result = x * x;
-  }
-};
-
-TEST(parallel_for, HelloWorldFunctor) {
-  hello_world_pow2 hw;
-  hw.x = 2;
-  Kokkos::parallel_for("HelloWorld", 15, hw);
-  // EXPECT_EQ();
-  // FIXME: Add the calls to clad::differentiate/gradient...
-}
diff --git a/unittests/Kokkos/parallel_sum.hpp b/unittests/Kokkos/parallel_sum.hpp
new file mode 100644
index 000000000..fdcb493ea
--- /dev/null
+++ b/unittests/Kokkos/parallel_sum.hpp
@@ -0,0 +1,348 @@
+// source: https://github.com/kliegeois/clad/blob/kokkos-PR/unittests/Kokkos/parallel_sum.hpp
+
+#ifndef KOKKOS_UNITTEST_PARALLELSUM
+#define KOKKOS_UNITTEST_PARALLELSUM
+
+#include
+
+namespace kokkos_builtin_derivative {
+
+// Parallel sum: ViewSum::execute reduces all entries of v and adds the total to result
+
+template
+struct ViewSum;
+
+template
+struct ViewSum {
+
+  template
+  static void execute(ResultT& result, const Viewtype& v, const ExecSpace space = ExecSpace()) {
+
+    using policy_type = Kokkos::RangePolicy>;
+    using value_type = typename Viewtype::value_type;
+
+    value_type sum;
+
+    Kokkos::parallel_reduce(
+      "ViewSum-1D",
+      policy_type(space, 0, v.extent(0)),
+      KOKKOS_LAMBDA (
+        const iType& i0,
+        value_type& update) {
+          update += v(i0);
+      },
+      sum );
+
+    result += sum;
+  }
+};
+
+template
+struct ViewSum {
+
+  template
+  static void execute(ResultT& result, const Viewtype& v, const ExecSpace space = ExecSpace()) {
+
+    static const Kokkos::Iterate outer_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::outer_iteration_pattern;
+    static const Kokkos::Iterate inner_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::inner_iteration_pattern;
+    using iterate_type =
+        Kokkos::Rank<2, outer_iteration_pattern, inner_iteration_pattern>;
+    using policy_type =
+        Kokkos::MDRangePolicy>;
+    using value_type = typename Viewtype::value_type;
+
+    value_type sum;
+
+    Kokkos::parallel_reduce(
+      "ViewSum-2D",
+      policy_type(space, {0, 0}, {v.extent(0), v.extent(1)}),
+      KOKKOS_LAMBDA (
+        const iType& i0,
+        const iType& i1,
+        value_type& update) {
+          update += v(i0, i1);
+      },
+      sum );
+
+    result += sum;
+  }
+};
+
+template
+struct ViewSum {
+
+  template
+  static void execute(ResultT& result, const Viewtype& v, const ExecSpace space = ExecSpace()) {
+
+    static const Kokkos::Iterate outer_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::outer_iteration_pattern;
+    static const Kokkos::Iterate inner_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::inner_iteration_pattern;
+    using iterate_type =
+        Kokkos::Rank<3, outer_iteration_pattern, inner_iteration_pattern>;
+    using policy_type =
+        Kokkos::MDRangePolicy>;
+    using value_type = typename Viewtype::value_type;
+
+    value_type sum;
+
+    Kokkos::parallel_reduce(
+      "ViewSum-3D",
+      policy_type(space, {0, 0, 0}, {v.extent(0), v.extent(1), v.extent(2)}),
+      KOKKOS_LAMBDA (
+        const iType& i0,
+        const iType& i1,
+        const iType& i2,
+        value_type& update) {
+          update += v(i0, i1, i2);
+      },
+      sum );
+
+    result += sum;
+  }
+};
+
+// Parallel add: ViewAdd::execute adds one value to every entry of v, executeView adds another view entry by entry
+
+template
+struct ViewAdd;
+
+template
+struct ViewAdd {
+
+  template
+  static void execute(const Viewtype& v, ResultT& update, const ExecSpace space = ExecSpace()) {
+
+    using policy_type = Kokkos::RangePolicy>;
+
+    Kokkos::parallel_for(
+      "ViewAdd-1D",
+      policy_type(space, 0, v.extent(0)),
+      KOKKOS_LAMBDA (
+        const iType& i0) {
+          v(i0) += update;
+      });
+  }
+
+  template
+  static void executeView(const Viewtype& v, ResultT& update, const ExecSpace space = ExecSpace()) {
+
+    using policy_type = Kokkos::RangePolicy>;
+
+    Kokkos::parallel_for(
+      "ViewAdd-1D",
+      policy_type(space, 0, v.extent(0)),
+      KOKKOS_LAMBDA (
+        const iType& i0) {
+          v(i0) += update(i0);
+      });
+  }
+};
+
+template
+struct ViewAdd {
+
+  template
+  static void execute(const Viewtype& v, ResultT& update, const ExecSpace space = ExecSpace()) {
+
+    static const Kokkos::Iterate outer_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::outer_iteration_pattern;
+    static const Kokkos::Iterate inner_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::inner_iteration_pattern;
+    using iterate_type =
+        Kokkos::Rank<2, outer_iteration_pattern, inner_iteration_pattern>;
+    using policy_type =
+        Kokkos::MDRangePolicy>;
+
+    Kokkos::parallel_for(
+      "ViewAdd-2D",
+      policy_type(space, {0, 0}, {v.extent(0), v.extent(1)}),
+      KOKKOS_LAMBDA (
+        const iType& i0,
+        const iType& i1) {
+          v(i0, i1) += update;
+      });
+  }
+
+  template
+  static void executeView(const Viewtype& v, ResultT& update, const ExecSpace space = ExecSpace()) {
+
+    static const Kokkos::Iterate outer_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::outer_iteration_pattern;
+    static const Kokkos::Iterate inner_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::inner_iteration_pattern;
+    using iterate_type =
+        Kokkos::Rank<2, outer_iteration_pattern, inner_iteration_pattern>;
+    using policy_type =
+        Kokkos::MDRangePolicy>;
+
+    Kokkos::parallel_for(
+      "ViewAdd-2D",
+      policy_type(space, {0, 0}, {v.extent(0), v.extent(1)}),
+      KOKKOS_LAMBDA (
+        const iType& i0,
+        const iType& i1) {
+          v(i0, i1) += update(i0, i1);
+      });
+  }
+};
+
+template
+struct ViewAdd {
+
+  template
+  static void execute(const Viewtype& v, ResultT& update, const ExecSpace space = ExecSpace()) {
+
+    static const Kokkos::Iterate outer_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::outer_iteration_pattern;
+    static const Kokkos::Iterate inner_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::inner_iteration_pattern;
+    using iterate_type =
+        Kokkos::Rank<3, outer_iteration_pattern, inner_iteration_pattern>;
+    using policy_type =
+        Kokkos::MDRangePolicy>;
+
+    Kokkos::parallel_for(
+      "ViewAdd-3D",
+      policy_type(space, {0, 0, 0}, {v.extent(0), v.extent(1), v.extent(2)}),
+      KOKKOS_LAMBDA (
+        const iType& i0,
+        const iType& i1,
+        const iType& i2) {
+          v(i0, i1, i2) += update;
+      });
+  }
+
+  template
+  static void executeView(const Viewtype& v, ResultT& update, const ExecSpace space = ExecSpace()) {
+
+    static const Kokkos::Iterate outer_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::outer_iteration_pattern;
+    static const Kokkos::Iterate inner_iteration_pattern =
+        Kokkos::layout_iterate_type_selector::inner_iteration_pattern;
+    using iterate_type =
+        Kokkos::Rank<3, outer_iteration_pattern, inner_iteration_pattern>;
+    using policy_type =
+        Kokkos::MDRangePolicy>;
+
+    Kokkos::parallel_for(
+      "ViewAdd-3D",
+      policy_type(space, {0, 0, 0}, {v.extent(0), v.extent(1), v.extent(2)}),
+      KOKKOS_LAMBDA (
+        const iType& i0,
+        const iType& i1,
+        const iType& i2) {
+          v(i0, i1, i2) += update(i0, i1, i2);
+      });
+  }
+};
+
+
+template
+void parallel_sum(typename Kokkos::ViewTraits::value_type &sum, const Kokkos::View A) {
+  using ViewtypeA = Kokkos::View;
+  Kokkos::fence("parallel_sum: pre sum fence");
+  if (A.span_is_contiguous()) {
+
+    using ViewTypeFlat = Kokkos::View<
+        typename ViewtypeA::value_type*, Kokkos::LayoutRight,
+        Kokkos::Device>,
+        Kokkos::MemoryTraits<0>>;
+
+    ViewTypeFlat A_flat(A.data(), A.size());
+    ViewSum::template execute(sum, A_flat);
+  }
+  else {
+    ViewSum::template execute(sum, A);
+  }
+  Kokkos::fence("parallel_sum: post sum fence");
+}
+
+template
+void parallel_sum(const ExecSpace& space, typename Kokkos::ViewTraits::value_type &sum, const Kokkos::View A) {
+  using ViewtypeA = Kokkos::View;
+  space.fence("parallel_sum: pre sum fence");
+  if (A.span_is_contiguous()) {
+
+    using ViewTypeFlat = Kokkos::View<
+        typename ViewtypeA::value_type*, Kokkos::LayoutRight,
+        Kokkos::Device>,
+        Kokkos::MemoryTraits<0>>;
+
+    ViewTypeFlat A_flat(A.data(), A.size());
+    ViewSum::template execute(sum, A_flat, space);
+  }
+  else {
+    ViewSum::template execute(sum, A, space);
+  }
+  space.fence("parallel_sum: post sum fence");
+}
+
+template
+void parallel_sum(Kokkos::View A, typename Kokkos::ViewTraits::const_value_type b) {
+  using ViewtypeA = Kokkos::View;
+  Kokkos::fence("parallel_sum: pre add fence");
+  if (A.span_is_contiguous()) {
+
+    using ViewTypeFlat = Kokkos::View<
+        typename ViewtypeA::value_type*, Kokkos::LayoutRight,
+        Kokkos::Device>,
+        Kokkos::MemoryTraits<0>>;
+
+    ViewTypeFlat A_flat(A.data(), A.size());
+    ViewAdd::template execute(A_flat, b);
+  }
+  else {
+    ViewAdd::template execute(A, b);
+  }
+  Kokkos::fence("parallel_sum: post add fence");
+}
+
+template
+void parallel_sum(const ExecSpace& space, Kokkos::View A, typename Kokkos::ViewTraits::const_value_type b) {
+  using ViewtypeA = Kokkos::View;
+  space.fence("parallel_sum: pre add fence");
+  if (A.span_is_contiguous()) {
+
+    using ViewTypeFlat = Kokkos::View<
+        typename ViewtypeA::value_type*, Kokkos::LayoutRight,
+        Kokkos::Device>,
+        Kokkos::MemoryTraits<0>>;
+
+    ViewTypeFlat A_flat(A.data(), A.size());
+    ViewAdd::template execute(A_flat, b, space);
+  }
+  else {
+    ViewAdd::template execute(A, b, space);
+  }
+  space.fence("parallel_sum: post add fence");
+}
+
+template
+void parallel_sum(Kokkos::View A, const Kokkos::View B) {
+  using ViewtypeA = Kokkos::View;
+  using ViewtypeB = Kokkos::View;
+  Kokkos::fence("parallel_sum: pre add fence");
+
+  ViewAdd::template executeView(A, B);
+
+  Kokkos::fence("parallel_sum: post add fence");
+}
+
+} // namespace kokkos_builtin_derivative
+
+#endif
\ No newline at end of file