diff --git a/include/boost/math/distributions/beta.hpp b/include/boost/math/distributions/beta.hpp
index 6c17ffa1a2..fef991a870 100644
--- a/include/boost/math/distributions/beta.hpp
+++ b/include/boost/math/distributions/beta.hpp
@@ -25,12 +25,15 @@
 #ifndef BOOST_MATH_DIST_BETA_HPP
 #define BOOST_MATH_DIST_BETA_HPP
 
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/tuple.hpp>
 #include <boost/math/distributions/fwd.hpp>
 #include <boost/math/special_functions/beta.hpp> // for beta.
 #include <boost/math/distributions/complement.hpp> // complements.
 #include <boost/math/distributions/detail/common_error_handling.hpp> // error checks
 #include <boost/math/special_functions/fpclassify.hpp> // isnan.
 #include <boost/math/tools/roots.hpp> // for root finding.
+#include <boost/math/policies/policy.hpp>
 
 #if defined (BOOST_MSVC)
 #  pragma warning(push)
@@ -38,8 +41,6 @@
 // in domain_error_imp in error_handling
 #endif
 
-#include <utility>
-
 namespace boost
 {
   namespace math
@@ -48,7 +49,7 @@ namespace boost
     {
       // Common error checking routines for beta distribution functions:
       template <class RealType, class Policy>
-      inline bool check_alpha(const char* function, const RealType& alpha, RealType* result, const Policy& pol)
+      BOOST_MATH_GPU_ENABLED inline bool check_alpha(const char* function, const RealType& alpha, RealType* result, const Policy& pol)
      {
        if(!(boost::math::isfinite)(alpha) || (alpha <= 0))
        {
@@ -61,7 +62,7 @@ namespace boost
      } // bool check_alpha
 
      template <class RealType, class Policy>
-     inline bool check_beta(const char* function, const RealType& beta, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_beta(const char* function, const RealType& beta, RealType* result, const Policy& pol)
      {
        if(!(boost::math::isfinite)(beta) || (beta <= 0))
        {
@@ -74,7 +75,7 @@ namespace boost
      } // bool check_beta
 
      template <class RealType, class Policy>
-     inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol)
      {
        if((p < 0) || (p > 1) || !(boost::math::isfinite)(p))
        {
@@ -87,7 +88,7 @@ namespace boost
      } // bool check_prob
 
      template <class RealType, class Policy>
-     inline bool check_x(const char* function, const RealType& x, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_x(const char* function, const RealType& x, RealType* result, const Policy& pol)
      {
        if(!(boost::math::isfinite)(x) || (x < 0) || (x > 1))
        {
@@ -100,28 +101,28 @@ namespace boost
      } // bool check_x
 
      template <class RealType, class Policy>
-     inline bool check_dist(const char* function, const RealType& alpha, const RealType& beta, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist(const char* function, const RealType& alpha, const RealType& beta, RealType* result, const Policy& pol)
      { // Check both alpha and beta.
        return check_alpha(function, alpha, result, pol) && check_beta(function, beta, result, pol);
      } // bool check_dist
 
      template <class RealType, class Policy>
-     inline bool check_dist_and_x(const char* function, const RealType& alpha, const RealType& beta, RealType x, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist_and_x(const char* function, const RealType& alpha, const RealType& beta, RealType x, RealType* result, const Policy& pol)
      {
        return check_dist(function, alpha, beta, result, pol)
           && beta_detail::check_x(function, x, result, pol);
      } // bool check_dist_and_x
 
      template <class RealType, class Policy>
-     inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_dist_and_prob(const char* function, const RealType& alpha, const RealType& beta, RealType p, RealType* result, const Policy& pol)
      {
        return check_dist(function, alpha, beta, result, pol)
           && check_prob(function, p, result, pol);
      } // bool check_dist_and_prob
 
      template <class RealType, class Policy>
-     inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol)
      {
        if(!(boost::math::isfinite)(mean) || (mean <= 0))
        {
@@ -133,7 +134,7 @@ namespace boost
        return true;
      } // bool check_mean
      template <class RealType, class Policy>
-     inline bool check_variance(const char* function, const RealType& variance, RealType* result, const Policy& pol)
+     BOOST_MATH_GPU_ENABLED inline bool check_variance(const char* function, const RealType& variance, RealType* result, const Policy& pol)
      {
        if(!(boost::math::isfinite)(variance) || (variance <= 0))
        {
@@ -157,7 +158,7 @@ namespace boost
      typedef RealType value_type;
      typedef Policy policy_type;
 
-     beta_distribution(RealType l_alpha = 1, RealType l_beta = 1) : m_alpha(l_alpha), m_beta(l_beta)
+     BOOST_MATH_GPU_ENABLED beta_distribution(RealType l_alpha = 1, RealType l_beta = 1) : m_alpha(l_alpha), m_beta(l_beta)
      {
        RealType result;
        beta_detail::check_dist(
@@ -167,11 +168,11 @@ namespace boost
          &result, Policy());
      } // beta_distribution constructor.
      // Accessor functions:
-     RealType alpha() const
+     BOOST_MATH_GPU_ENABLED RealType alpha() const
      {
        return m_alpha;
      }
-     RealType beta() const
+     BOOST_MATH_GPU_ENABLED RealType beta() const
      { // .
        return m_beta;
      }
@@ -183,11 +184,11 @@ namespace boost
      // http://www.itl.nist.gov/div898/handbook/eda/section3/eda366h.htm
      // http://www.epi.ucdavis.edu/diagnostictests/betabuster.html
 
-     static RealType find_alpha(
+     BOOST_MATH_GPU_ENABLED static RealType find_alpha(
        RealType mean, // Expected value of mean.
        RealType variance) // Expected value of variance.
      {
-       static const char* function = "boost::math::beta_distribution<%1%>::find_alpha";
+       constexpr auto function = "boost::math::beta_distribution<%1%>::find_alpha";
        RealType result = 0; // of error checks.
        if(false == (
@@ -201,11 +202,11 @@ namespace boost
        return mean * (( (mean * (1 - mean)) / variance)- 1);
      } // RealType find_alpha
 
-     static RealType find_beta(
+     BOOST_MATH_GPU_ENABLED static RealType find_beta(
        RealType mean, // Expected value of mean.
        RealType variance) // Expected value of variance.
      {
-       static const char* function = "boost::math::beta_distribution<%1%>::find_beta";
+       constexpr auto function = "boost::math::beta_distribution<%1%>::find_beta";
        RealType result = 0; // of error checks.
if(false == ( @@ -223,12 +224,12 @@ namespace boost // Estimate alpha & beta from either alpha or beta, and x and probability. // Uses for these parameter estimators are unclear. - static RealType find_alpha( + BOOST_MATH_GPU_ENABLED static RealType find_alpha( RealType beta, // from beta. RealType x, // x. RealType probability) // cdf { - static const char* function = "boost::math::beta_distribution<%1%>::find_alpha"; + constexpr auto function = "boost::math::beta_distribution<%1%>::find_alpha"; RealType result = 0; // of error checks. if(false == ( @@ -245,13 +246,13 @@ namespace boost return static_cast(ibeta_inva(beta, x, probability, Policy())); } // RealType find_alpha(beta, a, probability) - static RealType find_beta( + BOOST_MATH_GPU_ENABLED static RealType find_beta( // ibeta_invb(T b, T x, T p); (alpha, x, cdf,) RealType alpha, // alpha. RealType x, // probability x. RealType probability) // probability cdf. { - static const char* function = "boost::math::beta_distribution<%1%>::find_beta"; + constexpr auto function = "boost::math::beta_distribution<%1%>::find_beta"; RealType result = 0; // of error checks. if(false == ( @@ -281,27 +282,27 @@ namespace boost #endif template - inline const std::pair range(const beta_distribution& /* dist */) + BOOST_MATH_GPU_ENABLED inline const boost::math::pair range(const beta_distribution& /* dist */) { // Range of permissible values for random variable x. using boost::math::tools::max_value; - return std::pair(static_cast(0), static_cast(1)); + return boost::math::pair(static_cast(0), static_cast(1)); } template - inline const std::pair support(const beta_distribution& /* dist */) + BOOST_MATH_GPU_ENABLED inline const boost::math::pair support(const beta_distribution& /* dist */) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. - return std::pair(static_cast(0), static_cast(1)); + return boost::math::pair(static_cast(0), static_cast(1)); } template - inline RealType mean(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType mean(const beta_distribution& dist) { // Mean of beta distribution = np. return dist.alpha() / (dist.alpha() + dist.beta()); } // mean template - inline RealType variance(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType variance(const beta_distribution& dist) { // Variance of beta distribution = np(1-p). RealType a = dist.alpha(); RealType b = dist.beta(); @@ -309,9 +310,9 @@ namespace boost } // variance template - inline RealType mode(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType mode(const beta_distribution& dist) { - static const char* function = "boost::math::mode(beta_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mode(beta_distribution<%1%> const&)"; RealType result; if ((dist.alpha() <= 1)) @@ -343,7 +344,7 @@ namespace boost //But WILL be provided by the derived accessor as quantile(0.5). template - inline RealType skewness(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType skewness(const beta_distribution& dist) { BOOST_MATH_STD_USING // ADL of std functions. 
RealType a = dist.alpha(); @@ -352,7 +353,7 @@ namespace boost } // skewness template - inline RealType kurtosis_excess(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const beta_distribution& dist) { RealType a = dist.alpha(); RealType b = dist.beta(); @@ -363,17 +364,17 @@ namespace boost } // kurtosis_excess template - inline RealType kurtosis(const beta_distribution& dist) + BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const beta_distribution& dist) { return 3 + kurtosis_excess(dist); } // kurtosis template - inline RealType pdf(const beta_distribution& dist, const RealType& x) + BOOST_MATH_GPU_ENABLED inline RealType pdf(const beta_distribution& dist, const RealType& x) { // Probability Density/Mass Function. BOOST_FPU_EXCEPTION_GUARD - static const char* function = "boost::math::pdf(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::pdf(beta_distribution<%1%> const&, %1%)"; BOOST_MATH_STD_USING // for ADL of std functions @@ -428,11 +429,11 @@ namespace boost } // pdf template - inline RealType cdf(const beta_distribution& dist, const RealType& x) + BOOST_MATH_GPU_ENABLED inline RealType cdf(const beta_distribution& dist, const RealType& x) { // Cumulative Distribution Function beta. BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; RealType a = dist.alpha(); RealType b = dist.beta(); @@ -459,12 +460,12 @@ namespace boost } // beta cdf template - inline RealType cdf(const complemented2_type, RealType>& c) + BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { // Complemented Cumulative Distribution Function beta. BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::cdf(beta_distribution<%1%> const&, %1%)"; RealType const& x = c.param; beta_distribution const& dist = c.dist; @@ -495,7 +496,7 @@ namespace boost } // beta cdf template - inline RealType quantile(const beta_distribution& dist, const RealType& p) + BOOST_MATH_GPU_ENABLED inline RealType quantile(const beta_distribution& dist, const RealType& p) { // Quantile or Percent Point beta function or // Inverse Cumulative probability distribution function CDF. // Return x (0 <= x <= 1), @@ -505,7 +506,7 @@ namespace boost // will be less than or equal to that value // is whatever probability you supplied as an argument. - static const char* function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; RealType result = 0; // of argument checks: RealType a = dist.alpha(); @@ -530,12 +531,12 @@ namespace boost } // quantile template - inline RealType quantile(const complemented2_type, RealType>& c) + BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { // Complement Quantile or Percent Point beta function . // Return the number of expected x for a given // complement of the probability q. 
- static const char* function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; + constexpr auto function = "boost::math::quantile(beta_distribution<%1%> const&, %1%)"; // // Error checks: diff --git a/include/boost/math/policies/error_handling.hpp b/include/boost/math/policies/error_handling.hpp index ce3f1e7ccd..559e70a2f4 100644 --- a/include/boost/math/policies/error_handling.hpp +++ b/include/boost/math/policies/error_handling.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef BOOST_MATH_HAS_NVRTC @@ -877,20 +878,6 @@ BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, s } //namespace policies -namespace detail{ - -// -// Simple helper function to assist in returning a pair from a single value, -// that value usually comes from one of the error handlers above: -// -template -BOOST_MATH_GPU_ENABLED std::pair pair_from_single(const T& val) BOOST_MATH_NOEXCEPT(T) -{ - return std::make_pair(val, val); -} - -} - #ifdef _MSC_VER # pragma warning(pop) #endif @@ -1039,7 +1026,21 @@ BOOST_MATH_GPU_ENABLED inline void check_root_iterations(const char* function, b } // namespace math } // namespace boost -#endif +#endif // BOOST_MATH_HAS_NVRTC + +namespace boost { namespace math { namespace detail { + +// +// Simple helper function to assist in returning a pair from a single value, +// that value usually comes from one of the error handlers above: +// +template +BOOST_MATH_GPU_ENABLED boost::math::pair pair_from_single(const T& val) BOOST_MATH_NOEXCEPT(T) +{ + return boost::math::make_pair(val, val); +} + +}}} // boost::math::detail #endif // BOOST_MATH_POLICY_ERROR_HANDLING_HPP diff --git a/include/boost/math/special_functions/beta.hpp b/include/boost/math/special_functions/beta.hpp index 00b8e45bf2..27901a1131 100644 --- a/include/boost/math/special_functions/beta.hpp +++ b/include/boost/math/special_functions/beta.hpp @@ -28,14 +28,10 @@ #include #include #include - -#ifndef BOOST_MATH_HAS_NVRTC #include #include #include #include -#include -#endif namespace boost{ namespace math{ @@ -800,7 +796,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, const boost::math::la policies::check_series_iterations("boost::math::ibeta<%1%>(%1%, %1%, %1%) in ibeta_series (without lanczos)", max_iter, pol); return result; } - +#endif // // Continued fraction for the incomplete beta: // @@ -884,7 +880,7 @@ BOOST_MATH_GPU_ENABLED T ibeta_a_step(T a, T b, T x, T y, int k, const Policy& p return prefix; } -#endif + // // This function is only needed for the non-regular incomplete beta, // it computes the delta in: @@ -958,7 +954,6 @@ struct Pn_size #endif }; -#ifndef BOOST_MATH_HAS_GPU_SUPPORT template BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Policy& pol, bool normalised) { @@ -1060,7 +1055,7 @@ BOOST_MATH_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T } return sum; } // template T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Lanczos& l, bool normalised) -#endif + // // For integer arguments we can relate the incomplete beta to the // complement of the binomial distribution cdf and use this finite sum. 
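[Note: the comment above refers to the standard identity I_x(k, n - k + 1) = sum_{j=k}^{n} C(n, j) x^j (1 - x)^(n - j), relating the regularized incomplete beta to a binomial tail probability. A minimal host-side sketch checking that identity through the public API; illustrative only, not part of the patch, and the tolerance 1e-12 is an arbitrary choice:]

// Sketch: ibeta(k, n - k + 1, x) equals the binomial tail sum P(X >= k) for X ~ Binomial(n, x).
#include <boost/math/special_functions/beta.hpp>
#include <boost/math/special_functions/binomial.hpp>
#include <cassert>
#include <cmath>

int main()
{
   const unsigned n = 10, k = 4;
   const double x = 0.3;
   double tail = 0;
   for (unsigned j = k; j <= n; ++j) // the finite sum over the upper binomial terms
      tail += boost::math::binomial_coefficient<double>(n, j) * std::pow(x, j) * std::pow(1 - x, n - j);
   assert(std::fabs(boost::math::ibeta(k, n - k + 1, x) - tail) < 1e-12);
   return 0;
}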
@@ -1130,6 +1125,7 @@ BOOST_MATH_GPU_ENABLED T binomial_ccdf(T n, T k, T x, T y, const Policy& pol) // input range and select the right implementation method for // each domain: // + template BOOST_MATH_GPU_ENABLED T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_derivative) { @@ -1749,12 +1745,7 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type } // namespace math } // namespace boost -// TODO(mborland): Get the ibeta_inv working on NVRTC -#ifndef BOOST_MATH_HAS_NVRTC - #include #include -#endif - #endif // BOOST_MATH_SPECIAL_BETA_HPP diff --git a/include/boost/math/special_functions/binomial.hpp b/include/boost/math/special_functions/binomial.hpp index e776a90bb8..3c49ff30d5 100644 --- a/include/boost/math/special_functions/binomial.hpp +++ b/include/boost/math/special_functions/binomial.hpp @@ -10,20 +10,21 @@ #pragma once #endif +#include +#include #include #include #include #include -#include namespace boost{ namespace math{ template -T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) +BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) { - static_assert(!std::is_integral::value, "Type T must not be an integral type"); + static_assert(!boost::math::is_integral::value, "Type T must not be an integral type"); BOOST_MATH_STD_USING - static const char* function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)"; + constexpr auto function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)"; if(k > n) return policies::raise_domain_error(function, "The binomial coefficient is undefined for k > n, but got k = %1%.", static_cast(k), pol); T result; // LCOV_EXCL_LINE @@ -43,9 +44,9 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) { // Use the beta function: if(k < n - k) - result = static_cast(k * beta(static_cast(k), static_cast(n-k+1), pol)); + result = static_cast(k * boost::math::beta(static_cast(k), static_cast(n-k+1), pol)); else - result = static_cast((n - k) * beta(static_cast(k+1), static_cast(n-k), pol)); + result = static_cast((n - k) * boost::math::beta(static_cast(k+1), static_cast(n-k), pol)); if(result == 0) return policies::raise_overflow_error(function, nullptr, pol); result = 1 / result; @@ -59,7 +60,7 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) // we'll promote to double: // template <> -inline float binomial_coefficient >(unsigned n, unsigned k, const policies::policy<>&) +BOOST_MATH_GPU_ENABLED inline float binomial_coefficient >(unsigned n, unsigned k, const policies::policy<>&) { typedef policies::normalise< policies::policy<>, @@ -71,7 +72,7 @@ inline float binomial_coefficient >(unsigned n, unsign } template -inline T binomial_coefficient(unsigned n, unsigned k) +BOOST_MATH_GPU_ENABLED inline T binomial_coefficient(unsigned n, unsigned k) { return binomial_coefficient(n, k, policies::policy<>()); } diff --git a/include/boost/math/special_functions/cbrt.hpp b/include/boost/math/special_functions/cbrt.hpp index fb05996cf1..7fdf78d014 100644 --- a/include/boost/math/special_functions/cbrt.hpp +++ b/include/boost/math/special_functions/cbrt.hpp @@ -11,15 +11,16 @@ #pragma once #endif -#ifndef __CUDACC_RTC__ - #include + +#ifndef BOOST_MATH_HAS_NVRTC + #include +#include +#include #include #include #include -#include -#include namespace boost{ namespace math{ @@ -174,19 +175,30 @@ BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type cbrt(T z) } // namespace math } // namespace boost -#else +#else // 
Special NVRTC handling namespace boost { namespace math { template -__host__ __device__ T cbrt(T x) +BOOST_MATH_GPU_ENABLED double cbrt(T x) +{ + return ::cbrt(x); +} + +BOOST_MATH_GPU_ENABLED inline float cbrt(float x) +{ + return ::cbrtf(x); +} + +template +BOOST_MATH_GPU_ENABLED double cbrt(T x, const Policy&) { return ::cbrt(x); } -template <> -__host__ __device__ float cbrt(float x) +template +BOOST_MATH_GPU_ENABLED float cbrt(float x, const Policy&) { return ::cbrtf(x); } diff --git a/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp b/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp index 9e30db2a37..aab18f50f1 100644 --- a/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp +++ b/include/boost/math/special_functions/detail/ibeta_inv_ab.hpp @@ -17,17 +17,19 @@ #pragma once #endif -#include -#include +#include #include +#include +#include +#include namespace boost{ namespace math{ namespace detail{ template struct beta_inv_ab_t { - beta_inv_ab_t(T b_, T z_, T p_, bool invert_, bool swap_ab_) : b(b_), z(z_), p(p_), invert(invert_), swap_ab(swap_ab_) {} - T operator()(T a) + BOOST_MATH_GPU_ENABLED beta_inv_ab_t(T b_, T z_, T p_, bool invert_, bool swap_ab_) : b(b_), z(z_), p(p_), invert(invert_), swap_ab(swap_ab_) {} + BOOST_MATH_GPU_ENABLED T operator()(T a) { return invert ? p - boost::math::ibetac(swap_ab ? b : a, swap_ab ? a : b, z, Policy()) @@ -39,7 +41,7 @@ struct beta_inv_ab_t }; template -T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Policy& pol) { BOOST_MATH_STD_USING // mean: @@ -72,7 +74,7 @@ T inverse_negative_binomial_cornish_fisher(T n, T sf, T sfc, T p, T q, const Pol } template -T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, const Policy& pol) +BOOST_MATH_GPU_ENABLED T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, const Policy& pol) { BOOST_MATH_STD_USING // for ADL of std lib math functions // @@ -121,11 +123,11 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, // if((p < q) != swap_ab) { - guess = (std::min)(T(b * 2), T(1)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b * 2), T(1)); } else { - guess = (std::min)(T(b / 2), T(1)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b / 2), T(1)); } } if(n * n * n * u * sf > 0.005) @@ -138,11 +140,11 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, // if((p < q) != swap_ab) { - guess = (std::min)(T(b * 2), T(10)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b * 2), T(10)); } else { - guess = (std::min)(T(b / 2), T(10)); + guess = BOOST_MATH_GPU_SAFE_MIN(T(b / 2), T(10)); } } else @@ -151,8 +153,8 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, // // Max iterations permitted: // - std::uintmax_t max_iter = policies::get_max_root_iterations(); - std::pair r = bracket_and_solve_root(f, guess, factor, swap_ab ? true : false, tol, max_iter, pol); + boost::math::uintmax_t max_iter = policies::get_max_root_iterations(); + boost::math::pair r = bracket_and_solve_root(f, guess, factor, swap_ab ? 
true : false, tol, max_iter, pol); if(max_iter >= policies::get_max_root_iterations()) return policies::raise_evaluation_error("boost::math::ibeta_invab_imp<%1%>(%1%,%1%,%1%)", "Unable to locate the root within a reasonable number of iterations, closest approximation so far was %1%", r.first, pol); return (r.first + r.second) / 2; @@ -161,7 +163,7 @@ T ibeta_inv_ab_imp(const T& b, const T& z, const T& p, const T& q, bool swap_ab, } // namespace detail template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibeta_inva(RT1 b, RT2 x, RT3 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -194,7 +196,7 @@ typename tools::promote_args::type } template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibetac_inva(RT1 b, RT2 x, RT3 q, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -227,7 +229,7 @@ typename tools::promote_args::type } template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibeta_invb(RT1 a, RT2 x, RT3 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -260,7 +262,7 @@ typename tools::promote_args::type } template -typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED typename tools::promote_args::type ibetac_invb(RT1 a, RT2 x, RT3 q, const Policy& pol) { constexpr auto function = "boost::math::ibeta_invb<%1%>(%1%, %1%, %1%)"; @@ -293,28 +295,28 @@ typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inva(RT1 b, RT2 x, RT3 p) { return boost::math::ibeta_inva(b, x, p, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inva(RT1 b, RT2 x, RT3 q) { return boost::math::ibetac_inva(b, x, q, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_invb(RT1 a, RT2 x, RT3 p) { return boost::math::ibeta_invb(a, x, p, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_invb(RT1 a, RT2 x, RT3 q) { return boost::math::ibetac_invb(a, x, q, policies::policy<>()); diff --git a/include/boost/math/special_functions/detail/ibeta_inverse.hpp b/include/boost/math/special_functions/detail/ibeta_inverse.hpp index 9e4fb08d4e..90f6e90705 100644 --- a/include/boost/math/special_functions/detail/ibeta_inverse.hpp +++ b/include/boost/math/special_functions/detail/ibeta_inverse.hpp @@ -11,12 +11,14 @@ #pragma once #endif +#include +#include +#include +#include #include #include -#include #include #include -#include namespace boost{ namespace math{ namespace detail{ @@ -27,12 +29,12 @@ namespace boost{ namespace math{ namespace detail{ template struct temme_root_finder { - temme_root_finder(const T t_, const T a_) : t(t_), a(a_) { + BOOST_MATH_GPU_ENABLED temme_root_finder(const T t_, const T a_) : t(t_), a(a_) { BOOST_MATH_ASSERT( math::tools::epsilon() <= a && !(boost::math::isinf)(a)); } - boost::math::tuple operator()(T x) + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(T x) { BOOST_MATH_STD_USING // ADL of std names @@ -52,7 +54,7 @@ struct temme_root_finder // Section 2. 
// template -T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol) +BOOST_MATH_GPU_ENABLED T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names @@ -138,7 +140,7 @@ T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol) // Section 3. // template -T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy& pol) +BOOST_MATH_GPU_ENABLED T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names @@ -315,7 +317,7 @@ T temme_method_2_ibeta_inverse(T /*a*/, T /*b*/, T z, T r, T theta, const Policy // Section 4. // template -T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol) +BOOST_MATH_GPU_ENABLED T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names @@ -420,10 +422,10 @@ T temme_method_3_ibeta_inverse(T a, T b, T p, T q, const Policy& pol) template struct ibeta_roots { - ibeta_roots(T _a, T _b, T t, bool inv = false) + BOOST_MATH_GPU_ENABLED ibeta_roots(T _a, T _b, T t, bool inv = false) : a(_a), b(_b), target(t), invert(inv) {} - boost::math::tuple operator()(T x) + BOOST_MATH_GPU_ENABLED boost::math::tuple operator()(T x) { BOOST_MATH_STD_USING // ADL of std names @@ -457,7 +459,7 @@ struct ibeta_roots }; template -T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) +BOOST_MATH_GPU_ENABLED T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) { BOOST_MATH_STD_USING // For ADL of math functions. @@ -487,8 +489,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) return p; } // Change things around so we can handle as b == 1 special case below: - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = true; } // @@ -524,8 +526,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) } else if(b > 0.5f) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } } @@ -559,7 +561,7 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) y = -boost::math::expm1(boost::math::log1p(-q, pol) / a, pol); } if(invert) - std::swap(x, y); + BOOST_MATH_GPU_SAFE_SWAP(x, y); if(py) *py = y; return x; @@ -574,12 +576,12 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // if(p > 0.5) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } - T minv = (std::min)(a, b); - T maxv = (std::max)(a, b); + T minv = BOOST_MATH_GPU_SAFE_MIN(a, b); + T maxv = BOOST_MATH_GPU_SAFE_MAX(a, b); if((sqrt(minv) > (maxv - minv)) && (minv > 5)) { // @@ -630,8 +632,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // if(a < b) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } // @@ -694,8 +696,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) } if(fs < 0) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; xs = 1 - xs; } @@ -758,9 +760,9 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) if(ps < 0) { - std::swap(a, b); - std::swap(p, q); - std::swap(xs, xs2); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); + BOOST_MATH_GPU_SAFE_SWAP(xs, xs2); invert = !invert; } // @@ -823,8 +825,8 @@ T ibeta_inv_imp(T a, T b, T p, T 
q, const Policy& pol, T* py) // if(b < a) { - std::swap(a, b); - std::swap(p, q); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); invert = !invert; } if (a < tools::min_value()) @@ -890,9 +892,9 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // if(x > 0.5) { - std::swap(a, b); - std::swap(p, q); - std::swap(x, y); + BOOST_MATH_GPU_SAFE_SWAP(a, b); + BOOST_MATH_GPU_SAFE_SWAP(p, q); + BOOST_MATH_GPU_SAFE_SWAP(x, y); invert = !invert; T l = 1 - upper; T u = 1 - lower; @@ -922,8 +924,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) if(x < lower) x = lower; } - std::uintmax_t max_iter = policies::get_max_root_iterations(); - std::uintmax_t max_iter_used = 0; + boost::math::uintmax_t max_iter = policies::get_max_root_iterations(); + boost::math::uintmax_t max_iter_used = 0; // // Figure out how many digits to iterate towards: // @@ -946,7 +948,13 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // Now iterate, we can use either p or q as the target here // depending on which is smaller: // + // Since we can't use halley_iterate on device we use newton raphson + // + #ifndef BOOST_MATH_HAS_GPU_SUPPORT x = boost::math::tools::halley_iterate( + #else + x = boost::math::tools::newton_raphson_iterate( + #endif boost::math::detail::ibeta_roots(a, b, (p < q ? p : q), (p < q ? false : true)), x, lower, upper, digits, max_iter); policies::check_root_iterations("boost::math::ibeta<%1%>(%1%, %1%, %1%)", max_iter + max_iter_used, pol); // @@ -968,7 +976,7 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) } // namespace detail template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p, T4* py, const Policy& pol) { constexpr auto function = "boost::math::ibeta_inv<%1%>(%1%,%1%,%1%)"; @@ -1003,14 +1011,14 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p, T4* py) { return ibeta_inv(a, b, p, py, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p) { typedef typename tools::promote_args::type result_type; @@ -1018,7 +1026,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibeta_inv(T1 a, T2 b, T3 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -1026,7 +1034,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(T1 a, T2 b, T3 q, T4* py, const Policy& pol) { constexpr auto function = "boost::math::ibetac_inv<%1%>(%1%,%1%,%1%)"; @@ -1061,14 +1069,14 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(T1 a, T2 b, T3 q, T4* py) { return ibetac_inv(a, b, q, py, policies::policy<>()); } template -inline typename tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(RT1 a, RT2 b, RT3 q) { typedef typename tools::promote_args::type result_type; @@ -1076,7 +1084,7 @@ inline typename tools::promote_args::type } template -inline typename 
tools::promote_args::type +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type ibetac_inv(RT1 a, RT2 b, RT3 q, const Policy& pol) { typedef typename tools::promote_args::type result_type; diff --git a/include/boost/math/special_functions/detail/t_distribution_inv.hpp b/include/boost/math/special_functions/detail/t_distribution_inv.hpp index 9209b6d405..22e1d11f02 100644 --- a/include/boost/math/special_functions/detail/t_distribution_inv.hpp +++ b/include/boost/math/special_functions/detail/t_distribution_inv.hpp @@ -11,6 +11,9 @@ #pragma once #endif +#include +#include +#include #include #include #include @@ -24,7 +27,7 @@ namespace boost{ namespace math{ namespace detail{ // Communications of the ACM, 13(10): 619-620, Oct., 1970. // template -T inverse_students_t_hill(T ndf, T u, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_students_t_hill(T ndf, T u, const Policy& pol) { BOOST_MATH_STD_USING BOOST_MATH_ASSERT(u <= 0.5); @@ -74,7 +77,7 @@ T inverse_students_t_hill(T ndf, T u, const Policy& pol) // Journal of Computational Finance, Vol 9 Issue 4, pp 37-73, Summer 2006 // template -T inverse_students_t_tail_series(T df, T v, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_students_t_tail_series(T df, T v, const Policy& pol) { BOOST_MATH_STD_USING // Tail series expansion, see section 6 of Shaw's paper. @@ -125,7 +128,7 @@ T inverse_students_t_tail_series(T df, T v, const Policy& pol) } template -T inverse_students_t_body_series(T df, T u, const Policy& pol) +BOOST_MATH_GPU_ENABLED T inverse_students_t_body_series(T df, T u, const Policy& pol) { BOOST_MATH_STD_USING // @@ -204,7 +207,7 @@ T inverse_students_t_body_series(T df, T u, const Policy& pol) } template -T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) +BOOST_MATH_GPU_ENABLED T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) { // // df = number of degrees of freedom. 
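[Note: the detail routines in this file back the public Student's t quantile. A short host-side usage sketch of the API they serve; illustrative only, not part of the patch, and the probe values (5 degrees of freedom, p = 0.975) are arbitrary:]

#include <boost/math/distributions/students_t.hpp>
#include <iostream>

int main()
{
   // quantile() routes through detail::fast_students_t_quantile and
   // detail::inverse_students_t for suitable df and precision:
   boost::math::students_t_distribution<double> dist(5); // 5 degrees of freedom
   std::cout << quantile(dist, 0.975) << '\n';           // approx. 2.5706
   return 0;
}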
@@ -220,7 +223,7 @@ T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) if(u > v) { // function is symmetric, invert it: - std::swap(u, v); + BOOST_MATH_GPU_SAFE_SWAP(u, v); invert = true; } if((floor(df) == df) && (df < 20)) @@ -416,7 +419,7 @@ T inverse_students_t(T df, T u, T v, const Policy& pol, bool* pexact = nullptr) } template -inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol) { T u = p / 2; T v = 1 - u; @@ -427,7 +430,7 @@ inline T find_ibeta_inv_from_t_dist(T a, T p, T /*q*/, T* py, const Policy& pol) } template -inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::false_type*) +BOOST_MATH_GPU_ENABLED inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const boost::math::false_type*) { BOOST_MATH_STD_USING // @@ -450,12 +453,12 @@ inline T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::f } template -T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::true_type*) +BOOST_MATH_GPU_ENABLED T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const boost::math::true_type*) { BOOST_MATH_STD_USING bool invert = false; if((df < 2) && (floor(df) != df)) - return boost::math::detail::fast_students_t_quantile_imp(df, p, pol, static_cast(nullptr)); + return boost::math::detail::fast_students_t_quantile_imp(df, p, pol, static_cast(nullptr)); if(p > 0.5) { p = 1 - p; @@ -521,7 +524,7 @@ T fast_students_t_quantile_imp(T df, T p, const Policy& pol, const std::true_typ } template -inline T fast_students_t_quantile(T df, T p, const Policy& pol) +BOOST_MATH_GPU_ENABLED inline T fast_students_t_quantile(T df, T p, const Policy& pol) { typedef typename policies::evaluation::type value_type; typedef typename policies::normalise< @@ -531,12 +534,12 @@ inline T fast_students_t_quantile(T df, T p, const Policy& pol) policies::discrete_quantile<>, policies::assert_undefined<> >::type forwarding_policy; - typedef std::integral_constant::digits <= 53) + typedef boost::math::integral_constant::digits <= 53) && - (std::numeric_limits::is_specialized) + (boost::math::numeric_limits::is_specialized) && - (std::numeric_limits::radix == 2) + (boost::math::numeric_limits::radix == 2) > tag_type; return policies::checked_narrowing_cast(fast_students_t_quantile_imp(static_cast(df), static_cast(p), pol, static_cast(nullptr)), "boost::math::students_t_quantile<%1%>(%1%,%1%,%1%)"); } diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 186befb612..41c85936dd 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -1735,10 +1735,50 @@ BOOST_MATH_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool in // // Ratios of two gamma functions: // +template +BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos_final(T z, T delta, const Policy& pol, const Lanczos&) +{ + BOOST_MATH_STD_USING + + T zgh = static_cast(z + T(Lanczos::g()) - constants::half()); + T result; + if(z + delta == z) + { + if (fabs(delta / zgh) < boost::math::tools::epsilon()) + { + // We have: + // result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); + // 0.5 - z == -z + // log1p(delta / zgh) = delta / zgh = delta / z + // multiplying we get -delta. + result = exp(-delta); + } + else + // from the pow formula below... 
but this may actually be wrong, we just can't really calculate it :( + result = 1; + } + else + { + if(fabs(delta) < 10) + { + result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); + } + else + { + result = pow(T(zgh / (zgh + delta)), T(z - constants::half())); + } + // Split the calculation up to avoid spurious overflow: + result *= Lanczos::lanczos_sum(z) / Lanczos::lanczos_sum(T(z + delta)); + } + result *= pow(T(constants::e() / (zgh + delta)), delta); + return result; +} + template BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l) { BOOST_MATH_STD_USING + if(z < tools::epsilon()) { // @@ -1752,7 +1792,7 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Poli // if(boost::math::max_factorial::value < delta) { - T ratio = tgamma_delta_ratio_imp_lanczos(delta, T(boost::math::max_factorial::value - delta), pol, l); + T ratio = tgamma_delta_ratio_imp_lanczos_final(T(delta), T(boost::math::max_factorial::value - delta), pol, l); ratio *= z; ratio *= boost::math::unchecked_factorial(boost::math::max_factorial::value - 1); return 1 / ratio; @@ -1773,39 +1813,10 @@ BOOST_MATH_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Poli #endif } } - T zgh = static_cast(z + T(Lanczos::g()) - constants::half()); - T result; - if(z + delta == z) - { - if (fabs(delta / zgh) < boost::math::tools::epsilon()) - { - // We have: - // result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); - // 0.5 - z == -z - // log1p(delta / zgh) = delta / zgh = delta / z - // multiplying we get -delta. - result = exp(-delta); - } - else - // from the pow formula below... but this may actually be wrong, we just can't really calculate it :( - result = 1; - } - else - { - if(fabs(delta) < 10) - { - result = exp((constants::half() - z) * boost::math::log1p(delta / zgh, pol)); - } - else - { - result = pow(T(zgh / (zgh + delta)), T(z - constants::half())); - } - // Split the calculation up to avoid spurious overflow: - result *= Lanczos::lanczos_sum(z) / Lanczos::lanczos_sum(T(z + delta)); - } - result *= pow(T(constants::e() / (zgh + delta)), delta); - return result; + + return tgamma_delta_ratio_imp_lanczos_final(T(z), T(delta), pol, l); } + // // And again without Lanczos support this time: // diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 16ae3b61eb..e3a2722c3d 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -26,7 +26,19 @@ #include -#ifndef BOOST_MATH_HAS_NVRTC +#ifdef BOOST_MATH_HAS_NVRTC + +namespace boost { +namespace math { + +template +BOOST_MATH_GPU_ENABLED inline typename tools::promote_args::type +beta(RT1 a, RT2 b, A arg); + +} // namespace math +} // namespace boost + +#else #include #include @@ -154,9 +166,9 @@ namespace boost // Binomial: template - T binomial_coefficient(unsigned n, unsigned k, const Policy& pol); + BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol); template - T binomial_coefficient(unsigned n, unsigned k); + BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k); // erf & erfc error functions. template // Error function. 
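[Note: BOOST_MATH_GPU_ENABLED drives every declaration change in this file. The expansion sketched below is an assumption for orientation only; the authoritative dispatch lives in boost/math/tools/config.hpp:]

// Simplified sketch, not the verbatim config logic:
#if defined(__CUDACC__) || defined(__HIPCC__)
#  define BOOST_MATH_GPU_ENABLED __host__ __device__ // callable from host and device code
#else
#  define BOOST_MATH_GPU_ENABLED                     // host-only (and SYCL) builds: a no-op
#endif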
@@ -874,19 +886,19 @@ namespace boost BOOST_MATH_GPU_ENABLED tools::promote_args_t cos_pi(T x); template - int fpclassify BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED int fpclassify BOOST_NO_MACRO_EXPAND(T t); template - bool isfinite BOOST_NO_MACRO_EXPAND(T z); + BOOST_MATH_GPU_ENABLED bool isfinite BOOST_NO_MACRO_EXPAND(T z); template - bool isinf BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED bool isinf BOOST_NO_MACRO_EXPAND(T t); template - bool isnan BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED bool isnan BOOST_NO_MACRO_EXPAND(T t); template - bool isnormal BOOST_NO_MACRO_EXPAND(T t); + BOOST_MATH_GPU_ENABLED bool isnormal BOOST_NO_MACRO_EXPAND(T t); template BOOST_MATH_GPU_ENABLED int signbit BOOST_NO_MACRO_EXPAND(T x); @@ -1218,62 +1230,62 @@ namespace boost BOOST_MATH_DETAIL_11_FUNC(Policy)\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ beta(RT1 a, RT2 b) { return ::boost::math::beta(a, b, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ beta(RT1 a, RT2 b, A x){ return ::boost::math::beta(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ betac(RT1 a, RT2 b, RT3 x) { return ::boost::math::betac(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibeta(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibetac(a, b, x, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_inv(T1 a, T2 b, T3 p, T4* py){ return ::boost::math::ibeta_inv(a, b, p, py, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_inv(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_inv(a, b, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_inv(T1 a, T2 b, T3 q, T4* py){ return ::boost::math::ibetac_inv(a, b, q, py, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_inva(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_inva(a, b, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_inva(T1 a, T2 b, T3 q){ return ::boost::math::ibetac_inva(a, b, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_invb(RT1 a, RT2 b, RT3 p){ return ::boost::math::ibeta_invb(a, b, p, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_invb(T1 a, T2 b, T3 q){ return ::boost::math::ibetac_invb(a, b, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibetac_inv(RT1 a, RT2 b, RT3 q){ return 
::boost::math::ibetac_inv(a, b, q, Policy()); }\ \ template \ - inline boost::math::tools::promote_args_t \ + BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t \ ibeta_derivative(RT1 a, RT2 b, RT3 x){ return ::boost::math::ibeta_derivative(a, b, x, Policy()); }\ \ - template T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient(n, k, Policy()); }\ + template BOOST_MATH_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k){ return ::boost::math::binomial_coefficient(n, k, Policy()); }\ \ template \ BOOST_MATH_GPU_ENABLED inline boost::math::tools::promote_args_t erf(RT z) { return ::boost::math::erf(z, Policy()); }\ diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index a2af127630..fac8f84db0 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -800,7 +800,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return #define BOOST_MATH_BIG_CONSTANT(T, N, V) static_cast(V) #define BOOST_MATH_FORCEINLINE __forceinline__ #define BOOST_MATH_STD_USING -#define BOOST_MATH_IF_CONSTEXPR if constexpr +#define BOOST_MATH_IF_CONSTEXPR if #define BOOST_MATH_IS_FLOAT(T) (boost::math::is_floating_point::value) #define BOOST_MATH_CONSTEXPR_TABLE_FUNCTION constexpr #define BOOST_MATH_NO_EXCEPTIONS diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 1fb55da197..c9a70e8a99 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -25,6 +25,13 @@ run test_bernoulli_pdf_float.cu ; run test_bernoulli_range_support_double.cu ; run test_bernoulli_range_support_float.cu ; +run test_beta_dist_cdf_double.cu ; +run test_beta_dist_cdf_float.cu ; +run test_beta_dist_pdf_double.cu ; +run test_beta_dist_pdf_float.cu ; +run test_beta_dist_quan_double.cu ; +run test_beta_dist_quan_float.cu ; + run test_cauchy_cdf_double.cu ; run test_cauchy_cdf_float.cu ; run test_cauchy_pdf_double.cu ; @@ -107,6 +114,24 @@ run test_weibull_quan_float.cu ; # Special Functions run test_beta_double.cu ; run test_beta_float.cu ; +run test_betac_double.cu ; +run test_betac_float.cu ; +run test_ibeta_double.cu ; +run test_ibeta_float.cu ; +run test_ibeta_derivative_double.cu ; +run test_ibeta_derivative_float.cu ; +run test_ibeta_inv_double.cu ; +run test_ibeta_inv_float.cu ; +run test_ibeta_inva_double.cu ; +run test_ibeta_inva_float.cu ; +run test_ibeta_invb_double.cu ; +run test_ibeta_invb_float.cu ; +run test_ibetac_inv_double.cu ; +run test_ibetac_inv_float.cu ; +run test_ibetac_inva_double.cu ; +run test_ibetac_inva_float.cu ; +run test_ibetac_invb_double.cu ; +run test_ibetac_invb_float.cu ; run test_bessel_i0_double.cu ; run test_bessel_i0_float.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 4a37960a51..1fc2746a1f 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -24,6 +24,13 @@ run test_bernoulli_pdf_nvrtc_float.cpp ; run test_bernoulli_quan_nvrtc_double.cpp ; run test_bernoulli_quan_nvrtc_float.cpp ; +run test_beta_dist_cdf_nvrtc_double.cpp ; +run test_beta_dist_cdf_nvrtc_float.cpp ; +run test_beta_dist_pdf_nvrtc_double.cpp ; +run test_beta_dist_pdf_nvrtc_float.cpp ; +run test_beta_dist_quan_nvrtc_double.cpp ; +run test_beta_dist_quan_nvrtc_float.cpp ; + run test_cauchy_cdf_nvrtc_double.cpp ; run test_cauchy_cdf_nvrtc_float.cpp ; run test_cauchy_pdf_nvrtc_double.cpp ; @@ -104,6 +111,26 @@ run test_weibull_quan_nvrtc_float.cpp ; # Special Functions run test_beta_nvrtc_double.cpp ; run test_beta_nvrtc_float.cpp ; +run test_betac_nvrtc_double.cpp ; 
+run test_betac_nvrtc_float.cpp ; +run test_ibeta_nvrtc_double.cpp ; +run test_ibeta_nvrtc_float.cpp ; +run test_ibetac_nvrtc_double.cpp ; +run test_ibetac_nvrtc_float.cpp ; +run test_ibeta_derivative_nvrtc_double.cpp ; +run test_ibeta_derivative_nvrtc_float.cpp ; +run test_ibeta_inv_nvrtc_double.cpp ; +run test_ibeta_inv_nvrtc_float.cpp ; +run test_ibeta_inva_nvrtc_double.cpp ; +run test_ibeta_inva_nvrtc_float.cpp ; +run test_ibeta_invb_nvrtc_double.cpp ; +run test_ibeta_invb_nvrtc_float.cpp ; +run test_ibetac_inv_nvrtc_double.cpp ; +run test_ibetac_inv_nvrtc_float.cpp ; +run test_ibetac_inva_nvrtc_double.cpp ; +run test_ibetac_inva_nvrtc_float.cpp ; +run test_ibetac_invb_nvrtc_double.cpp ; +run test_ibetac_invb_nvrtc_float.cpp ; run test_bessel_i0_nvrtc_double.cpp ; run test_bessel_i0_nvrtc_float.cpp ; diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 2fd5954ae1..5d3d85cd8f 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -12,6 +12,7 @@ project : requirements # Distributions run test_arcsine.cpp ; run test_bernoulli.cpp ; +run test_beta_dist.cpp ; run test_cauchy.cpp ; run test_chi_squared.cpp ; run test_exponential_dist.cpp ; @@ -28,6 +29,10 @@ run test_weibull.cpp ; run pow_test.cpp ; run test_beta_simple.cpp ; +run test_beta.cpp ; +run test_ibeta.cpp ; +run test_ibeta_inv.cpp ; +run test_ibeta_inv_ab.cpp ; run test_bessel_i.cpp ; run test_bessel_j.cpp ; diff --git a/test/test_bessel_i.cpp b/test/test_bessel_i.cpp index 09487ddf1b..817569760a 100644 --- a/test/test_bessel_i.cpp +++ b/test/test_bessel_i.cpp @@ -15,7 +15,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_i.hpp" diff --git a/test/test_bessel_j.cpp b/test/test_bessel_j.cpp index 31a64bc579..1dd63a68a5 100644 --- a/test/test_bessel_j.cpp +++ b/test/test_bessel_j.cpp @@ -15,7 +15,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_j.hpp" diff --git a/test/test_bessel_k.cpp b/test/test_bessel_k.cpp index 84ba0830f2..6c31f5ab05 100644 --- a/test/test_bessel_k.cpp +++ b/test/test_bessel_k.cpp @@ -22,7 +22,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_k.hpp" diff --git a/test/test_bessel_y.cpp b/test/test_bessel_y.cpp index 232a903963..8251920c5b 100644 --- a/test/test_bessel_y.cpp +++ b/test/test_bessel_y.cpp @@ -15,7 +15,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_bessel_y.hpp" diff --git a/test/test_beta.cpp b/test/test_beta.cpp index b24cb32c07..4e27b71353 100644 --- a/test/test_beta.cpp +++ b/test/test_beta.cpp @@ -5,7 +5,17 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#include "pch_light.hpp" +#ifndef SYCL_LANGUAGE_VERSION +#include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif #include "test_beta.hpp" diff --git a/test/test_beta.hpp b/test/test_beta.hpp index e633935a3c..3019c17e71 100644 --- a/test/test_beta.hpp +++ b/test/test_beta.hpp @@ -18,9 +18,10 @@ #define BOOST_TEST_MAIN #include #include +#include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include @@ -109,9 +110,12 @@ void test_spots(T) // Inexact input, so disable for ultra precise long doubles: BOOST_CHECK_CLOSE(::boost::math::beta(static_cast(0.0125L), static_cast(0.000023L)), static_cast(43558.24045647538375006349016083320744662L), tolerance * 2); } + + #ifndef BOOST_MATH_NO_EXCEPTIONS BOOST_CHECK_THROW(boost::math::beta(static_cast(0), static_cast(1)), std::domain_error); BOOST_CHECK_THROW(boost::math::beta(static_cast(-1), static_cast(1)), std::domain_error); BOOST_CHECK_THROW(boost::math::beta(static_cast(1), static_cast(-1)), std::domain_error); BOOST_CHECK_THROW(boost::math::beta(static_cast(1), static_cast(0)), std::domain_error); + #endif } diff --git a/test/test_beta_dist.cpp b/test/test_beta_dist.cpp index 943718a39f..1652309eb7 100644 --- a/test/test_beta_dist.cpp +++ b/test/test_beta_dist.cpp @@ -32,9 +32,14 @@ # pragma warning (disable : 4224) // nonstandard extension used : formal parameter 'arg' was previously defined as a type. #endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept using ::boost::math::concepts::real_concept; -#include +#endif + +#include "../include_private/boost/math/tools/test.hpp" #include // for beta_distribution using boost::math::beta_distribution; @@ -634,12 +639,13 @@ BOOST_AUTO_TEST_CASE( test_main ) BOOST_CHECK_CLOSE_FRACTION(mybeta22.find_alpha(mybeta22.beta(), 0.8, cdf(mybeta22, 0.8)), mybeta22.alpha(), tol); BOOST_CHECK_CLOSE_FRACTION(mybeta22.find_beta(mybeta22.alpha(), 0.8, cdf(mybeta22, 0.8)), mybeta22.beta(), tol); - + #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS beta_distribution rcbeta22(2, 2); // Using RealType real_concept. cout << "numeric_limits::is_specialized " << numeric_limits::is_specialized << endl; cout << "numeric_limits::digits " << numeric_limits::digits << endl; cout << "numeric_limits::digits10 " << numeric_limits::digits10 << endl; cout << "numeric_limits::epsilon " << numeric_limits::epsilon() << endl; + #endif // (Parameter value, arbitrarily zero, only communicates the floating point type). test_spots(0.0F); // Test float. diff --git a/test/test_beta_dist_cdf_double.cu b/test/test_beta_dist_cdf_double.cu new file mode 100644 index 0000000000..fa460244a3 --- /dev/null +++ b/test/test_beta_dist_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::beta_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA test kernel
+    int threadsPerBlock = 512;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::beta_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_beta_dist_cdf_float.cu b/test/test_beta_dist_cdf_float.cu
new file mode 100644
index 0000000000..321c844205
--- /dev/null
+++ b/test/test_beta_dist_cdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::beta_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::beta_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_beta_dist_cdf_nvrtc_double.cpp b/test/test_beta_dist_cdf_nvrtc_double.cpp new file mode 100644 index 0000000000..4f5913c108 --- /dev/null +++ b/test/test_beta_dist_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); 
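+        // The second input buffer is filled and copied to the device only to keep
+        // a uniform kernel signature across these NVRTC tests: the beta CDF kernel
+        // above ignores its second (unnamed) pointer parameter.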
+ h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_cdf_nvrtc_float.cpp b/test/test_beta_dist_cdf_nvrtc_float.cpp new file mode 100644 index 0000000000..f5b031c5a9 --- /dev/null +++ b/test/test_beta_dist_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_pdf_double.cu b/test/test_beta_dist_pdf_double.cu new file mode 100644 index 0000000000..c0ee9272ae --- /dev/null +++ b/test/test_beta_dist_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::beta_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::beta_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_beta_dist_pdf_float.cu b/test/test_beta_dist_pdf_float.cu new file mode 100644 index 0000000000..75e4fa27b4 --- /dev/null +++ b/test/test_beta_dist_pdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::beta_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::beta_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_beta_dist_pdf_nvrtc_double.cpp b/test/test_beta_dist_pdf_nvrtc_double.cpp new file mode 100644 index 0000000000..c9870e2ce4 --- /dev/null +++ b/test/test_beta_dist_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); 
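+        // The fixed seed (42) above makes the host/device comparison below
+        // reproducible from run to run against the 300 eps tolerance.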
+ h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_pdf_nvrtc_float.cpp b/test/test_beta_dist_pdf_nvrtc_float.cpp new file mode 100644 index 0000000000..0b4fd83488 --- /dev/null +++ b/test/test_beta_dist_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_quan_double.cu b/test/test_beta_dist_quan_double.cu new file mode 100644 index 0000000000..101526afae --- /dev/null +++ b/test/test_beta_dist_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::beta_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::beta_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_beta_dist_quan_float.cu b/test/test_beta_dist_quan_float.cu new file mode 100644 index 0000000000..77696c6393 --- /dev/null +++ b/test/test_beta_dist_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::beta_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::beta_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_beta_dist_quan_nvrtc_double.cpp b/test/test_beta_dist_quan_nvrtc_double.cpp new file mode 100644 index 0000000000..9726bf019e --- /dev/null +++ b/test/test_beta_dist_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_beta_dist_quan_nvrtc_float.cpp b/test/test_beta_dist_quan_nvrtc_float.cpp new file mode 100644 index 0000000000..d2476cb2ac --- /dev/null +++ b/test/test_beta_dist_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_beta_dist_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::beta_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_dist_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_dist_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_dist_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::beta_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_betac_double.cu b/test/test_betac_double.cu new file mode 100644 index 0000000000..8bb31d3219 --- /dev/null +++ b/test/test_betac_double.cu @@ -0,0 +1,146 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::betac(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "beta_med_data.ipp" +#include "beta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < beta_med_data.size(); ++i) + { + v1.push_back(beta_med_data[i][0]); + v2.push_back(beta_med_data[i][1]); + v3.push_back(beta_med_data[i][2]); + } + for(unsigned i = 0; i < beta_small_data.size(); ++i) + { + v1.push_back(beta_small_data[i][0]); + v2.push_back(beta_small_data[i][1]); + v3.push_back(beta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::betac(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_betac_float.cu b/test/test_betac_float.cu new file mode 100644 index 0000000000..7070c567cc --- /dev/null +++ b/test/test_betac_float.cu @@ -0,0 +1,146 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::betac(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "beta_med_data.ipp" +#include "beta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < beta_med_data.size(); ++i) + { + v1.push_back(beta_med_data[i][0]); + v2.push_back(beta_med_data[i][1]); + v3.push_back(beta_med_data[i][2]); + } + for(unsigned i = 0; i < beta_small_data.size(); ++i) + { + v1.push_back(beta_small_data[i][0]); + v2.push_back(beta_small_data[i][1]); + v3.push_back(beta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel 
(error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::betac(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_betac_nvrtc_double.cpp b/test/test_betac_nvrtc_double.cpp new file mode 100644 index 0000000000..0667cfe0d4 --- /dev/null +++ b/test/test_betac_nvrtc_double.cpp @@ -0,0 +1,196 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_beta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::betac(in1[i], in2[i], in3[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_beta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + h_in3[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::betac(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + 
delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_betac_nvrtc_float.cpp b/test/test_betac_nvrtc_float.cpp
new file mode 100644
index 0000000000..0667cfe0d4
--- /dev/null
+++ b/test/test_betac_nvrtc_float.cpp
@@ -0,0 +1,196 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include
+#include
+extern "C" __global__
+void test_beta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::betac(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_beta_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_beta_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_beta_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+            h_in3[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::betac(h_in1[i], h_in2[i], h_in3[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully."
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_erf.cpp b/test/test_erf.cpp index 5044847114..2232c1c759 100644 --- a/test/test_erf.cpp +++ b/test/test_erf.cpp @@ -13,7 +13,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error diff --git a/test/test_holtsmark.cpp b/test/test_holtsmark.cpp index 475f5400aa..93a40924d6 100644 --- a/test/test_holtsmark.cpp +++ b/test/test_holtsmark.cpp @@ -21,7 +21,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif using boost::math::holtsmark_distribution; diff --git a/test/test_ibeta.cpp b/test/test_ibeta.cpp index e026ac6c52..987b361105 100644 --- a/test/test_ibeta.cpp +++ b/test/test_ibeta.cpp @@ -3,7 +3,18 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif + #include "test_ibeta.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_ibeta.hpp b/test/test_ibeta.hpp index 7c951d614f..cfd5d78cd1 100644 --- a/test/test_ibeta.hpp +++ b/test/test_ibeta.hpp @@ -8,9 +8,10 @@ #define BOOST_TEST_MAIN #include #include +#include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include diff --git a/test/test_ibeta_derivative.cpp b/test/test_ibeta_derivative.cpp index c899c94bf5..5d6a312754 100644 --- a/test/test_ibeta_derivative.cpp +++ b/test/test_ibeta_derivative.cpp @@ -4,7 +4,7 @@ // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #if defined(__GNUC__) && __GNUC__ <= 12 #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wliteral-range" +#pragma GCC diagnostic ignored "-Woverflow" #endif #include #include "test_ibeta_derivative.hpp" diff --git a/test/test_ibeta_derivative_double.cu b/test/test_ibeta_derivative_double.cu new file mode 100644 index 0000000000..e5f7f340ba --- /dev/null +++ b/test/test_ibeta_derivative_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors by cycling through the table data
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_derivative_float.cu b/test/test_ibeta_derivative_float.cu new file mode 100644 index 0000000000..36a79665d4 --- /dev/null +++ b/test/test_ibeta_derivative_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << 
w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_derivative_nvrtc_double.cpp b/test/test_ibeta_derivative_nvrtc_double.cpp new file mode 100644 index 0000000000..f15d21db00 --- /dev/null +++ b/test/test_ibeta_derivative_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_derivative_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + 
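// NVRTC flow: compile the kernel source string to PTX at run time, load the PTX as a module, then launch it through the driver API.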
nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_derivative_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibeta_derivative_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_derivative_kernel"), "Failed to get kernel function");
+
+        int numElements = static_cast<int>(ibeta_data.size() + ibeta_small_data.size());
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays from the tabulated test data
+        for (unsigned i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (unsigned i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i =
0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_derivative(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_derivative_nvrtc_float.cpp b/test/test_ibeta_derivative_nvrtc_float.cpp new file mode 100644 index 0000000000..17443e0bdc --- /dev/null +++ b/test/test_ibeta_derivative_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_derivative_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_derivative_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_derivative_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_derivative_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_derivative(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << 
std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_double.cu b/test/test_ibeta_double.cu new file mode 100644 index 0000000000..20384bf25f --- /dev/null +++ b/test/test_ibeta_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + 
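// Kernel launches are asynchronous: any launch failure is retrieved here via cudaGetLastError() once the device has synchronized.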
err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_float.cu b/test/test_ibeta_float.cu new file mode 100644 index 0000000000..be17813ee4 --- /dev/null +++ b/test/test_ibeta_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr 
output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inv.cpp b/test/test_ibeta_inv.cpp index 218c1625e8..ab1f4267fc 100644 --- a/test/test_ibeta_inv.cpp +++ b/test/test_ibeta_inv.cpp @@ -3,7 +3,18 @@ // Boost Software License, Version 1.0. 
(See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif + #include"test_ibeta_inv.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_ibeta_inv.hpp b/test/test_ibeta_inv.hpp index ba98901773..fa765b2ef8 100644 --- a/test/test_ibeta_inv.hpp +++ b/test/test_ibeta_inv.hpp @@ -8,10 +8,11 @@ #define BOOST_TEST_MAIN #include #include +#include #include // for has_denorm_now #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include @@ -306,6 +307,7 @@ void test_spots(T) BOOST_MATH_CHECK_THROW(::boost::math::ibeta_inv(static_cast(2.125), -n, static_cast(0.125)), std::domain_error); BOOST_MATH_CHECK_THROW(::boost::math::ibeta_inv(static_cast(2.125), static_cast(1.125), -n), std::domain_error); } + #ifndef SYCL_LANGUAGE_VERSION if (boost::math::detail::has_denorm_now()) { T m = std::numeric_limits::denorm_min(); @@ -317,5 +319,6 @@ void test_spots(T) BOOST_CHECK((boost::math::isfinite)(boost::math::ibeta_inv(static_cast(12.125), m, static_cast(0.125)))); BOOST_CHECK((boost::math::isfinite)(boost::math::ibeta_inv(m, m, static_cast(0.125)))); } + #endif } diff --git a/test/test_ibeta_inv_ab.cpp b/test/test_ibeta_inv_ab.cpp index c1acb2d1ca..fdf735ef1e 100644 --- a/test/test_ibeta_inv_ab.cpp +++ b/test/test_ibeta_inv_ab.cpp @@ -3,7 +3,18 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef SYCL_LANGUAGE_VERSION #include +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wliteral-range" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Woverflow" +#endif + #include "test_ibeta_inv_ab.hpp" #if !defined(TEST_FLOAT) && !defined(TEST_DOUBLE) && !defined(TEST_LDOUBLE) && !defined(TEST_REAL_CONCEPT) diff --git a/test/test_ibeta_inv_ab.hpp b/test/test_ibeta_inv_ab.hpp index c378d15287..b91ab5261d 100644 --- a/test/test_ibeta_inv_ab.hpp +++ b/test/test_ibeta_inv_ab.hpp @@ -10,9 +10,10 @@ #define BOOST_TEST_MAIN #include #include +#include #include #include -#include +#include "../include_private/boost/math/tools/test.hpp" #include #include #include diff --git a/test/test_ibeta_inv_double.cu b/test/test_ibeta_inv_double.cu new file mode 100644 index 0000000000..ef62c5e162 --- /dev/null +++ b/test/test_ibeta_inv_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors by cycling through the table data
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibeta_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!"
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inv_float.cu b/test/test_ibeta_inv_float.cu new file mode 100644 index 0000000000..a0d48bfbda --- /dev/null +++ b/test/test_ibeta_inv_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + 
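// Fetch any deferred launch error before validating the results.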
err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_inv(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inv_nvrtc_double.cpp b/test/test_ibeta_inv_nvrtc_double.cpp new file mode 100644 index 0000000000..2f01012bbe --- /dev/null +++ b/test/test_ibeta_inv_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, 
cuda_kernel, "test_ibeta_inv_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inv_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inv_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_inv(h_in1[i], h_in2[i], 
h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_inv_nvrtc_float.cpp b/test/test_ibeta_inv_nvrtc_float.cpp new file mode 100644 index 0000000000..5d804398cb --- /dev/null +++ b/test/test_ibeta_inv_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inv(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inv_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inv_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", 
"--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inv_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta_inv(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + 
+ nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_inva_double.cu b/test/test_ibeta_inva_double.cu new file mode 100644 index 0000000000..7783eb21bb --- /dev/null +++ b/test/test_ibeta_inva_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector<float_type> v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vectors + cuda_managed_ptr<float_type> input_vector1(numElements); + cuda_managed_ptr<float_type> input_vector2(numElements); + cuda_managed_ptr<float_type> input_vector3(numElements); + + // Allocate the managed output vector + cuda_managed_ptr<float_type> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the CUDA test kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector<float_type> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_inva(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inva_float.cu b/test/test_ibeta_inva_float.cu new file mode 100644 index 0000000000..ff918f9436 --- /dev/null +++ b/test/test_ibeta_inva_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector<float_type> v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vectors + cuda_managed_ptr<float_type> input_vector1(numElements); + cuda_managed_ptr<float_type> input_vector2(numElements); + cuda_managed_ptr<float_type> input_vector3(numElements); + + // Allocate the managed output vector + cuda_managed_ptr<float_type> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % 
v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the CUDA test kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector<float_type> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_inva(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_inva_nvrtc_double.cpp b/test/test_ibeta_inva_nvrtc_double.cpp new file mode 100644 index 0000000000..a392eaea65 --- /dev/null +++ b/test/test_ibeta_inva_nvrtc_double.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inva_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inva_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inva_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, 
*d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibeta_inva(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_inva_nvrtc_float.cpp b/test/test_ibeta_inva_nvrtc_float.cpp new file mode 100644 index 0000000000..ba5745c321 --- /dev/null +++ b/test/test_ibeta_inva_nvrtc_float.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_inva(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_inva_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_inva_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_inva_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, 
*d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibeta_inva(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_invb_double.cu b/test/test_ibeta_invb_double.cu new file mode 100644 index 0000000000..562f5349dd --- /dev/null +++ b/test/test_ibeta_invb_double.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]); + } +} + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector<float_type> v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vectors + cuda_managed_ptr<float_type> input_vector1(numElements); + cuda_managed_ptr<float_type> input_vector2(numElements); + cuda_managed_ptr<float_type> input_vector3(numElements); + + // Allocate the managed output vector + cuda_managed_ptr<float_type> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the CUDA test kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector<float_type> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_invb(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_invb_float.cu b/test/test_ibeta_invb_float.cu new file mode 100644 index 0000000000..86f5615c36 --- /dev/null +++ b/test/test_ibeta_invb_float.cu @@ -0,0 +1,149 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// floating-point value does not fit in required floating-point type +#pragma nv_diag_suppress 221 + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]); + } +} + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector<float_type> v1, v2, v3; + + for(unsigned i = 0; i < ibeta_data.size(); ++i) + { + v1.push_back(ibeta_data[i][0]); + v2.push_back(ibeta_data[i][1]); + v3.push_back(ibeta_data[i][2]); + } + for(unsigned i = 0; i < ibeta_small_data.size(); ++i) + { + v1.push_back(ibeta_small_data[i][0]); + v2.push_back(ibeta_small_data[i][1]); + v3.push_back(ibeta_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vectors + cuda_managed_ptr<float_type> input_vector1(numElements); + cuda_managed_ptr<float_type> input_vector2(numElements); + cuda_managed_ptr<float_type> input_vector3(numElements); + + // Allocate the managed output vector + cuda_managed_ptr<float_type> output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the CUDA test kernel + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl; 
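+ // The elapsed time above covers the kernel launch plus the cudaDeviceSynchronize() wait, + // so it is directly comparable with the serial CPU timing taken below.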
+ + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector<float_type> results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_invb(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + bool failed = false; + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::isfinite(output_vector[i])) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + failed = true; + } + } + } + + if (failed) + { + return EXIT_FAILURE; + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + + diff --git a/test/test_ibeta_invb_nvrtc_double.cpp b/test/test_ibeta_invb_nvrtc_double.cpp new file mode 100644 index 0000000000..6f046f09f3 --- /dev/null +++ b/test/test_ibeta_invb_nvrtc_double.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, 
device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_invb_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_invb_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // 
Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibeta_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_invb_nvrtc_float.cpp b/test/test_ibeta_invb_nvrtc_float.cpp new file mode 100644 index 0000000000..f2d17b8447 --- /dev/null +++ b/test/test_ibeta_invb_nvrtc_float.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibeta_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta_invb(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_invb_kernel.cu", 0, 
nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_invb_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can 
continue to process through our results array + double res; + try + { + res = boost::math::ibeta_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) + { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibeta_nvrtc_double.cpp b/test/test_ibeta_nvrtc_double.cpp new file mode 100644 index 0000000000..bc920b6368 --- /dev/null +++ b/test/test_ibeta_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibeta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]); + } +} +)"; + +template <class T> struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast<T>(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibeta_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibeta(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + 
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibeta_nvrtc_float.cpp b/test/test_ibeta_nvrtc_float.cpp
new file mode 100644
index 0000000000..ee15748628
--- /dev/null
+++ b/test/test_ibeta_nvrtc_float.cpp
@@ -0,0 +1,207 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include
+#include
+extern "C" __global__
+void test_ibeta_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibeta(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibeta_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibeta_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
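+            // Surface the full NVRTC log: header and include-path problems
+            // are only diagnosable from here.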
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibeta_kernel"), "Failed to get kernel function");
+
+        int numElements = ibeta_data.size() + ibeta_small_data.size();
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        for (int i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (int i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::ibeta(h_in1[i], h_in2[i], h_in3[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibetac_inv_double.cu b/test/test_ibetac_inv_double.cu
new file mode 100644
index 0000000000..a983d16677
--- /dev/null
+++ b/test/test_ibetac_inv_double.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
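+
+    // With ignore_error in force, failed evaluations come back as NaN or an
+    // infinity rather than throwing, so only finite GPU results are compared.
+    // The 300 eps bound is deliberately loose: device and host code paths
+    // need not agree to the last bit.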
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
diff --git a/test/test_ibetac_inv_float.cu b/test/test_ibetac_inv_float.cu
new file mode 100644
index 0000000000..94583b45e2
--- /dev/null
+++ b/test/test_ibetac_inv_float.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
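+        // Cycle through the reference table so all 50000 elements are
+        // populated and every tabulated row is exercised many times.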
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_inv(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
diff --git a/test/test_ibetac_inv_nvrtc_double.cpp b/test/test_ibetac_inv_nvrtc_double.cpp
new file mode 100644
index 0000000000..a99d53b3cd
--- /dev/null
+++ b/test/test_ibetac_inv_nvrtc_double.cpp
@@ -0,0 +1,207 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include
+#include
+extern "C" __global__
+void test_ibetac_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inv_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibetac_inv_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inv_kernel"), "Failed to get kernel function");
+
+        int numElements = ibeta_data.size() + ibeta_small_data.size();
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        for (int i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (int i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::ibetac_inv(h_in1[i], h_in2[i], h_in3[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibetac_inv_nvrtc_float.cpp b/test/test_ibetac_inv_nvrtc_float.cpp
new file mode 100644
index 0000000000..47e89db4c1
--- /dev/null
+++ b/test/test_ibetac_inv_nvrtc_float.cpp
@@ -0,0 +1,207 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
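+// Keep the host reference computation consistent with the GPU: errors are
+// ignored rather than thrown, and there is no promotion to long double,
+// which CUDA lacks.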
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include
+#include
+extern "C" __global__
+void test_ibetac_inv_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inv(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inv_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibetac_inv_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inv_kernel"), "Failed to get kernel function");
+
+        int numElements = ibeta_data.size() + ibeta_small_data.size();
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        for (int i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (int i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            const auto res = boost::math::ibetac_inv(h_in1[i], h_in2[i], h_in3[i]);
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibetac_inva_double.cu b/test/test_ibetac_inva_double.cu
new file mode 100644
index 0000000000..2efbee265d
--- /dev/null
+++ b/test/test_ibetac_inva_double.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
diff --git a/test/test_ibetac_inva_float.cu b/test/test_ibetac_inva_float.cu
new file mode 100644
index 0000000000..9bd1a29a07
--- /dev/null
+++ b/test/test_ibetac_inva_float.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_inva(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
diff --git a/test/test_ibetac_inva_nvrtc_double.cpp b/test/test_ibetac_inva_nvrtc_double.cpp
new file mode 100644
index 0000000000..7c7bf992b3
--- /dev/null
+++ b/test/test_ibetac_inva_nvrtc_double.cpp
@@ -0,0 +1,220 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+typedef double float_type;
+#include
+#include
+extern "C" __global__
+void test_ibetac_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
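+        // A current driver-API context is required before the PTX module
+        // can be loaded further down.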
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inva_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibetac_inva_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inva_kernel"), "Failed to get kernel function");
+
+        int numElements = ibeta_data.size() + ibeta_small_data.size();
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        for (int i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (int i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            // The ignore_error policy is not always honoured here, so the call below can still throw.
+            // Rather than terminating we continue to process through our results array.
+            double res;
+            try
+            {
+                res = boost::math::ibetac_inva(h_in1[i], h_in2[i], h_in3[i]);
+            }
+            catch (...)
+            {
+                continue;
+            }
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibetac_inva_nvrtc_float.cpp b/test/test_ibetac_inva_nvrtc_float.cpp
new file mode 100644
index 0000000000..c79b8b02f1
--- /dev/null
+++ b/test/test_ibetac_inva_nvrtc_float.cpp
@@ -0,0 +1,220 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+typedef float float_type;
+#include
+#include
+extern "C" __global__
+void test_ibetac_inva_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_inva(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
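+        // NVRTC compiles the embedded source at run time: the kernel string
+        // restates the policy #defines and the float_type typedef because
+        // nothing from this translation unit is visible to it.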
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_inva_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibetac_inva_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_inva_kernel"), "Failed to get kernel function");
+
+        int numElements = ibeta_data.size() + ibeta_small_data.size();
+        float_type *h_in1, *h_in2, *h_in3, *h_out;
+        float_type *d_in1, *d_in2, *d_in3, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_in3 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        for (int i = 0; i < ibeta_data.size(); ++i)
+        {
+            h_in1[i] = ibeta_data[i][0];
+            h_in2[i] = ibeta_data[i][1];
+            h_in3[i] = ibeta_data[i][2];
+        }
+        for (int i = 0; i < ibeta_small_data.size(); ++i)
+        {
+            h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0];
+            h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1];
+            h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2];
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in3");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+        checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            // The ignore_error policy is not always honoured here, so the call below can still throw.
+            // Rather than terminating we continue to process through our results array.
+            double res;
+            try
+            {
+                res = boost::math::ibetac_inva(h_in1[i], h_in2[i], h_in3[i]);
+            }
+            catch (...)
+            {
+                continue;
+            }
+
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_in3);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_in3;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_ibetac_invb_double.cu b/test/test_ibetac_invb_double.cu
new file mode 100644
index 0000000000..fddd655af2
--- /dev/null
+++ b/test/test_ibetac_invb_double.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
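+    // Grid size uses ceiling division: (50000 + 255) / 256 = 196 blocks,
+    // with the kernel's bounds check covering the final partial block.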
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
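For reference, the launch above sizes the grid to cover numElements exactly once via ceiling division. A grid-stride loop is the usual alternative when the grid size should be decoupled from the problem size; the sketch below is illustrative only and not part of this patch (cuda_test_strided is a hypothetical name, reusing the float_type and ibetac_invb call from the file above):

// Grid-stride sketch: each thread walks the array with a stride equal to the
// total number of threads in the grid, so any grid size covers any numElements.
__global__ void cuda_test_strided(const float_type *in1, const float_type *in2,
                                  const float_type *in3, float_type *out, int numElements)
{
    for (int i = blockDim.x * blockIdx.x + threadIdx.x;
         i < numElements;
         i += blockDim.x * gridDim.x)
    {
        out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
    }
}

With this form a fixed launch such as cuda_test_strided<<<128, 256>>>(...) would still process all 50000 elements.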
diff --git a/test/test_ibetac_invb_float.cu b/test/test_ibetac_invb_float.cu
new file mode 100644
index 0000000000..fddd655af2
--- /dev/null
+++ b/test/test_ibetac_invb_float.cu
@@ -0,0 +1,149 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// floating-point value does not fit in required floating-point type
+#pragma nv_diag_suppress 221
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <array>
+#include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
+    }
+}
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try {
+    // Consolidate the test data:
+    std::vector<float_type> v1, v2, v3;
+
+    for(unsigned i = 0; i < ibeta_data.size(); ++i)
+    {
+        v1.push_back(ibeta_data[i][0]);
+        v2.push_back(ibeta_data[i][1]);
+        v3.push_back(ibeta_data[i][2]);
+    }
+    for(unsigned i = 0; i < ibeta_small_data.size(); ++i)
+    {
+        v1.push_back(ibeta_small_data[i][0]);
+        v2.push_back(ibeta_small_data[i][1]);
+        v3.push_back(ibeta_small_data[i][2]);
+    }
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Set the number of elements to test
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+    cuda_managed_ptr<float_type> input_vector3(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors by cycling through the test tables
+    for (int i = 0; i < numElements; ++i)
+    {
+        int table_id = i % v1.size();
+        input_vector1[i] = v1[table_id];
+        input_vector2[i] = v2[table_id];
+        input_vector3[i] = v3[table_id];
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::ibetac_invb(input_vector1[i], input_vector2[i], input_vector3[i]));
+    double t = w.elapsed();
+    bool failed = false;
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::isfinite(output_vector[i]))
+        {
+            if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300)
+            {
+                std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+                std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+                failed = true;
+            }
+        }
+    }
+
+    if (failed)
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
+
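The NVRTC drivers that follow manage eight raw buffers by hand, so every early exit has to remember the matching cudaFree and delete[] calls. A RAII handle is one way to make that robust; the sketch below is illustrative only and not part of this patch (cuda_buffer is a hypothetical name, and it assumes aborting on allocation failure, as the drivers' checkCUDAError already does):

#include <cstddef>

// Sketch of an owning device-buffer handle: cudaFree runs on every exit path,
// including the exception paths that currently leak d_in1..d_out.
struct cuda_buffer
{
    float_type* ptr = nullptr;
    explicit cuda_buffer(std::size_t n)
    {
        checkCUDAError(cudaMalloc(&ptr, n * sizeof(float_type)), "Failed to allocate device memory");
    }
    ~cuda_buffer() { cudaFree(ptr); }
    cuda_buffer(const cuda_buffer&) = delete;
    cuda_buffer& operator=(const cuda_buffer&) = delete;
};

The h_in1..h_out host arrays could likewise become std::vector<float_type>, removing the delete[] calls at the end of main.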
diff --git a/test/test_ibetac_invb_nvrtc_double.cpp b/test/test_ibetac_invb_nvrtc_double.cpp
new file mode 100644
index 0000000000..76f6318901
--- /dev/null
+++ b/test/test_ibetac_invb_nvrtc_double.cpp
@@ -0,0 +1,220 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <array>
+#include <cmath>
+#include <cstdlib>
+#include <exception>
+#include <iomanip>
+#include <iostream>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+typedef double float_type;
+#include <boost/math/special_functions/beta.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_ibetac_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]);
+    }
+}
+)";
+
+template <class T> struct table_type { typedef T type; };
+typedef float_type T;
+#define SC_(x) static_cast<T>(x)
+
+#include "ibeta_data.ipp"
+#include "ibeta_small_data.ipp"
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_invb_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_ibetac_invb_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) /
sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibetac_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) 
+ { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_invb_nvrtc_float.cpp b/test/test_ibetac_invb_nvrtc_float.cpp new file mode 100644 index 0000000000..48d0a31eec --- /dev/null +++ b/test/test_ibetac_invb_nvrtc_float.cpp @@ -0,0 +1,220 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibetac_invb_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac_invb(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_invb_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_invb_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", 
"--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_invb_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + // Sometimes the ignore error policy is ignored so the below throws + // Rather than terminating we can continue to process through our results array + double res; + try + { + res = boost::math::ibetac_invb(h_in1[i], h_in2[i], h_in3[i]); + } + catch (...) 
+ { + continue; + } + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_nvrtc_double.cpp b/test/test_ibetac_nvrtc_double.cpp new file mode 100644 index 0000000000..6a59473e18 --- /dev/null +++ b/test/test_ibetac_nvrtc_double.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_ibetac_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", 
"--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibetac(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + 
nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_ibetac_nvrtc_float.cpp b/test/test_ibetac_nvrtc_float.cpp new file mode 100644 index 0000000000..a989191e51 --- /dev/null +++ b/test/test_ibetac_nvrtc_float.cpp @@ -0,0 +1,207 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_ibetac_kernel(const float_type *in1, const float_type *in2, const float_type *in3, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::ibetac(in1[i], in2[i], in3[i]); + } +} +)"; + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "ibeta_data.ipp" +#include "ibeta_small_data.ipp" + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_ibetac_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_ibetac_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + 
size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_ibetac_kernel"), "Failed to get kernel function"); + + int numElements = ibeta_data.size() + ibeta_small_data.size(); + float_type *h_in1, *h_in2, *h_in3, *h_out; + float_type *d_in1, *d_in2, *d_in3, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_in3 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + for (int i = 0; i < ibeta_data.size(); ++i) + { + h_in1[i] = ibeta_data[i][0]; + h_in2[i] = ibeta_data[i][1]; + h_in3[i] = ibeta_data[i][2]; + } + for (int i = 0; i < ibeta_small_data.size(); ++i) + { + h_in1[i + ibeta_data.size()] = ibeta_small_data[i][0]; + h_in2[i + ibeta_data.size()] = ibeta_small_data[i][1]; + h_in3[i + ibeta_data.size()] = ibeta_small_data[i][2]; + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_in3, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + checkCUDAError(cudaMemcpy(d_in3, h_in3, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in3"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_in3, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::ibetac(h_in1[i], h_in2[i], h_in3[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_in3); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_in3; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." 
<< std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_igamma.cpp b/test/test_igamma.cpp index 0ad7019963..6e034f3c60 100644 --- a/test/test_igamma.cpp +++ b/test/test_igamma.cpp @@ -12,7 +12,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_igamma.hpp" diff --git a/test/test_igamma_inv.cpp b/test/test_igamma_inv.cpp index 17e0bfb54f..80a553427c 100644 --- a/test/test_igamma_inv.cpp +++ b/test/test_igamma_inv.cpp @@ -12,7 +12,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error #include "test_igamma_inv.hpp" diff --git a/test/test_igamma_inva.cpp b/test/test_igamma_inva.cpp index 8d0e965962..443ad7bbc6 100644 --- a/test/test_igamma_inva.cpp +++ b/test/test_igamma_inva.cpp @@ -12,7 +12,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #include "test_igamma_inva.hpp" diff --git a/test/test_landau.cpp b/test/test_landau.cpp index 1625b21777..c69c208177 100644 --- a/test/test_landau.cpp +++ b/test/test_landau.cpp @@ -20,7 +20,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif using boost::math::landau_distribution; diff --git a/test/test_mapairy.cpp b/test/test_mapairy.cpp index ee8e43bf00..ca3b415d76 100644 --- a/test/test_mapairy.cpp +++ b/test/test_mapairy.cpp @@ -9,7 +9,7 @@ # pragma clang diagnostic ignored "-Wliteral-range" #elif defined(__GNUC__) # pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wliteral-range" +# pragma GCC diagnostic ignored "-Woverflow" #endif #define BOOST_TEST_MAIN